diff --git a/src/include/mol_descriptors.hpp b/src/include/mol_descriptors.hpp
index 520fe09..64fd4e5 100644
--- a/src/include/mol_descriptors.hpp
+++ b/src/include/mol_descriptors.hpp
@@ -1,5 +1,8 @@
 #pragma once
 #include "common.hpp"
+#include "duckdb/main/connection.hpp"
+
 namespace duckdb_rdkit {
 void RegisterDescriptorFunctions(DatabaseInstance &instance);
+void mol_registration_hash(DataChunk &args, ExpressionState &state, Vector &result);
 }
diff --git a/src/mol_descriptors.cpp b/src/mol_descriptors.cpp
index aef4947..166e6a8 100644
--- a/src/mol_descriptors.cpp
+++ b/src/mol_descriptors.cpp
@@ -13,6 +13,8 @@
 #include "qed.hpp"
 #include "types.hpp"
 #include "umbra_mol.hpp"
+#include <GraphMol/MolHash/MolHash.h>
+#include <sstream>
 
 namespace duckdb_rdkit {
 
@@ -131,6 +133,33 @@ void mol_num_rotatable_bonds(DataChunk &args, ExpressionState &state, Vector &re
       });
 }
 
+// Scalar function backing `mol_registration_hash(Mol) -> VARCHAR`.
+// For every input molecule it emits three newline-separated `key:value`
+// entries -- `tautomer_smiles`, `canonical_smiles` and `mol_formula` -- each
+// produced by RDKit::MolHash::MolHash with the matching HashFunction.
+void mol_registration_hash(DataChunk &args, ExpressionState &state, Vector &result) {
+  D_ASSERT(args.data.size() == 1);
+  auto &binary_umbra_mol = args.data[0];
+  auto count = args.size();
+
+  UnaryExecutor::Execute<string_t, string_t>(
+      binary_umbra_mol, result, count, [&](string_t b_umbra_mol) {
+        auto umbra_mol = umbra_mol_t(b_umbra_mol);
+        auto bmol = umbra_mol.GetBinaryMol();
+        auto mol = rdkit_binary_mol_to_mol(bmol);
+
+        // NOTE(review): the same `mol` object is hashed three times in a row.
+        // MolHash appears to take a mutable molecule -- confirm that an earlier
+        // call cannot alter the input of a later one (else hash a fresh copy).
+        std::stringstream hash_stream;
+        hash_stream << "tautomer_smiles:" << RDKit::MolHash::MolHash(mol, RDKit::MolHash::HashFunction::HetAtomTautomer) << "\n";
+        hash_stream << "canonical_smiles:" << RDKit::MolHash::MolHash(mol, RDKit::MolHash::HashFunction::CanonicalSmiles) << "\n";
+        hash_stream << "mol_formula:" << RDKit::MolHash::MolHash(mol, RDKit::MolHash::HashFunction::MolFormula);
+
+        return StringVector::AddString(result, hash_stream.str());
+      });
+}
+
 void RegisterDescriptorFunctions(DatabaseInstance &instance) {
   ScalarFunctionSet set_mol_amw("mol_amw");
   set_mol_amw.AddFunction(
@@ -171,5 +200,10 @@ void RegisterDescriptorFunctions(DatabaseInstance &instance) {
   set_mol_num_rotatable_bonds.AddFunction(
       ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::INTEGER, mol_num_rotatable_bonds));
   ExtensionUtil::RegisterFunction(instance, set_mol_num_rotatable_bonds);
+
+  ScalarFunctionSet set_mol_registration_hash("mol_registration_hash");
+  set_mol_registration_hash.AddFunction(
+      ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::VARCHAR, mol_registration_hash));
+  ExtensionUtil::RegisterFunction(instance, set_mol_registration_hash);
 }
 } // namespace duckdb_rdkit
diff --git a/test/sql/mol_registration_hash.test b/test/sql/mol_registration_hash.test
new file mode 100644
index 0000000..a29607a
--- /dev/null
+++ b/test/sql/mol_registration_hash.test
@@ -0,0 +1,27 @@
+# Require statement will ensure this test is run with this extension loaded
+require duckdb_rdkit
+
+statement ok
+CREATE TABLE molecules (m Mol, registration_hash VARCHAR);
+
+statement ok
+INSERT INTO molecules VALUES
+  (mol_from_smiles('CCO'), null),
+  (mol_from_smiles('C1=CC=CC=C1'), null);
+
+statement ok
+UPDATE molecules SET registration_hash=mol_registration_hash(m);
+
+query I
+SELECT registration_hash FROM molecules WHERE m = mol_from_smiles('CCO');
+----
+tautomer_smiles:CCO
+canonical_smiles:CCO
+mol_formula:C2H6O
+
+query I
+SELECT registration_hash FROM molecules WHERE m = mol_from_smiles('C1=CC=CC=C1');
+----
+tautomer_smiles:c1ccccc1
+canonical_smiles:c1ccccc1
+mol_formula:C6H6