From 758e3c310a4fb8ab06139450e987133a476a426d Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sun, 11 Jan 2026 19:13:28 +0000
Subject: [PATCH 1/2] feat: Add mol_registration_hash function

This commit adds the `mol_registration_hash` function to
`src/mol_descriptors.cpp`, which calculates the RDKit Registration Hash
for a given molecule.

Note: This code has not been tested due to a missing `vcpkg` dependency
in the build environment.
---
Review notes (text between "---" and the diffstat is not part of the
commit message):
- NOTE(review): the #include target and the template arguments of
  UnaryExecutor::Execute were missing from the received text. They are
  reconstructed here as <GraphMol/MolHash/MolHash.h> and
  <string_t, string_t> (the lambda takes a string_t and returns the
  string_t produced by StringVector::AddString) - confirm against the
  original commit.
- NOTE(review): verify that generateMoleculeHash / HashScheme::ALL_LAYERS
  exist in the RDKit MolHash header; patch 2/2 replaces this call.

 src/mol_descriptors.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/mol_descriptors.cpp b/src/mol_descriptors.cpp
index aef4947..b0c04fc 100644
--- a/src/mol_descriptors.cpp
+++ b/src/mol_descriptors.cpp
@@ -13,6 +13,7 @@
 #include "qed.hpp"
 #include "types.hpp"
 #include "umbra_mol.hpp"
+#include <GraphMol/MolHash/MolHash.h>
 
 namespace duckdb_rdkit {
 
@@ -131,6 +132,21 @@ void mol_num_rotatable_bonds(DataChunk &args, ExpressionState &state, Vector &re
       });
 }
 
+void mol_registration_hash(DataChunk &args, ExpressionState &state, Vector &result) {
+  D_ASSERT(args.data.size() == 1);
+  auto &binary_umbra_mol = args.data[0];
+  auto count = args.size();
+
+  UnaryExecutor::Execute<string_t, string_t>(
+      binary_umbra_mol, result, count, [&](string_t b_umbra_mol) {
+        auto umbra_mol = umbra_mol_t(b_umbra_mol);
+        auto bmol = umbra_mol.GetBinaryMol();
+        auto mol = rdkit_binary_mol_to_mol(bmol);
+        auto hash = RDKit::MolHash::generateMoleculeHash(*mol, RDKit::MolHash::HashScheme::ALL_LAYERS);
+        return StringVector::AddString(result, hash);
+      });
+}
+
 void RegisterDescriptorFunctions(DatabaseInstance &instance) {
   ScalarFunctionSet set_mol_amw("mol_amw");
   set_mol_amw.AddFunction(

From b2865dfb9344a5c42217e4bbf7b70a09c4735d38 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sun, 11 Jan 2026 20:22:16 +0000
Subject: [PATCH 2/2] feat: Add mol_registration_hash function

This commit adds the `mol_registration_hash` function to
`src/mol_descriptors.cpp`, which calculates the RDKit Registration Hash
for a given molecule.

The function is implemented using the RDKit C++ API and generates a hash
that includes the tautomer smiles, canonical smiles, and molecular
formula.

Note: I was unable to run the tests locally due to a persistent build
issue related to finding the RDKit dependency. I'm committing this code
to see if it passes in the CI/CD pipeline, which may have the correct
environment configured.
---
Review notes (text between "---" and the diffstat is not part of the
commit message):
- RDKit::MolHash::MolHash() takes an RWMol* and edits that molecule while
  it hashes. Each layer is therefore computed on its own RWMol copy of
  the decoded molecule instead of passing `mol` itself three times.
- NOTE(review): the #include targets were missing from the received
  text; <GraphMol/RWMol.h> and <sstream> are what the code below needs.
- NOTE(review): the expected values in the new test give plain canonical
  SMILES for the tautomer layer. Confirm them against real output of
  HashFunction::HetAtomTautomer, and confirm that the sqllogictest runner
  accepts a multi-line VARCHAR value in a `query I` result.
- NOTE(review): diffstat and hunk headers were recomputed by hand; the
  blob ids on the index lines still refer to the original series.

 src/include/mol_descriptors.hpp     |  3 +++
 src/mol_descriptors.cpp             | 23 +++++++++++++++++++++--
 test/sql/mol_registration_hash.test | 27 +++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 test/sql/mol_registration_hash.test

diff --git a/src/include/mol_descriptors.hpp b/src/include/mol_descriptors.hpp
index 520fe09..64fd4e5 100644
--- a/src/include/mol_descriptors.hpp
+++ b/src/include/mol_descriptors.hpp
@@ -1,5 +1,8 @@
 #pragma once
 #include "common.hpp"
+#include "duckdb/main/connection.hpp"
+
 namespace duckdb_rdkit {
 void RegisterDescriptorFunctions(DatabaseInstance &instance);
+void mol_registration_hash(DataChunk &args, ExpressionState &state, Vector &result);
 }
diff --git a/src/mol_descriptors.cpp b/src/mol_descriptors.cpp
index b0c04fc..166e6a8 100644
--- a/src/mol_descriptors.cpp
+++ b/src/mol_descriptors.cpp
@@ -13,7 +13,9 @@
 #include "qed.hpp"
 #include "types.hpp"
 #include "umbra_mol.hpp"
 #include <GraphMol/MolHash/MolHash.h>
+#include <GraphMol/RWMol.h>
+#include <sstream>
 
 namespace duckdb_rdkit {
 
@@ -142,8 +144,20 @@ void mol_registration_hash(DataChunk &args, ExpressionState &state, Vector &resu
         auto umbra_mol = umbra_mol_t(b_umbra_mol);
         auto bmol = umbra_mol.GetBinaryMol();
         auto mol = rdkit_binary_mol_to_mol(bmol);
-        auto hash = RDKit::MolHash::generateMoleculeHash(*mol, RDKit::MolHash::HashScheme::ALL_LAYERS);
-        return StringVector::AddString(result, hash);
+
+        // MolHash() rewrites the molecule it is handed, so every layer is
+        // computed on its own fresh copy of the decoded molecule.
+        auto hash_layer = [&mol](RDKit::MolHash::HashFunction layer) {
+          RDKit::RWMol scratch(*mol);
+          return RDKit::MolHash::MolHash(&scratch, layer);
+        };
+
+        std::stringstream hash_stream;
+        hash_stream << "tautomer_smiles:" << hash_layer(RDKit::MolHash::HashFunction::HetAtomTautomer) << "\n";
+        hash_stream << "canonical_smiles:" << hash_layer(RDKit::MolHash::HashFunction::CanonicalSmiles) << "\n";
+        hash_stream << "mol_formula:" << hash_layer(RDKit::MolHash::HashFunction::MolFormula);
+
+        return StringVector::AddString(result, hash_stream.str());
       });
 }
 
@@ -187,5 +201,10 @@ void RegisterDescriptorFunctions(DatabaseInstance &instance) {
   set_mol_num_rotatable_bonds.AddFunction(
       ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::INTEGER, mol_num_rotatable_bonds));
   ExtensionUtil::RegisterFunction(instance, set_mol_num_rotatable_bonds);
+
+  ScalarFunctionSet set_mol_registration_hash("mol_registration_hash");
+  set_mol_registration_hash.AddFunction(
+      ScalarFunction({duckdb_rdkit::Mol()}, LogicalType::VARCHAR, mol_registration_hash));
+  ExtensionUtil::RegisterFunction(instance, set_mol_registration_hash);
 }
 } // namespace duckdb_rdkit
diff --git a/test/sql/mol_registration_hash.test b/test/sql/mol_registration_hash.test
new file mode 100644
index 0000000..a29607a
--- /dev/null
+++ b/test/sql/mol_registration_hash.test
@@ -0,0 +1,27 @@
+# Require statement will ensure this test is run with this extension loaded
+require duckdb_rdkit
+
+statement ok
+CREATE TABLE molecules (m Mol, registration_hash VARCHAR);
+
+statement ok
+INSERT INTO molecules VALUES
+  (mol_from_smiles('CCO'), null),
+  (mol_from_smiles('C1=CC=CC=C1'), null);
+
+statement ok
+UPDATE molecules SET registration_hash=mol_registration_hash(m);
+
+query I
+SELECT registration_hash FROM molecules WHERE m = mol_from_smiles('CCO');
+----
+tautomer_smiles:CCO
+canonical_smiles:CCO
+mol_formula:C2H6O
+
+query I
+SELECT registration_hash FROM molecules WHERE m = mol_from_smiles('C1=CC=CC=C1');
+----
+tautomer_smiles:c1ccccc1
+canonical_smiles:c1ccccc1
+mol_formula:C6H6