diff --git a/CHANGELOG.md b/CHANGELOG.md index aa70955..66eb1af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased] +### Added + +- `mol_to_rdkit_mol` to convert the duckdb_rdkit molecule into a format compatible + with RDKit + ## [0.3.0] - 2025-01-24 ### Added diff --git a/README.md b/README.md index 97eb108..87c5e87 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ This repository is based on https://github.com/duckdb/extension-template, check --- -This extension, duckdb_rdkit, allows you to use RDKit functionality within DuckDB. +This extension, duckdb_rdkit, integrates RDKit into DuckDB to enable you to do +cheminformatics work with DuckDB. ## Currently supported functionality: @@ -16,7 +17,8 @@ This extension, duckdb_rdkit, allows you to use RDKit functionality within DuckD > [!IMPORTANT] > The duckdb_rdkit molecule representation has additional metadata and cannot -> be read directly by RDKit. You will get an error. +> be read directly by RDKit. You will get an error. You can use `mol_to_rdkit_mol` +> to convert the duckdb_rdkit molecule representation into one that is RDKit compatible. - Currently, can only be created from a SMILES in a variety of ways: inserting a valid SMILES string into a column that expects Mol, type conversion such as 'CC'::mol, or the mol_from_smiles function. @@ -66,6 +68,11 @@ This extension, duckdb_rdkit, allows you to use RDKit functionality within DuckD - `mol_from_smiles(SMILES)`: returns a molecule for a SMILES string. Returns NULL if mol cannot be made from SMILES - `mol_to_smiles(mol)`: returns the SMILES string for a RDKit molecule +- `mol_to_rdkit_mol(mol)`: returns the RDKit binary molecule as a BLOB (DuckDB displays it as hex-escaped bytes) + - duckdb_rdkit has its own binary representation of molecules, which differs from RDKit’s format. 
+ Use this function to extract a molecule from duckdb_rdkit and convert it + into a format compatible with RDKit. The returned value can be passed + to RDKit's `Chem.Mol` function for further processing in Python. ### Molecule descriptors @@ -110,13 +117,15 @@ The easiest way to install RDKit is with conda, and I used [miniforge](https://g After installing conda, you can create a new conda environment and then install the packages needed. -As of August 2024, I found installing these packages worked (librdkit-dev seems to have the relevant header files). +`linux_conda_env.yml` or `osx_conda_env.yml` can be used to create a conda +environment for building the extension. ```shell # activate your conda env and then in your conda env run: conda create -n rdkit_dev conda activate rdkit_dev -conda install -c conda-forge -y boost-cpp boost cmake rdkit eigen librdkit-dev +# or use the osx_conda_env.yml if you are on osx +conda env update -f linux_conda_env.yml ``` After installing the prerequisite software, you can run: diff --git a/src/mol_formats.cpp b/src/mol_formats.cpp index ce7d26d..f4a313b 100644 --- a/src/mol_formats.cpp +++ b/src/mol_formats.cpp @@ -2,7 +2,9 @@ #include "common.hpp" #include "duckdb/common/string_util.hpp" #include "duckdb/common/types.hpp" +#include "duckdb/execution/expression_executor_state.hpp" #include "duckdb/function/function_set.hpp" +#include "duckdb/main/extension_util.hpp" #include "types.hpp" #include "umbra_mol.hpp" #include @@ -100,6 +102,18 @@ void mol_from_smiles(DataChunk &args, ExpressionState &state, Vector &result) { }); } +void mol_to_rdkit_mol(DataChunk &args, ExpressionState &state, Vector &result) { + D_ASSERT(args.data.size() == 1); + auto &umbra_mol = args.data[0]; + auto count = args.size(); + + UnaryExecutor::ExecuteWithNulls<umbra_mol_t, string_t>( + umbra_mol, result, count, + [&](umbra_mol_t umbra_mol, ValidityMask &mask, idx_t idx) { + return StringVector::AddStringOrBlob(result, umbra_mol.GetBinaryMol()); + }); +} + void
RegisterFormatFunctions(DatabaseInstance &instance) { // Register scalar functions ScalarFunctionSet mol_from_smiles_set("mol_from_smiles"); @@ -111,6 +125,11 @@ void RegisterFormatFunctions(DatabaseInstance &instance) { mol_to_smiles_set.AddFunction( ScalarFunction({Mol()}, LogicalType::VARCHAR, mol_to_smiles)); ExtensionUtil::RegisterFunction(instance, mol_to_smiles_set); + + ScalarFunctionSet mol_to_rdkit_mol_set("mol_to_rdkit_mol"); + mol_to_rdkit_mol_set.AddFunction( + ScalarFunction({Mol()}, LogicalType::BLOB, mol_to_rdkit_mol)); + ExtensionUtil::RegisterFunction(instance, mol_to_rdkit_mol_set); } } // namespace duckdb_rdkit diff --git a/test/sql/mol_conversion.test b/test/sql/mol_conversion.test index b60433c..b61fd63 100644 --- a/test/sql/mol_conversion.test +++ b/test/sql/mol_conversion.test @@ -33,7 +33,7 @@ SELECT mol_from_smiles('NOTASMILES'); NULL -# mol_to_smiles can convert a proper RDKit pickled molecule back to the SMILES +# mol_to_smiles can convert a binary molecule back to the SMILES query I SELECT mol_to_smiles(mol_from_smiles('C1=CC=CC=C1')) ---- @@ -45,4 +45,10 @@ SELECT 'C1=CC=CC=C1'::mol; ---- c1ccccc1 +# mol_to_rdkit_mol can convert the internal representation of a molecule in +# duckdb (umbra_mol) to the RDKit molecule. This will output hex representation +query I +SELECT mol_to_rdkit_mol('C'::mol); +---- +\xEF\xBE\xAD\xDE\x00\x00\x00\x00\x10\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x80\x01\x06\x00@\x00\x00\x00\x04\x0BB\x00\x00\x00\x00\x17\x04\x00\x00\x00\x00\x00\x00\x00\x16