EASMS-data-processing/fingerprint_extraction.py at main · StructuralGenomicsConsortium/EASMS-data-processing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
"""
Created on Sun Mar  2 17:29:20 2025

@author: shagh
"""

import logging

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

from fingerprints import HitGenMACCS, HitGenECFP4, HitGenECFP6, HitGenFCFP4, HitGenFCFP6, HitGenRDK, HitGenAvalon, HitGenTopTor, HitGenAtomPair

def compute_molecular_properties(smiles):
    """
    Computes molecular properties (MW, ALOGP) for a given SMILES.

    Args:
        smiles (str): The input SMILES. Non-string values (for example
            ``None`` or the float NaN that pandas uses for missing cells)
            are treated as unparseable.

    Returns:
        tuple: ``(mw, alogp)`` as returned by ``Descriptors.MolWt`` and
            ``Descriptors.MolLogP``, or ``(np.nan, np.nan)`` when no
            molecule could be built from the input.
    """
    # Guard before touching RDKit: MolFromSmiles expects a string and raises
    # on other types rather than returning None, which would abort a whole
    # batch because of a single missing value.
    if not isinstance(smiles, str):
        return np.nan, np.nan

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit could not parse the SMILES.
        return np.nan, np.nan

    return Descriptors.MolWt(mol), Descriptors.MolLogP(mol)

def generate_fingerprints(smiles, fps_dict):
    """
    Generates fingerprints for a given SMILES string.

    Generation is best-effort: if a fingerprint cannot be computed, a warning
    is logged and a placeholder of ``'nan'`` entries is stored instead, so one
    bad molecule does not abort the whole batch.

    Args:
        smiles (str): The input SMILES.
        fps_dict (dict): Maps a fingerprint name to a generator object that
            exposes ``generate_fps(smis=[...])`` and a ``_dimension``
            attribute (the number of placeholder entries to emit on failure).

    Returns:
        dict: Dictionary with fingerprint names as keys and the fingerprint
            values joined into a comma-separated string as values.
    """
    log = logging.getLogger(__name__)
    fp_data = {}
    for fp_name, fp_class in fps_dict.items():
        try:
            fp_array = fp_class.generate_fps(smis=[smiles]).flatten()
            fp_data[fp_name] = ','.join(map(str, fp_array))
        except Exception as exc:
            # Keep the NaN fallback, but leave a trace so silent data loss
            # can be diagnosed afterwards.
            log.warning(
                "Fingerprint %s failed for SMILES %r: %s", fp_name, smiles, exc
            )
            fp_data[fp_name] = ','.join(['nan'] * fp_class._dimension)
    return fp_data

def extract_fingerprints(df):
    """
    Extracts molecular fingerprints and molecular properties for a given DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing a "SMILES" column. Its
            index may be arbitrary (for example the result of filtering a
            larger frame); the input is not modified.

    Returns:
        pd.DataFrame: New DataFrame with the original columns followed by
            "MW" and "ALOGP" and one column per fingerprint type, row-aligned
            with the input and carrying the same index.

    Raises:
        ValueError: If the DataFrame has no "SMILES" column.
    """

    # Ensure the 'SMILES' column exists
    if "SMILES" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'SMILES' column")

    # Fingerprint generators, keyed by the output column name
    fingerprint_classes = {
        'ECFP4': HitGenECFP4(),
        'ECFP6': HitGenECFP6(),
        'FCFP4': HitGenFCFP4(),
        'FCFP6': HitGenFCFP6(),
        'MACCS': HitGenMACCS(),
        'RDK': HitGenRDK(),
        'AVALON': HitGenAvalon(),
        'TOPTOR': HitGenTopTor(),
        'ATOMPAIR': HitGenAtomPair(),
    }
    property_columns = ["MW", "ALOGP"]

    # Compute fingerprints and molecular properties, one record per row
    fingerprint_data = []
    molecular_props = []
    for smiles in df["SMILES"]:
        fingerprint_data.append(generate_fingerprints(smiles, fingerprint_classes))
        mw, alogp = compute_molecular_properties(smiles)
        molecular_props.append({"MW": mw, "ALOGP": alogp})

    # Build the new frames on the caller's index: concat(axis=1) aligns on
    # index labels, so a default RangeIndex would misalign rows (and pad with
    # NaN) whenever df's index is not 0..n-1. Passing explicit columns keeps
    # the output schema stable even when df has no rows.
    fingerprint_df = pd.DataFrame(
        fingerprint_data, index=df.index, columns=list(fingerprint_classes)
    )
    molecular_props_df = pd.DataFrame(
        molecular_props, index=df.index, columns=property_columns
    )

    # Concatenate the original DataFrame with molecular properties and fingerprints
    return pd.concat([df, molecular_props_df, fingerprint_df], axis=1)