-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfingerprint_extraction.py
More file actions
92 lines (76 loc) · 2.85 KB
/
fingerprint_extraction.py
File metadata and controls
92 lines (76 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 2 17:29:20 2025
@author: shagh
"""
import logging

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

from fingerprints import HitGenMACCS, HitGenECFP4, HitGenECFP6, HitGenFCFP4, HitGenFCFP6, HitGenRDK, HitGenAvalon, HitGenTopTor, HitGenAtomPair
def compute_molecular_properties(smiles):
    """
    Computes molecular properties (MW, ALOGP) for a given SMILES.

    Args:
        smiles (str): The input SMILES. Non-string values (e.g. None or the
            float NaN pandas uses for missing cells) are treated as invalid.

    Returns:
        tuple: ``(mw, alogp)`` where ``mw`` is ``Descriptors.MolWt(mol)`` and
            ``alogp`` is ``Descriptors.MolLogP(mol)``. Both are ``np.nan``
            when the input is not a string or no molecule could be built.
    """
    # Missing cells arrive as None / float NaN, which are not valid arguments
    # for MolFromSmiles; report them as NaN instead of aborting the caller's
    # whole loop, matching how fingerprint failures are handled.
    if not isinstance(smiles, str):
        return np.nan, np.nan

    mol = Chem.MolFromSmiles(smiles)
    # A falsy result (None) means the SMILES could not be parsed.
    if not mol:
        return np.nan, np.nan

    return Descriptors.MolWt(mol), Descriptors.MolLogP(mol)
def generate_fingerprints(smiles, fps_dict):
    """
    Generates fingerprints for a given SMILES string.

    Each fingerprint is serialised as a comma-separated string of its
    flattened values. If a generator fails, the failure is logged as a
    warning and the entry is filled with ``_dimension`` copies of ``'nan'``
    so that one bad molecule does not stop the caller.

    Args:
        smiles (str): The input SMILES.
        fps_dict (dict): Dictionary mapping fingerprint names to fingerprint
            objects exposing ``generate_fps(smis=...)`` and ``_dimension``.

    Returns:
        dict: Dictionary with fingerprint names as keys and fingerprint data
            (comma-separated strings) as values.
    """
    logger = logging.getLogger(__name__)
    fp_data = {}
    for fp_name, fp_class in fps_dict.items():
        try:
            fp_array = fp_class.generate_fps(smis=[smiles]).flatten()
            fp_data[fp_name] = ','.join(map(str, fp_array))
        except Exception:
            # Deliberately best-effort: keep going, but leave a trace so the
            # NaN placeholders can be tied back to the molecule and the error.
            logger.warning(
                "Fingerprint %s failed for SMILES %r; filling with NaN",
                fp_name,
                smiles,
                exc_info=True,
            )
            fp_data[fp_name] = ','.join(['nan'] * fp_class._dimension)
    return fp_data
def extract_fingerprints(df):
    """
    Extracts molecular fingerprints and molecular properties for a given DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing a "SMILES" column.

    Returns:
        pd.DataFrame: New DataFrame with the original columns followed by
            "MW", "ALOGP" and one column per fingerprint type. Rows keep the
            index of the input DataFrame.

    Raises:
        ValueError: If ``df`` has no "SMILES" column.
    """
    # Ensure the 'SMILES' column exists
    if "SMILES" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'SMILES' column")

    # Define fingerprint classes
    fingerprint_classes = {
        'ECFP4': HitGenECFP4(),
        'ECFP6': HitGenECFP6(),
        'FCFP4': HitGenFCFP4(),
        'FCFP6': HitGenFCFP6(),
        'MACCS': HitGenMACCS(),
        'RDK': HitGenRDK(),
        'AVALON': HitGenAvalon(),
        'TOPTOR': HitGenTopTor(),
        'ATOMPAIR': HitGenAtomPair(),
    }

    # Compute fingerprints and molecular properties, one row per SMILES
    fingerprint_data = []
    molecular_props = []
    for smiles in df["SMILES"]:
        fingerprint_data.append(generate_fingerprints(smiles, fingerprint_classes))
        mw, alogp = compute_molecular_properties(smiles)
        molecular_props.append({"MW": mw, "ALOGP": alogp})

    # Build the new frames on the caller's index: concat(axis=1) aligns on
    # index labels, so a default RangeIndex here would misalign rows whenever
    # df was filtered, sorted or otherwise re-indexed. Explicit columns keep
    # the output schema stable even when df has no rows.
    molecular_props_df = pd.DataFrame(
        molecular_props, index=df.index, columns=["MW", "ALOGP"]
    )
    fingerprint_df = pd.DataFrame(
        fingerprint_data, index=df.index, columns=list(fingerprint_classes)
    )

    # Concatenate the original DataFrame with molecular properties and fingerprints
    return pd.concat([df, molecular_props_df, fingerprint_df], axis=1)