EASMS-data-processing/ExtractingFingerprints.py at main · StructuralGenomicsConsortium/EASMS-data-processing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from fingerprints import HitGenMACCS, HitGenECFP4, HitGenECFP6, HitGenFCFP4, HitGenFCFP6, HitGenRDK, HitGenAvalon, HitGenTopTor, HitGenAtomPair

def compute_molecular_properties(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        mw = Descriptors.MolWt(mol)
        alogp = Descriptors.MolLogP(mol)
    else:
        mw = np.nan
        alogp = np.nan
    return mw, alogp

def generate_fingerprints(smiles, fps_dict):
    fp_data = {}
    for fp_name, fp_class in fps_dict.items():
        try:
            fp_array = fp_class.generate_fps(smis=[smiles]).flatten()
            fp_data[fp_name] = ','.join(map(str, fp_array))
        except Exception:
            fp_data[fp_name] = ','.join(['nan'] * fp_class._dimension)
    return fp_data

def process_file(input_file, output_file, fingerprints, nrows=None):
    # Read the file
    df = pd.read_csv(input_file, nrows=nrows)

    '''# Add MW and ALOGP columns
    df[['MW', 'ALOGP']] = df['SMILES (Compounds)'].apply(
        lambda smi: pd.Series(compute_molecular_properties(smi))
    )'''

    # Generate fingerprint columns
    fingerprint_data = []
    for smiles in df['smiles']:
        fps = generate_fingerprints(smiles, fingerprints)
        fingerprint_data.append(fps)

    # Create a DataFrame from fingerprint data
    fingerprint_df = pd.DataFrame(fingerprint_data)

    # Concatenate fingerprint data with the main DataFrame
    df = pd.concat([df, fingerprint_df], axis=1)

    # Save the new DataFrame to a CSV file
    df.to_csv(output_file, index=False)

    print(f"The updated file with fingerprints has been saved as '{output_file}'")

def main():
    # Define fingerprint classes
    # You can also extract the binary versions (look at the fingerprints.py )
    fingerprint_classes = {
        'ECFP4': HitGenECFP4(),
        'ECFP6': HitGenECFP6(),
        'FCFP4': HitGenFCFP4(),
        'FCFP6': HitGenFCFP6(),
        'MACCS': HitGenMACCS(),
        'RDK': HitGenRDK(),
        'AVALON': HitGenAvalon(),
        'TOPTOR': HitGenTopTor(),
        'ATOMPAIR': HitGenAtomPair()
    }

    nrows = None
    input_file = r"D:\0000-UHN\03-DataAndCodes\AIRCHECK-workflow\SimpleML\Bootcamp\Data\21Feb\ASMS_hits_clustered.csv"
    output_file = r"D:\0000-UHN\03-DataAndCodes\AIRCHECK-workflow\SimpleML\Bootcamp\Data\21Feb\ASMS_hits_clustered_with_fingerprints.csv"
    process_file(input_file, output_file, fingerprint_classes, nrows=nrows)

    '''input_file = "James_hits.csv"
    output_file = "James_hits_fingerprints.csv"
    process_file(input_file, output_file, fingerprint_classes, nrows=nrows)
    input_file = "ASMS_hits.csv"
    output_file = "ASMS_hits_fingerprints.csv"
    process_file(input_file, output_file, fingerprint_classes, nrows=nrows)'''

    '''
    fingerprint_classes = {
        'ECFP6': HitGenECFP4()
    }
    nrows = None # Use None to process the entire file
    # Example usage
    input_file = "ASMS_460K.csv"
    output_file = "ASMS_460K_with_ECFP6.csv"
    process_file(input_file, output_file, fingerprint_classes, nrows=nrows)
    '''


if __name__ == "__main__":
    main()