-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCreate_ProtacKB.py
125 lines (93 loc) · 3.71 KB
/
Create_ProtacKB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import getopt
import os
import random
import sys
import pandas as pd
from py2neo import Node, Relationship
from py2neo.database import Transaction
from tqdm import tqdm
from connection import populate_db
from constants import ENCODING, DATA_DIR
def add_nodes(tx: Transaction, protacdb_df: pd.DataFrame):
    """Create one ``Protein`` node per target and one ``E3`` node per ligase.

    Rows are scanned in frame order, first for proteins and then for ligases.
    Only the first row seen for a given key creates a node, so the node's
    remaining properties come from that first row; later rows with the same
    key are skipped. Cells for which ``pd.notna`` is false are left out of the
    node's properties.

    :param tx: transaction on which ``create`` is called for every new node
    :param protacdb_df: frame providing the columns "Target", "Smiles",
        "Article DOI", "E3 ligase", "Molecular Formula" and "InChI"
    :return: ``{"Protein": {target: Node}, "E3": {ligase: Node}}``
    """
    # (node label, source columns, property name for each column); the first
    # column of every spec is the de-duplication key.
    node_specs = (
        (
            "Protein",
            ["Target", "Smiles", "Article DOI"],
            ("target name", "smiles", "article reference"),
        ),
        (
            "E3",
            ["E3 ligase", "Molecular Formula", "InChI"],
            ("E3 ligase", "mol formula", "InChi"),
        ),
    )

    node_dict = {label: {} for label, _, _ in node_specs}

    for label, columns, prop_names in node_specs:
        seen = node_dict[label]
        for row in tqdm(protacdb_df[columns].values, total=protacdb_df.shape[0]):
            cells = tuple(row)
            key = cells[0]
            if key in seen:
                continue

            # Keep only the cells that actually hold a value.
            props = {
                name: cell
                for name, cell in zip(prop_names, cells)
                if pd.notna(cell)
            }
            seen[key] = Node(label, **props)
            tx.create(seen[key])

    return node_dict
def main():
    """Load the ProtacDB CSV and write its Protein and E3 nodes to the graph.

    Takes the object returned by ``populate_db("proxitest")``, reads
    ``protacDB.csv`` from ``DATA_DIR`` with every selected column parsed as
    ``str``, passes the frame to ``add_nodes`` and then commits.
    """
    wanted_columns = [
        "Compound ID", "Uniprot", "Target", "E3 ligase", "Name", "Smiles",
        "DC50 (nM)", "Dmax (%)", "Assay (DC50/Dmax)", "Percent degradation (%)",
        "Assay (Percent degradation)", "Article DOI", "Molecular Weight",
        "Exact Mass", "logP", "logS", "Heavy Atom Count", "Ring Count",
        "Hydrogen Bond Acceptor Count", "Hydrogen Bond Donor Count",
        "Rotatable Bond Count", "Topological Polar Surface Area", "Molecular Formula",
        "InChI", "InChI Key",
    ]

    tx = populate_db("proxitest")
    # tx.deleteall('proxitest')

    csv_path = os.path.join(DATA_DIR, "protacDB.csv")
    protacdb_df = pd.read_csv(
        csv_path,
        usecols=wanted_columns,
        dtype=str,
        encoding=ENCODING,
    )
    # pd.set_option("display.max_columns", None)

    # The returned label -> {key: Node} mapping is kept but not used further here.
    nodes_dict = add_nodes(tx=tx, protacdb_df=protacdb_df)
    tx.commit()
# Module-level load of the ProtacDB table, every selected column read as text.
# NOTE(review): `test` is read by the `__main__` section at the bottom of the
# file, so it is treated as a top-level statement here - confirm.
test = pd.read_csv(
    os.path.join(DATA_DIR, "protacDB.csv"),
    encoding=ENCODING,
    dtype=str,
    usecols=[
        "Compound ID",
        "Uniprot",
        "Target",
        "E3 ligase",
        "Name",
        "Smiles",
        "DC50 (nM)",
        "Dmax (%)",
        "Assay (DC50/Dmax)",
        "Percent degradation (%)",
        "Assay (Percent degradation)",
        "Article DOI",
        "Molecular Weight",
        "Exact Mass",
        "logP",
        "logS",
        "Heavy Atom Count",
        "Ring Count",
        "Hydrogen Bond Acceptor Count",
        "Hydrogen Bond Donor Count",
        "Rotatable Bond Count",
        "Topological Polar Surface Area",
        "Molecular Formula",
        "InChI",
        "InChI Key",
    ],
)
# get2nodes = test.groupby(['Target', 'E3 ligase']).size().reset_index().rename(columns={0: 'count'})
# print(get2nodes.head(5))
# getComb(protacdb_df)
def add_rel(
    tx: Transaction,
    get2nodes: pd.DataFrame,
    node_mapping_dict: dict
):
    """Placeholder for adding relationships to the graph; not implemented.

    The definition previously had no body at all, which is a syntax error and
    prevented the module from being imported or run. It now fails loudly when
    called instead.

    :param tx: transaction the relationships would be created on
    :param get2nodes: presumably the Target / E3 ligase pair counts computed
        in the ``__main__`` section - TODO confirm
    :param node_mapping_dict: presumably the mapping returned by
        ``add_nodes`` - TODO confirm
    :raises NotImplementedError: always
    """
    raise NotImplementedError("add_rel is not implemented yet")
if __name__ == '__main__':
    # main()
    # getComb(protacdb_df)

    # Number of rows per (Target, E3 ligase) pair, flattened back into
    # ordinary columns with the size column named 'count'.
    get2nodes = (
        test
        .groupby(['Target', 'E3 ligase'])
        .size()
        .reset_index()
        .rename(columns={0: 'count'})
    )
    # print(get2nodes.head(5))

    # Show the resulting column names and the first five ligase entries.
    print(list(get2nodes.columns))
    print(get2nodes['E3 ligase'][:5])