-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeggMeta.py
132 lines (116 loc) · 4.05 KB
/
keggMeta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/python3
import os.path
import urllib
import json
import requests
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import inchi
from urllib.error import HTTPError
from requests.exceptions import Timeout
import time
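'''
csv            : path to the kegg_compounds library file
file           : output file; receives each compound's classification and KEGG ID
wantedNameFile : identification results; at least one column must have the header "Name",
                 and that column must contain the compound names
round          : number of iterations (default 20), intended to let the run survive
                 disconnections caused by network problems
'''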
# --- Configuration ---------------------------------------------------------
# NOTE: `csv` shadows the stdlib module name and `file` / `round` shadow
# builtins.  The names are kept as-is because `file` and `round` are read by
# the main loop further down in this script.
csv = r"D:\WeChatStory\WeChat Files\wxid_6ydkydyqkwwb21\FileStorage\File\2023-11\kegg_compounds.csv"
file = r"D:\WeChatStory\WeChat Files\wxid_6ydkydyqkwwb21\FileStorage\File\2023-11\out2.txt"
wantedNameFile = r"C:\Users\yuxik\Desktop\LW\gwas\learnTTF\工作簿12.txt"
round = 20

# --- KEGG lookup table -----------------------------------------------------
# Load the KEGG compound table and add an 'inchikey' column computed from its
# 'smile' column.  Entries that cannot be converted get the empty string.
kegg = pd.read_csv(csv)
kegg_inchikeys = []
for smi in kegg['smile'].values:
    try:
        mol = Chem.MolFromSmiles(smi)
        # Check the parse result explicitly instead of relying on
        # MolToInchiKey raising when handed None.
        inchikey = inchi.MolToInchiKey(mol) if mol is not None else ''
    except Exception:
        # Non-string cells or toolkit failures.  `Exception` (not a bare
        # except) so Ctrl-C still interrupts a long table conversion.
        inchikey = ''
    kegg_inchikeys.append(inchikey)
kegg['inchikey'] = kegg_inchikeys
def openfile(path: str):
    """Return the meaningful lines of a UTF-8 text file.

    The file is read in one go and split on '\\n'.  Empty lines and lines
    whose first character is '#' are dropped; every other line is returned
    unchanged, in file order.

    Args:
        path: Location of the text file to read.

    Returns:
        A list of the remaining lines (without trailing newlines).
    """
    with open(path, 'r', encoding='utf-8') as handle:
        content = handle.read()
    return [
        row
        for row in content.split('\n')
        if row and not row.startswith('#')
    ]
def get_smiles_from_name(name: str, max_retries=None, retry_delay: float = 1.0):
    """Look up a compound's isomeric SMILES on PubChem (PUG REST) by name.

    Args:
        name: Compound name to search for.
        max_retries: Maximum number of retries after a request timeout.
            ``None`` (the default) keeps retrying until PubChem answers,
            which matches the previous behaviour but without recursing.
        retry_delay: Seconds to wait before each retry.

    Returns:
        The SMILES string, or ``None`` when ``name`` is empty, PubChem
        answers with an HTTP error status (e.g. unknown name), or the
        response carries no SMILES value.

    Raises:
        requests.exceptions.Timeout: If ``max_retries`` is exhausted.
        requests.exceptions.RequestException: Other transport errors
            (e.g. connection failures) propagate to the caller unchanged.
    """
    # Local import: the file only does a bare `import urllib`, which does not
    # guarantee that the `parse` submodule is loaded.
    import urllib.parse

    if not isinstance(name, str) or not name.strip():
        # Nothing to search for; avoid a pointless request.
        return None

    base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound"
    # Percent-encode the name so characters such as '/', '#', '?' or spaces
    # cannot change the structure of the URL.
    operation = "name/" + urllib.parse.quote(name, safe='') + "/property/IsomericSMILES/JSON"
    url = f"{base_url}/{operation}"

    attempt = 0
    while True:
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
        except Timeout:
            if max_retries is not None and attempt >= max_retries:
                raise
            attempt += 1
            print("请求超时,正在重试...")
            time.sleep(retry_delay)
            continue
        except requests.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
            return None
        break

    try:
        properties = response.json()["PropertyTable"]["Properties"][0]
        # NOTE(review): newer PUG REST responses may label this value
        # "SMILES" instead of "IsomericSMILES" - confirm against the PubChem
        # documentation; both keys are accepted here.
        return properties.get('IsomericSMILES') or properties.get('SMILES')
    except (KeyError, IndexError, TypeError, AttributeError):
        # Well-formed JSON without the expected table: treat as "not found".
        return None
def get_class_from_smiles(smiles: str, timeout: float = 30):
    """Ask the NPClassifier web service for the class of a structure.

    Args:
        smiles: SMILES string of the compound.
        timeout: Seconds to wait for the service before giving up, so that a
            stalled connection cannot hang the whole run.

    Returns:
        The first entry of the service's ``class_results`` list, or ``None``
        when ``smiles`` is empty/not a string, the service reports no class,
        or the request fails (failures other than HTTP 404 are printed).
    """
    # Local imports: the file only does a bare `import urllib`, which does
    # not guarantee that these submodules are loaded.
    import urllib.parse
    import urllib.request

    if not isinstance(smiles, str) or not smiles:
        # Nothing to classify (e.g. the name lookup returned None).
        return None

    try:
        safe_smiles = urllib.parse.quote(smiles)
        base_url = "https://npclassifier.ucsd.edu/classify?smiles="
        url = base_url + safe_smiles
        with urllib.request.urlopen(url, timeout=timeout) as inf:
            payload = inf.read()
        class_results = json.loads(payload).get('class_results') or []
        # An empty list means "no class assigned"; that is a valid answer,
        # not an error.
        return class_results[0] if class_results else None
    except HTTPError as e:
        if e.code != 404:
            print("An HTTP error occurred:", e)
        return None
    except Exception as e:
        # Best-effort lookup: report and carry on with the next compound.
        print("An error occurred:", e)
        return None
def get_keggid_from_smiles(smiles: str):
    """Find the KEGG ID whose InChIKey matches the given structure.

    The SMILES is converted to an InChIKey and compared against the
    'inchikey' column of the module-level ``kegg`` table.

    Args:
        smiles: SMILES string of the compound.

    Returns:
        The 'ID' value of the first matching row of ``kegg``, or ``None``
        when the SMILES is empty/unparsable or no row matches.
    """
    if not isinstance(smiles, str) or not smiles:
        return None
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        inchikey = inchi.MolToInchiKey(mol)
    except Exception:
        return None
    if not inchikey:
        # KEGG rows whose own conversion failed are stored with '' as their
        # key; an empty query key must never match those rows.
        return None
    w = np.where(kegg['inchikey'].values == inchikey)
    if len(w[0]) > 0:
        return kegg['ID'].values[w[0][0]]
    return None
# --- Main driver -----------------------------------------------------------
# For every compound name in the input table, look up its SMILES, its
# NPClassifier class and its KEGG ID, and append one tab-separated record
# "<index>\t<name>\t<class>\t<kegg id>" to the output file.  Progress is
# recovered from the output file, so an interrupted run resumes where it
# stopped; up to `round` rounds are attempted when the network fails.
out = []
df = pd.read_table(wantedNameFile)
names = df['Name'].tolist()
nn = 0
n = 0
for times in range(round):
    # Resume right after the last record already on disk.  The file is read
    # once per round; an existing-but-empty file simply means "start at n".
    if os.path.exists(file):
        done = openfile(file)
        if done:
            last_fields = done[-1].split('\t')
            n = int(last_fields[0])
            if len(last_fields) < 4:
                # Last record was cut short: redo that compound.
                n -= 1
    try:
        # If the output already covers every name, the slice is empty and
        # the run finishes without appending duplicates.
        for name in names[n:]:
            name = name.replace('"', '').replace("'", "")
            smiles = get_smiles_from_name(name)
            if smiles is None:
                # No structure found.  A record is still written so that the
                # index column keeps matching the input row (the resume
                # logic above depends on it); the lookups are skipped.
                npclass = None
                keggid = None
            else:
                npclass = get_class_from_smiles(smiles)
                keggid = get_keggid_from_smiles(smiles)
            n += 1
            record = f"{n}\t{name}\t{npclass}\t{keggid}"
            print(record)
            out.append(record)
            # Persist before pausing so an interruption during the pause
            # does not lose the record.
            with open(file, 'a', encoding='utf-8') as f:
                f.write(record + '\n')
            time.sleep(5)  # be polite to the web services
    except requests.exceptions.RequestException as err:
        # Network trouble: wait, then let the next round resume from disk.
        print(f"Network error in round {times + 1}/{round}, resuming: {err}")
        time.sleep(5)
    else:
        print('All done')
        raise SystemExit(0)