-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCTD_disease_distance.py
executable file
·123 lines (103 loc) · 3.54 KB
/
CTD_disease_distance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import sys
import numpy as np
import networkx as nx
import itertools as it
import random as rd
import scipy.stats as st
import os.path
import pandas as pd
from collections import (defaultdict,Counter)
import time
import matplotlib.pyplot as plt
import pickle as pk
# Load the CTD chemical-gene interaction table and keep only the rows whose
# organism is Homo sapiens.
chem_gene_df = pd.read_csv(
    "input/CTD/CTD_chem_gene_ixns.tsv",
    delimiter='\t',
    skipinitialspace=True,
)
chem_homo = chem_gene_df[chem_gene_df['Organism'] == 'Homo sapiens']

# Group the rows by chemical: ChemicalID -> set of the GeneSymbol values
# that appear together with it.
chem_gene = {}
for _, row in chem_homo.iterrows():
    chem_gene.setdefault(row["ChemicalID"], set()).add(row["GeneSymbol"])

# Here, we remove the elements that do not perturb any genes.
# (Every set built above already holds at least one symbol, so this filter
# acts as a safeguard.)
chem_gene_cleaned = {
    chem: genes for chem, genes in chem_gene.items() if len(genes) > 0
}

# Flat list of the gene symbols of all retained chemicals; a symbol is
# repeated once per chemical it is attached to.
tot_gene_list = [
    gene for genes in chem_gene_cleaned.values() for gene in genes
]
# Read the protein-protein interaction edge list and build an undirected
# graph from its 'symbol1' / 'symbol2' columns.
ppi = pd.read_csv(
    "input/PPI/autocore_symbol_lcc.csv",
    delimiter=',',
    skipinitialspace=True,
)
G_ppi = nx.from_pandas_edgelist(ppi, 'symbol1', 'symbol2')

# Extract the largest connected component (LCC) and report its size.
largest_component = max(nx.connected_components(G_ppi), key=len)
G_ppi_lcc = G_ppi.subgraph(largest_component)
print(G_ppi_lcc.number_of_nodes())
print(G_ppi_lcc.number_of_edges())

# For every chemical keep only the genes that are nodes of the LCC;
# chemicals left without any such gene are dropped altogether.
chem_gene_dictio_cleaned_ppi = {}
for chem, genes in chem_gene_cleaned.items():
    genes_in_lcc = [gene for gene in genes if gene in G_ppi_lcc.nodes()]
    if genes_in_lcc:
        chem_gene_dictio_cleaned_ppi[chem] = genes_in_lcc
# Loading the gene associations and keeping only those whose score
# exceeds 0.3.
gene_associations = pd.read_csv(
    "input/Disease/all_gene_disease_associations.tsv",
    delimiter='\t',
    skipinitialspace=True,
)
strong_enough = gene_associations['score'] > 0.3
gene_associations_filtered = gene_associations[strong_enough]

# Group the retained rows by disease: diseaseName -> list of geneSymbol
# values, in row order (duplicates are kept).
diseases_genes_associated = {}
for _, row in gene_associations_filtered.iterrows():
    diseases_genes_associated.setdefault(row["diseaseName"], []).append(
        row["geneSymbol"]
    )

# Keep, for every disease, only the genes that are nodes of the PPI LCC;
# diseases left without any such gene are dropped.
diseases_genes_associated_ppi = {}
for disease, genes in diseases_genes_associated.items():
    genes_in_lcc = [gene for gene in genes if gene in G_ppi_lcc.nodes()]
    if genes_in_lcc:
        diseases_genes_associated_ppi[disease] = genes_in_lcc
# Load the spl dictionary from the intermediate pickle file. It is indexed
# below with (node, node) pairs to look up a distance between two nodes.
with open('intermediate/ppi_spl.pickle', 'rb') as spl_file:
    spl = pk.load(spl_file)
def calculate_closest_distance(spl, nodes_from, nodes_to):
    """Return the mean closest distance from ``nodes_from`` to ``nodes_to``.

    For every node in ``nodes_from`` the smallest distance to any node in
    ``nodes_to`` is determined; the result is the mean of those minima.

    Args:
        spl: Lookup indexed by a ``(node, node)`` pair that yields the
            distance between the two nodes. A pair may be stored in either
            orientation; both are tried.
        nodes_from: Iterable of source nodes.
        nodes_to: Re-iterable collection of target nodes (it is traversed
            once per source node).

    Returns:
        The ``numpy.mean`` of the per-source minimum distances. A node that
        appears in both collections contributes a distance of 0. If
        ``nodes_from`` is empty, ``numpy.mean`` of an empty list (NaN) is
        returned.

    Raises:
        ValueError: If ``nodes_to`` is empty while ``nodes_from`` is not.
        LookupError: If a pair is missing from ``spl`` in both orientations
            (a ``KeyError`` when ``spl`` is a dict).
    """

    def _pair_distance(node_from, node_to):
        # Distance between two nodes, trying both key orientations.
        if node_from == node_to:
            return 0
        try:
            return spl[node_from, node_to]
        except LookupError:
            # Only a missing key justifies the reversed lookup; any other
            # error (or an interrupt) must propagate instead of being hidden.
            return spl[node_to, node_from]

    closest = [
        min(_pair_distance(node_from, node_to) for node_to in nodes_to)
        for node_from in nodes_from
    ]
    return np.mean(closest)
# Drop 'nan' placeholders from every gene list once, up front. The filtering
# does not depend on the (chemical, disease) pairing, so doing it inside the
# nested loop would rebuild the same lists for every pair.
exp_gene_lists_cleaned = {
    exp: [x for x in genes if str(x) != 'nan']
    for exp, genes in chem_gene_dictio_cleaned_ppi.items()
}
dis_gene_lists_cleaned = {
    dis: [x for x in genes if str(x) != 'nan']
    for dis, genes in diseases_genes_associated_ppi.items()
}

# (chemical, disease) -> mean closest distance from the chemical's genes to
# the disease's genes.
exp_dis_distance = {}
for exp, exp_gene_list_cleaned in exp_gene_lists_cleaned.items():
    for dis, dis_gene_list_cleaned in dis_gene_lists_cleaned.items():
        exp_dis_distance[exp, dis] = calculate_closest_distance(
            spl, exp_gene_list_cleaned, dis_gene_list_cleaned
        )

# Persist the distance table for later steps.
with open('intermediate/exp_disease_distance.pickle', 'wb') as handle:
    pk.dump(exp_dis_distance, handle, protocol=pk.HIGHEST_PROTOCOL)