-
Notifications
You must be signed in to change notification settings - Fork 0
/
propagation.py
80 lines (63 loc) · 3.06 KB
/
propagation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import networkx as nx
import numpy as np
import pandas as pd
import sys
import tqdm
parameter = sys.argv[1]
ppi = nx.read_gml("data/ppi.gml")
# list out the nodes in the network
nodelist = list(ppi.nodes())
#
unnormalized_propagator = nx.to_scipy_sparse_array(ppi, nodelist = nodelist)
import scipy.sparse
# make a diagonal matrix to rescale the rows of the matrix
# making a random walk adjacency matrix
degree_array = [1/ppi.degree(i) for i in nodelist]
degree_normalization = scipy.sparse.diags(degree_array)
# apply it to the unnormalize propagator to make
# the one-step random walk propagator
propagator = degree_normalization @ unnormalized_propagator
# build up the random walk with restart operatorr
beta = int(parameter)/100
random_walker = scipy.sparse.eye(len(nodelist))
cum_propagator = scipy.sparse.eye(len(nodelist))
for i in range(6):
print(i)
cum_propagator = propagator @ cum_propagator
random_walker += (beta**i)*cum_propagator
random_walker = (1-beta)*random_walker
mutation_unstacked = pd.read_csv("data/HMF_somatic_PASS_GENE_HIGH.txt", sep="\t", header=None)
mutation_unstacked[4] = 1
mutation = pd.pivot_table(mutation_unstacked, index=0, columns=3, values=4).T
mutation = mutation.reindex(nodelist).fillna(0.0)
smoothed_mutations = []
for i in tqdm.tqdm(mutation.columns):
smoothed_mutations.append(random_walker * mutation.loc[:,i])
smoothed_mutation = pd.DataFrame(np.vstack(smoothed_mutations).T, index = mutation.index, columns=mutation.columns)
# removed rows and columns with all zeros
# representing samples with no mutations in PPI genes and
# isolated, unmutated genes in the PPI respectively
smoothed_mutation = smoothed_mutation.T[(smoothed_mutation**2).sum(axis=0) > 0].T
smoothed_mutation = smoothed_mutation[(smoothed_mutation**2).sum(axis=1) > 0]
import sklearn.decomposition
# PCA to find the characteristic dimensions of
# "mutation space"
pca = sklearn.decomposition.PCA(n_components=50)
# transfomred features are per sample and are the
# condensed representation of the mutational profile of each sample
transformed_features = pd.DataFrame(pca.fit_transform(smoothed_mutation), index=smoothed_mutation.index)
transformed_features.to_csv(f"data/pca_smooth_features_"+parameter+".csv")
# the compnents represent which genes'mutationns comttribute the
# prrofile of eacch sample
components = pd.DataFrame(pca.components_, columns=smoothed_mutation.columns)
components.to_csv("data/pca_components_"+parameter+".csv")
draw=False
if draw:
os.makedirs("figures/smoothing_"+parameter, exist_ok = True)
# replace the ensembl ids in the collumnns with hgnc gene symbols
smoothed_mutation_symbol = smoothed_mutation.copy()
smoothed_mutation_symbol.columns = smoothed_mutation.columns.map(lambda x : ensembl_symbol.get(x, "Pass"))
# plot the relatiosnhip between genes and samples after smoothng
sns.clustermap(smoothed_mutation_symbol, vmax=0.05, metric='cosine')
plt.savefig("figures/smoothing_"+parameter+"/clustermap_smoothed_mutation.png", dpi=500)
plt.savefig("figures/smoothing_"+parameter+"/clustermap_smoothed_mutation.svg")