-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutils.py
162 lines (143 loc) · 5.86 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""A collection of utility functions (mostly file reading/writing) used
in the PathCORE-T analysis scripts"""
import os
import pandas as pd
def load_expression_dataset(path_to_file):
"""
Parameters
-----------
path_to_file : str
The path to the expression dataset.
Currently expects a tab-delimited file with row and column names,
where the rows are the genes and the columns are the samples.
Returns
-----------
pandas.DataFrame
"""
expression_dataset = pd.read_table(path_to_file, header=0)
index_on = expression_dataset.columns[0]
expression_dataset.set_index(index_on, inplace=True)
expression_dataset = expression_dataset[
-expression_dataset.index.str.contains('?', regex=False)]
expression_dataset = expression_dataset.sort_index()
return expression_dataset
def load_pathway_definitions(path_to_file, shorten_pathway_names=None):
"""
Parameters
-----------
path_to_file : str
The path to the pathway definitions file.
Currently expects a tab-delimited file with no header and 3 columns:
pathway, N (number of genes), gene1;gene2;...geneN
shorten_pathway_names : function(str) (default=None)
A function that accepts a pathway name (str) and returns the shortened
pathway name (str). See `shorten_pathway_names.py` in `constants`.
Returns
-----------
dict(str -> set(str)), A dictionary of pathway definitions,
where a pathway (key) is mapped to a set of genes (value)
"""
pathway_definitions = pd.read_table(
path_to_file, header=None,
names=["pw", "size", "genes"])
pathway_definitions["genes"] = pathway_definitions["genes"].map(
lambda x: x.split(";"))
pathway_definitions.set_index("pw", inplace=True)
pathway_definitions_map = {}
for index, row in pathway_definitions.iterrows():
if shorten_pathway_names:
index = shorten_pathway_names(index)
pathway_definitions_map[index] = set(row["genes"])
return pathway_definitions_map
def load_significant_pathways_file(path_to_file, shorten_pathway_names=None):
"""
Parameters
-----------
path_to_file : str
The path to the significant pathways file, generated from
`pathcore_network_from_model_features.py`
Currently expects a tab-delimited file with a header containing
columns [feature, side, pathway] at minimum.
shorten_pathway_names : function(str) (default=None)
A function that accepts a pathway name (str) and returns the shortened
pathway name (str). See `shorten_pathway_names.py` in `constants`.
Returns
-----------
pandas.DataFrame
"""
significant_pathways = pd.read_table(
path_to_file, header=0, usecols=["feature", "side", "pathway"])
if shorten_pathway_names:
significant_pathways["pathway"] = \
significant_pathways["pathway"].apply(
lambda x: shorten_pathway_names(x))
significant_pathways = significant_pathways.sort_values(
by=["feature", "side"])
return significant_pathways
def load_weight_matrix(path_to_file, n_features,
n_genes=None, path_to_genes_file=None):
"""
Parameters
-----------
path_to_file : str
The path to the tab-delimited weight matrix file, generated by
applying a feature construction algorithm to a compendium of
gene expression data.
The weight matrix should have shape [n, k] where n is the number of
genes and k the number of features.
See `generate_nmf_model.py` for an example. (Additional examples:
the PCA loadings matrix, ICA unmixing matrix, ADAGE/eADAGE weight
matrix)
- Features must contain the full set of genes in the compendium
- Genes must have been assigned weights that quantify their
contribution to a given feature
n_features : int
The number of features constructed.
n_genes : int (default=None)
The number of genes in the compendium. Used to handle the case where
a weight matrix file contains additional information after the
weight matrix dataframe (occurs in the eADAGE models used for the
paper).
path_to_genes_file : str (default=None)
A file of gene identifiers, 1 identifier per line. Used to handle the
case where a weight matrix file does not already specify the gene
identifiers as row names.
Returns
-----------
pandas.DataFrame
"""
read_csv_args = {
"header": None,
"sep": "\t",
"index_col": 0 if not path_to_genes_file else None
}
if n_genes:
read_csv_args["nrows"] = n_genes
weight_matrix = _weight_matrix_file_skip_rows(
path_to_file, n_features, read_csv_args)
if path_to_genes_file:
genes_df = pd.read_csv(
path_to_genes_file, header=None, names=["genes"])
weight_matrix = pd.concat([weight_matrix, genes_df], axis=1)
weight_matrix.set_index("genes", inplace=True)
else:
weight_matrix.index.rename("genes", inplace=True)
weight_matrix.columns = list(range(n_features))
return weight_matrix
def _weight_matrix_file_skip_rows(path_to_file, n_features, kwargs):
"""Helper function to read the weight matrix file. Used to detect the
start of the weight matrix file based on the number of features (=> number
of columns) expected.
"""
if os.stat(path_to_file).st_size == 0:
raise ValueError("{0} is empty.".format(path_to_file))
with open(path_to_file) as file:
pos = 0
current_line = file.readline()
data = current_line.split("\t")
while len(data) < n_features:
pos = file.tell()
current_line = file.readline()
data = current_line.split("\t")
file.seek(pos)
return pd.read_csv(file, **kwargs)