Skip to content

Commit 197b82e

Browse files
Merge pull request #76 from volkamerlab/customize-matrix
Move FingerprintDistanceGenerator.*_matrix methods to matrix module
2 parents 95f61ae + fb42cab commit 197b82e

File tree

3 files changed

+175
-82
lines changed

3 files changed

+175
-82
lines changed

kissim/comparison/fingerprint_distance_generator.py

Lines changed: 5 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@
88
import logging
99

1010
from tqdm.auto import tqdm
11-
import numpy as np
12-
import pandas as pd
1311

1412
from kissim.comparison import BaseGenerator, FingerprintDistance, FeatureDistancesGenerator
13+
from kissim.comparison import matrix
1514
from kissim.comparison.utils import format_weights
1615

1716
logger = logging.getLogger(__name__)
@@ -25,7 +24,7 @@ class FingerprintDistanceGenerator(BaseGenerator):
2524
Attributes
2625
----------
2726
data : pandas.DataFrame
28-
Fingerprint distance and bit coverag for each structure pair (kinase pair).
27+
Fingerprint distance and bit coverage for each structure pair (kinase pair).
2928
structure_kinase_ids : list of list
3029
Structure and kinase IDs for structures in dataset.
3130
"""
@@ -214,23 +213,7 @@ def structure_distance_matrix(self, coverage_min=0.0):
214213
Structure distance matrix.
215214
"""
216215

217-
# Filter by coverage
218-
data = self.data[self.data["bit_coverage"] >= coverage_min]
219-
# Data for upper half of the matrix
220-
pairs_upper = data[["structure.1", "structure.2", "distance"]]
221-
# Data for lower half of the matrix
222-
pairs_lower = pairs_upper.rename(
223-
columns={"structure.1": "structure.2", "structure.2": "structure.1"}
224-
)
225-
226-
# Concatenate upper and lower matrix data
227-
pairs = pd.concat([pairs_upper, pairs_lower]).sort_values(["structure.1", "structure.2"])
228-
# Convert to matrix
229-
matrix = pairs.pivot(columns="structure.2", index="structure.1", values="distance")
230-
# Matrix diagonal is NaN > set to 0.0
231-
np.fill_diagonal(matrix.values, 0)
232-
233-
return matrix
216+
return matrix.structure_distance_matrix(self.data, coverage_min)
234217

235218
def kinase_distance_matrix(self, by="minimum", fill_diagonal=True, coverage_min=0.0):
236219
"""
@@ -257,36 +240,7 @@ def kinase_distance_matrix(self, by="minimum", fill_diagonal=True, coverage_min=
257240
Kinase distance matrix.
258241
"""
259242

260-
if by == "size":
261-
fill_diagonal = False
262-
263-
# Data for upper half of the matrix
264-
pairs_upper = self.kinase_distances(by, coverage_min).reset_index()[
265-
["kinase.1", "kinase.2", "distance"]
266-
]
267-
# Data for lower half of the matrix
268-
pairs_lower = pairs_upper.rename(columns={"kinase.1": "kinase.2", "kinase.2": "kinase.1"})
269-
270-
# Concatenate upper and lower matrix data
271-
pairs = (
272-
pd.concat([pairs_upper, pairs_lower])
273-
.sort_values(["kinase.1", "kinase.2"])
274-
.drop_duplicates()
275-
.reset_index(drop=True)
276-
)
277-
278-
# Convert to matrix
279-
matrix = pairs.pivot(columns="kinase.2", index="kinase.1", values="distance")
280-
281-
if fill_diagonal:
282-
np.fill_diagonal(matrix.values, 0)
283-
284-
# If matrix contains number of structure pairs: NaN > 0, cast to int
285-
if by == "size":
286-
matrix = matrix.fillna(0)
287-
matrix = matrix.astype("int64")
288-
289-
return matrix
243+
return matrix.kinase_distance_matrix(self.data, by, fill_diagonal, coverage_min)
290244

291245
def kinase_distances(self, by="minimum", coverage_min=0.0):
292246
"""
@@ -307,35 +261,4 @@ def kinase_distances(self, by="minimum", coverage_min=0.0):
307261
Fingerprint distance and coverage for kinase pairs.
308262
"""
309263

310-
# Filter by coverage
311-
data = self.data[self.data["bit_coverage"] >= coverage_min].reset_index()
312-
# Group by kinase names
313-
structure_distances_grouped_by_kinases = data.groupby(
314-
by=["kinase.1", "kinase.2"], sort=False
315-
)
316-
317-
# Get distance values per kinase pair based on given condition
318-
# Note: For min/max we'd like to know which structure pairs were selected!
319-
by_terms = "minimum maximum mean median size std".split()
320-
321-
if by == "minimum":
322-
kinase_distances = data.iloc[
323-
structure_distances_grouped_by_kinases["distance"].idxmin()
324-
].set_index(["kinase.1", "kinase.2"])
325-
elif by == "maximum":
326-
kinase_distances = data.iloc[
327-
structure_distances_grouped_by_kinases["distance"].idxmax()
328-
].set_index(["kinase.1", "kinase.2"])
329-
elif by == "mean":
330-
kinase_distances = structure_distances_grouped_by_kinases.mean()[["distance"]]
331-
elif by == "median":
332-
kinase_distances = structure_distances_grouped_by_kinases.median()[["distance"]]
333-
elif by == "size":
334-
kinase_distances = structure_distances_grouped_by_kinases.size().to_frame("distance")
335-
elif by == "std":
336-
kinase_distances = structure_distances_grouped_by_kinases.std()[["distance"]]
337-
kinase_distances = round(kinase_distances, 3)
338-
else:
339-
raise ValueError(f'Condition "by" unknown. Choose from: {", ".join(by_terms)}')
340-
341-
return kinase_distances
264+
return matrix.kinase_distances(self.data, by, coverage_min)

kissim/comparison/matrix.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
"""
2+
Calculates structure distance matrices and kinase distance matrices.
3+
"""
4+
5+
import numpy as np
6+
import pandas as pd
7+
8+
9+
def structure_distance_matrix(structure_distances, coverage_min=0.0):
    """
    Get fingerprint distances for all structure pairs in the form of a matrix (DataFrame).

    Parameters
    ----------
    structure_distances : pandas.DataFrame
        Fingerprint distance and bit coverage for each structure pair (kinase pair).
        The columns "structure.1", "structure.2", "distance" and "bit_coverage" are read.
    coverage_min : float
        Returns only pairs with a user-defined minimum coverage (defaults to 0.0, i.e. no
        coverage restrictions).

    Returns
    -------
    pandas.DataFrame
        Symmetric structure distance matrix (index: "structure.1", columns: "structure.2").
        The diagonal is set to 0; pairs without a distance (e.g. removed by the coverage
        filter) are NaN.
    """

    # Filter by coverage
    data = structure_distances[structure_distances["bit_coverage"] >= coverage_min]

    # Data for upper half of the matrix
    pairs_upper = data[["structure.1", "structure.2", "distance"]]
    # Data for lower half of the matrix (same pairs with swapped structure columns)
    pairs_lower = pairs_upper.rename(
        columns={"structure.1": "structure.2", "structure.2": "structure.1"}
    )

    # Concatenate upper and lower matrix data.
    # Self-pairs (structure.1 == structure.2) are identical in both halves; drop the
    # duplicates so that they do not make the pivot fail (same as for the kinase matrix).
    pairs = (
        pd.concat([pairs_upper, pairs_lower])
        .sort_values(["structure.1", "structure.2"])
        .drop_duplicates()
    )

    # Convert to matrix
    matrix = pairs.pivot(columns="structure.2", index="structure.1", values="distance")

    # Matrix diagonal is NaN > set to 0.0
    # Work on an explicit copy of the values: `DataFrame.values` may be a read-only view
    # (pandas Copy-on-Write), in which case an in-place fill would raise.
    values = matrix.to_numpy().copy()
    np.fill_diagonal(values, 0)

    return pd.DataFrame(values, index=matrix.index, columns=matrix.columns)
48+
49+
50+
def kinase_distance_matrix(
    structure_distances, by="minimum", fill_diagonal=True, coverage_min=0.0
):
    """
    Extract per kinase pair one distance value from the set of structure pair distance values
    and return these fingerprint distances for all kinase pairs in the form of a matrix
    (DataFrame).

    Parameters
    ----------
    structure_distances : pandas.DataFrame
        Fingerprint distance and bit coverage for each structure pair (kinase pair).
    by : str
        Condition on which the distance value per kinase pair is extracted from the set of
        distances values per structure pair. Default: Minimum distance value.
    fill_diagonal : bool
        Fill diagonal with 0 (same kinase has distance of 0) by default. If `False`, the
        diagonal holds the experimental values calculated based on the structure pairs per
        kinase pair. Is always treated as `False` if `by="size"`.
    coverage_min : float
        Returns only pairs with a user-defined minimum coverage (defaults to 0.0, i.e. no
        coverage restrictions).

    Returns
    -------
    pandas.DataFrame
        Kinase distance matrix (index: "kinase.1", columns: "kinase.2"). If `by="size"`,
        the matrix contains the number of structure pairs per kinase pair as integers.
    """

    # The diagonal of a "size" matrix holds structure pair counts; never overwrite them
    if by == "size":
        fill_diagonal = False

    # Data for upper half of the matrix
    pairs_upper = kinase_distances(structure_distances, by, coverage_min).reset_index()[
        ["kinase.1", "kinase.2", "distance"]
    ]
    # Data for lower half of the matrix (same pairs with swapped kinase columns)
    pairs_lower = pairs_upper.rename(columns={"kinase.1": "kinase.2", "kinase.2": "kinase.1"})

    # Concatenate upper and lower matrix data
    # Same-kinase pairs appear identically in both halves > drop duplicates before pivoting
    pairs = (
        pd.concat([pairs_upper, pairs_lower])
        .sort_values(["kinase.1", "kinase.2"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # Convert to matrix
    matrix = pairs.pivot(columns="kinase.2", index="kinase.1", values="distance")

    if fill_diagonal:
        # Work on an explicit copy of the values: `DataFrame.values` may be a read-only
        # view (pandas Copy-on-Write), in which case an in-place fill would raise.
        values = matrix.to_numpy().copy()
        np.fill_diagonal(values, 0)
        matrix = pd.DataFrame(values, index=matrix.index, columns=matrix.columns)

    # If matrix contains number of structure pairs: NaN > 0, cast to int
    if by == "size":
        matrix = matrix.fillna(0).astype("int64")

    return matrix
109+
110+
111+
def kinase_distances(structure_distances, by="minimum", coverage_min=0.0):
    """
    Extract per kinase pair one distance value from the set of structure pair distance values.

    Parameters
    ----------
    structure_distances : pandas.DataFrame
        Fingerprint distance and bit coverage for each structure pair (kinase pair).
        The columns "kinase.1", "kinase.2", "distance" and "bit_coverage" are read.
    by : str
        Condition on which the distance value per kinase pair is extracted from the set of
        distances values per structure pair. Default: Minimum distance value.
        One of: minimum, maximum, mean, median, size, std.
    coverage_min : float
        Returns only pairs with a user-defined minimum coverage (defaults to 0.0, i.e. no
        coverage restrictions).

    Returns
    -------
    pandas.DataFrame
        Fingerprint distance for kinase pairs, indexed by ("kinase.1", "kinase.2").
        For `by="minimum"` and `by="maximum"`, the full row of the selected structure pair
        is returned (including its coverage); otherwise only the "distance" column.

    Raises
    ------
    ValueError
        If `by` is not one of the supported conditions.
    """

    # Filter by coverage
    # Resetting the index makes row labels equal to row positions (needed for min/max below)
    data = structure_distances[structure_distances["bit_coverage"] >= coverage_min].reset_index()

    # Group by kinase names; only the distance column is aggregated.
    # Selecting the column before aggregating avoids errors on (and needless work for)
    # unrelated columns, e.g. non-numeric structure identifiers.
    distances_grouped_by_kinases = data.groupby(by=["kinase.1", "kinase.2"], sort=False)[
        "distance"
    ]

    # Get distance values per kinase pair based on given condition
    # Note: For min/max we'd like to know which structure pairs were selected!
    by_terms = "minimum maximum mean median size std".split()

    if by == "minimum":
        # idxmin returns row labels > select the complete rows of the selected pairs
        result = data.loc[distances_grouped_by_kinases.idxmin()].set_index(
            ["kinase.1", "kinase.2"]
        )
    elif by == "maximum":
        result = data.loc[distances_grouped_by_kinases.idxmax()].set_index(
            ["kinase.1", "kinase.2"]
        )
    elif by == "mean":
        result = distances_grouped_by_kinases.mean().to_frame("distance")
    elif by == "median":
        result = distances_grouped_by_kinases.median().to_frame("distance")
    elif by == "size":
        result = distances_grouped_by_kinases.size().to_frame("distance")
    elif by == "std":
        result = distances_grouped_by_kinases.std().to_frame("distance").round(3)
    else:
        raise ValueError(f'Condition "by" unknown. Choose from: {", ".join(by_terms)}')

    return result

kissim/comparison/tree.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,13 @@ def from_distance_matrix(
122122
# Curate diagonal - set to 0
123123
np.fill_diagonal(distance_matrix.values, 0)
124124

125+
# If matrix contains missing values, respective rows and columns must be dropped
126+
column_has_missing_values = distance_matrix.isna().any()
127+
column_names_with_missing_values = column_has_missing_values[column_has_missing_values].index
128+
distance_matrix = distance_matrix.drop(column_names_with_missing_values, axis=0).drop(
129+
column_names_with_missing_values, axis=1
130+
)
131+
125132
# Hierarchical clustering
126133
logger.info(
127134
f"Clustering (method: {clustering_method}) and "

0 commit comments

Comments
 (0)