"""Compress single cell data using embedding neighborhoods."""
import gc
import numpy as np
import pandas as pd
import scanpy as sc


def _compress_neighborhood(
    adata,
    obs_bio,
    celltype_column,
    additional_groupby_columns,
    measurement_type="gene_expression",
    max_cells_per_type=300,
    avg_neighborhoods=3,
):
    """Compress the local neighborhood of each cell type.

    `obs_bio` must carry one row per combination of `celltype_column` and
    `additional_groupby_columns`, plus a "cell_count" column with the number
    of cells in that combination.
    """
    # Try something easy first, like k-means
    from sklearn.cluster import KMeans
    from scipy.spatial import ConvexHull

    features = adata.var_names
    groupby_columns = [celltype_column] + list(additional_groupby_columns)

    # Subsample with some regard for cell typing.
    # NOTE: this can be debated, but it is also designed to get rid of
    # "outlier cells", which are in a sense the opposite of an approximation.
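    # For illustration (hypothetical values): with
    # additional_groupby_columns=["tissue"], the `obs_bio` table iterated
    # below would look like:
    #
    #     cell_type  tissue  cell_count
    #     T cell     lung           512
    #     B cell     lung            87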
    cell_ids = []
    idx = np.zeros(adata.n_obs, bool)
    for _, row in obs_bio.iterrows():
        # Reconstruct the boolean mask of focal cells for this group
        idx[:] = True
        for col in groupby_columns:
            idx &= (adata.obs[col] == row[col]).values
        cell_ids_ct = adata.obs_names[idx]
        ncell = row["cell_count"]
        if ncell > max_cells_per_type:
            idx_rand = np.random.choice(
                ncell, size=max_cells_per_type, replace=False
            )
            cell_ids_ct = cell_ids_ct[idx_rand]
        cell_ids.extend(list(cell_ids_ct))
    adata = adata[cell_ids].copy()
    ##############################################
    # USE AN EXISTING EMBEDDING OR MAKE A NEW ONE
    emb_keys = ["umap", "tsne"]
    for emb_key in emb_keys:
        if f"X_{emb_key}" in adata.obsm:
            break
    else:
        # No precomputed embedding found, so build a UMAP from scratch
        emb_key = "umap"
        # Log-transform
        sc.pp.log1p(adata)
        # Select features
        sc.pp.highly_variable_genes(adata)
        adata.raw = adata
        adata = adata[:, adata.var.highly_variable]
        # Create embedding, a proxy for cell states broadly
        sc.tl.pca(adata)
        sc.pp.neighbors(adata)
        sc.tl.umap(adata)
        points = adata.obsm[f"X_{emb_key}"]
        # Back to all features for storage
        adata = adata.raw.to_adata()
        adata.obsm[f"X_{emb_key}"] = points
        # Undo the log1p, back to cptt (counts per ten thousand) or
        # equivalent for storage
        adata.X.data = np.expm1(adata.X.data)
    ##############################################
    points = adata.obsm[f"X_{emb_key}"]

    # Do a global clustering, ensuring at least 3 cells
    # in each cluster so that convex hulls can be built
    if len(groupby_columns) > 1:
        row_order = pd.MultiIndex.from_frame(obs_bio[groupby_columns])
    else:
        row_order = pd.Index(obs_bio[groupby_columns[0]])
    for n_clusters in range(avg_neighborhoods * len(obs_bio), 1, -1):
        kmeans = KMeans(
            n_clusters=n_clusters,
            random_state=0,
            n_init="auto",
        ).fit(points)
        labels = kmeans.labels_
        # Bookkeep how many cells of each type are in each cluster
        tmp = adata.obs[groupby_columns].copy()
        tmp["kmeans"] = labels
        ncells_per_label = (
            tmp.groupby(["kmeans"] + groupby_columns, observed=False)
            .size()
            .unstack(0, fill_value=0)
            .T
        )
        del tmp
        # Ensure the order is the same as the averages
        ncells_per_label = ncells_per_label.loc[:, row_order]
        if ncells_per_label.sum(axis=1).min() >= 3:
            break
    else:
        raise ValueError("Cannot cluster neighborhoods")
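    # `ncells_per_label` now has one row per k-means cluster and one column
    # per cell group, in the same order as `obs_bio`; each entry counts the
    # cells of that group that fell into that cluster.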
    n_neis = kmeans.n_clusters
    nei_avg = pd.DataFrame(
        np.zeros((len(features), n_neis), np.float32),
        index=features,
    )
    nei_coords = pd.DataFrame(
        np.zeros((2, n_neis), np.float32),
        index=["x", "y"],
    )
    convex_hulls = []
    if measurement_type == "gene_expression":
        nei_frac = pd.DataFrame(
            np.zeros((len(features), n_neis), np.float32),
            index=features,
        )
    for i in range(kmeans.n_clusters):
        idx = kmeans.labels_ == i
        # Add the average expression
        nei_avg.iloc[:, i] = np.asarray(adata.X[idx].mean(axis=0))[0]
        # Add the fraction of cells expressing
        if measurement_type == "gene_expression":
            nei_frac.iloc[:, i] = np.asarray((adata.X[idx] > 0).mean(axis=0))[0]
        # Add the coordinates of the cluster center
        points_i = points[idx]
        nei_coords.iloc[:, i] = points_i.mean(axis=0)
        # Add the convex hull of the cluster in the embedding
        hull = ConvexHull(points_i)
        convex_hulls.append(points_i[hull.vertices])

    # TODO: Approximate a vector field for velocity in this embedding, if
    # available. If not, we cannot recount the matrix ourselves, but if the
    # layers are present we could recompute it and splash it onto our
    # embedding, then recycle the arrow-flow algorithm from matplotlib to
    # approximate it.

    # Clean up
    del adata
    gc.collect()

    nei_avg.columns = ncells_per_label.index
    nei_coords.columns = ncells_per_label.index
    if measurement_type == "gene_expression":
        nei_frac.columns = ncells_per_label.index

    neid = {
        "kind": "neighborhood",
        "obs_names": nei_avg.index.values,
        "cell_count": ncells_per_label.values,
        "coords_centroid": nei_coords.values,
        "convex_hull": convex_hulls,
        "Xave": nei_avg.values,
    }
    if measurement_type == "gene_expression":
        neid["Xfrac"] = nei_frac.values

    return neid
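

# ---------------------------------------------------------------------------
# Minimal usage sketch (hypothetical; not part of the original module): build
# a toy sparse AnnData plus the matching `obs_bio` summary table and compress
# it. All names and numbers below are illustrative assumptions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import anndata
    from scipy.sparse import csr_matrix

    rng = np.random.default_rng(0)
    n_cells, n_genes = 400, 200
    # Toy sparse count matrix; sparse because the code above uses `adata.X.data`
    X = csr_matrix(rng.poisson(1.0, size=(n_cells, n_genes)).astype(np.float32))
    adata = anndata.AnnData(X=X)
    adata.var_names = [f"gene_{i}" for i in range(n_genes)]
    adata.obs["cell_type"] = rng.choice(["T cell", "B cell"], size=n_cells)

    # One row per cell type with its cell count, the shape `obs_bio` expects
    obs_bio = (
        adata.obs.groupby("cell_type", observed=False)
        .size()
        .rename("cell_count")
        .reset_index()
    )

    neid = _compress_neighborhood(
        adata,
        obs_bio,
        celltype_column="cell_type",
        additional_groupby_columns=[],
    )
    print(neid["kind"], neid["Xave"].shape, len(neid["convex_hull"]))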