From 5535813172d73b96772768a741c34a71a480b8e8 Mon Sep 17 00:00:00 2001 From: Saul Goodenough Date: Fri, 4 Oct 2024 10:11:22 +0800 Subject: [PATCH] Delete some files --- modify_packages.md | 78 ----- run_came/Clustering_script_human.py | 170 ----------- run_came/Clustering_script_human_3d.py | 193 ------------ run_came/Clustering_script_human_3d_16.py | 191 ------------ run_came/Clustering_script_mouse.py | 170 ----------- run_came/brain_voxel_sample.py | 268 ---------------- run_came/brain_voxel_sample_human_mouse.py | 273 ----------------- .../brain_voxel_sample_human_mouse_16_11.py | 273 ----------------- run_came/brain_voxel_sample_mouse_human.py | 273 ----------------- .../brain_voxel_sample_mouse_human_16_11.py | 273 ----------------- ...l_sample_mouse_human_16_11_no_threshold.py | 273 ----------------- ...n_voxel_sample_mouse_human_no_threshold.py | 273 ----------------- ...ample_mouse_human_no_threshold_dilution.py | 288 ------------------ ...ample_mouse_human_no_threshold_sagittal.py | 273 ----------------- run_came/pca_script_human.py | 85 ------ run_came/pca_script_mouse.py | 81 ----- run_came/run_came_plot.py | 281 ----------------- 17 files changed, 3716 deletions(-) delete mode 100644 modify_packages.md delete mode 100644 run_came/Clustering_script_human.py delete mode 100644 run_came/Clustering_script_human_3d.py delete mode 100644 run_came/Clustering_script_human_3d_16.py delete mode 100644 run_came/Clustering_script_mouse.py delete mode 100644 run_came/brain_voxel_sample.py delete mode 100644 run_came/brain_voxel_sample_human_mouse.py delete mode 100644 run_came/brain_voxel_sample_human_mouse_16_11.py delete mode 100644 run_came/brain_voxel_sample_mouse_human.py delete mode 100644 run_came/brain_voxel_sample_mouse_human_16_11.py delete mode 100644 run_came/brain_voxel_sample_mouse_human_16_11_no_threshold.py delete mode 100644 run_came/brain_voxel_sample_mouse_human_no_threshold.py delete mode 100644 run_came/brain_voxel_sample_mouse_human_no_threshold_dilution.py delete mode 100644 run_came/brain_voxel_sample_mouse_human_no_threshold_sagittal.py delete mode 100644 run_came/pca_script_human.py delete mode 100644 run_came/pca_script_mouse.py delete mode 100644 run_came/run_came_plot.py diff --git a/modify_packages.md b/modify_packages.md deleted file mode 100644 index f3dd170..0000000 --- a/modify_packages.md +++ /dev/null @@ -1,78 +0,0 @@ -# Brain Alignment - -## Data Process - -## Run Commands - -## install packages -```shell -conda create -n brainalign python=3.8 -conda activate brainalign - -pip install "scanpy[leiden]" -pip install torch # >=1.8 -pip install dgl - -pip install yacs --upgrade - -pip install xgboost - -pip install scGeneFit - -pip install colorcet -README.md -pip install imblearn - -pip install gseapy -``` - -# Modification of the other packages -## gseapy -- Add fig to gseapy.dotplot's return, i.e., change -```python - if ofname is None: - return ax - dot.fig.savefig(ofname, bbox_inches="tight", dpi=300) -``` -to: -```python - if ofname is None: - return ax - dot.fig.savefig(ofname, bbox_inches="tight", dpi=300) -``` -- Change dotplot colorbar title pad, i.e., change -```python -cbar.ax.set_title(self.cbar_title, loc="left", fontweight="bold") -``` -```python -cbar.ax.set_title(self.cbar_title, loc="left", fontweight="bold", pad=20) -``` - -- delete '\n' in dotplot colorbar title -```python -ax.legend( - handles, - labels, - title="% Genes in set", - bbox_to_anchor=(1.02, 0.9), - loc="upper left", - frameon=False, - labelspacing=1.0, - ) -``` - -## Scanpy -### UMAP -Add legend position in line 1109 -```python - if legend_loc == 'lower center': - for label in cats: - ax.scatter([], [], c=palette[label], label=label) - ax.legend( - frameon=False, - loc='lower center', - bbox_to_anchor=(0.45, -0.3), - ncol=(1 if len(cats) <= 14 else 2 if len(cats) <= 30 else 3), - fontsize=legend_fontsize, - ) -``` \ No newline at end of file diff --git a/run_came/Clustering_script_human.py b/run_came/Clustering_script_human.py deleted file mode 100644 index 350bafe..0000000 --- a/run_came/Clustering_script_human.py +++ /dev/null @@ -1,170 +0,0 @@ - - -import scanpy as sc, anndata as ad -import numpy as np -import pandas as pd -from sklearn.cluster import KMeans -from sklearn.metrics import adjusted_rand_score - - -if __name__ == "__main__": - - - path_rawdata_mouse = "./brain_human_mouse/human_brain_region_88_sparse.h5ad" - adata = sc.read_h5ad(path_rawdata_mouse) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - ''' - sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4'], return_fig=True).savefig('./figs/mouse_67/leiden.png') - sc.tl.dendrogram(adata, groupby = "leiden_0.6") - sc.pl.dendrogram(adata, groupby = "leiden_0.6", return_fig=True).savefig('./figs/mouse_67/leiden_groupby.png') - - - sc.tl.louvain(adata, key_added = "louvain_1.0") # default resolution in 1.0 - sc.tl.louvain(adata, resolution = 0.6, key_added = "louvain_0.6") - sc.tl.louvain(adata, resolution = 0.4, key_added = "louvain_0.4") - sc.tl.louvain(adata, resolution = 1.4, key_added = "louvain_1.4") - - sc.pl.umap(adata, color=['louvain_0.4', 'louvain_0.6', 'louvain_1.0','louvain_1.4'], return_fig=True).savefig('./figs/mouse_67/louvain.png') - - sc.tl.dendrogram(adata, groupby = "louvain_0.6") - sc.pl.dendrogram(adata, groupby = "louvain_0.6", return_fig=True).savefig('./figs/mouse_67/louvain_groupby.png') - ''' - - - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/human_88/pca.png') - - - - sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 20) - sc.tl.umap(adata) - - sc.pl.umap(adata, color=['region_name'], return_fig=True).savefig('./figs/human_88/umap.png') - - print(adata) - - - # extract pca coordinates - X_pca = adata.obsm['X_pca'] - - # kmeans with k=5 - kmeans = KMeans(n_clusters=5, random_state=0).fit(X_pca) - adata.obs['kmeans5'] = kmeans.labels_.astype(str) - - # kmeans with k=11 - kmeans = KMeans(n_clusters=16, random_state=0).fit(X_pca) - adata.obs['kmeans16'] = kmeans.labels_.astype(str) - - # kmeans with k=20 - kmeans = KMeans(n_clusters=30, random_state=0).fit(X_pca) - adata.obs['kmeans30'] = kmeans.labels_.astype(str) - - # kmeans with k=67 - kmeans = KMeans(n_clusters=88, random_state=0).fit(X_pca) - adata.obs['kmeans88'] = kmeans.labels_.astype(str) - - sc.pl.umap(adata, color=['kmeans5', 'kmeans16', 'kmeans30', 'kmeans88'], return_fig=True).savefig('./figs/human_88/k-means.png') - - - - - - path_rawdata_mouse = "./brain_human_mouse_16_11/human_brain_region_16_sparse.h5ad" - adata = sc.read_h5ad(path_rawdata_mouse) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/human_16/pca.png') - - - sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 20) - sc.tl.umap(adata) - - sc.pl.umap(adata, color=['region_name'], return_fig=True).savefig('./figs/human_16/umap.png') - - print(adata) - - - # extract pca coordinates - X_pca = adata.obsm['X_pca'] - - # kmeans with k=5 - kmeans = KMeans(n_clusters=5, random_state=0).fit(X_pca) - adata.obs['kmeans5'] = kmeans.labels_.astype(str) - - # kmeans with k=11 - kmeans = KMeans(n_clusters=16, random_state=0).fit(X_pca) - adata.obs['kmeans16'] = kmeans.labels_.astype(str) - - # kmeans with k=20 - kmeans = KMeans(n_clusters=30, random_state=0).fit(X_pca) - adata.obs['kmeans30'] = kmeans.labels_.astype(str) - - # kmeans with k=67 - #kmeans = KMeans(n_clusters=67, random_state=0).fit(X_pca) - #adata.obs['kmeans67'] = kmeans.labels_.astype(str) - - sc.pl.umap(adata, color=['kmeans5', 'kmeans16', 'kmeans30'], return_fig=True).savefig('./figs/human_16/k-means.png') diff --git a/run_came/Clustering_script_human_3d.py b/run_came/Clustering_script_human_3d.py deleted file mode 100644 index 590aaaa..0000000 --- a/run_came/Clustering_script_human_3d.py +++ /dev/null @@ -1,193 +0,0 @@ - - -import scanpy as sc, anndata as ad -import numpy as np -import pandas as pd -from sklearn.cluster import KMeans -from sklearn.metrics import adjusted_rand_score -import matplotlib.pyplot as plt - -import re, seaborn as sns - -from mpl_toolkits.mplot3d import Axes3D -from matplotlib.colors import ListedColormap - - - -if __name__ == "__main__": - - - path_rawdata_mouse = "./brain_human_mouse/human_brain_region_88_sparse_with3d.h5ad" - adata = sc.read_h5ad(path_rawdata_mouse) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - ''' - sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4'], return_fig=True).savefig('./figs/mouse_67/leiden.png') - sc.tl.dendrogram(adata, groupby = "leiden_0.6") - sc.pl.dendrogram(adata, groupby = "leiden_0.6", return_fig=True).savefig('./figs/mouse_67/leiden_groupby.png') - - - sc.tl.louvain(adata, key_added = "louvain_1.0") # default resolution in 1.0 - sc.tl.louvain(adata, resolution = 0.6, key_added = "louvain_0.6") - sc.tl.louvain(adata, resolution = 0.4, key_added = "louvain_0.4") - sc.tl.louvain(adata, resolution = 1.4, key_added = "louvain_1.4") - - sc.pl.umap(adata, color=['louvain_0.4', 'louvain_0.6', 'louvain_1.0','louvain_1.4'], return_fig=True).savefig('./figs/mouse_67/louvain.png') - - sc.tl.dendrogram(adata, groupby = "louvain_0.6") - sc.pl.dendrogram(adata, groupby = "louvain_0.6", return_fig=True).savefig('./figs/mouse_67/louvain_groupby.png') - ''' - - - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/human_88/pca.png') - - - - sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 20) - sc.tl.umap(adata) - - sc.pl.umap(adata, color=['region_name'], return_fig=True).savefig('./figs/human_88/umap.png') - - print(adata) - - - # extract pca coordinates - X_pca = adata.obsm['X_pca'] - - # kmeans with k=5 - kmeans = KMeans(n_clusters=5, random_state=0).fit(X_pca) - adata.obs['kmeans5'] = kmeans.labels_.astype(str) - - # kmeans with k=11 - kmeans = KMeans(n_clusters=16, random_state=0).fit(X_pca) - adata.obs['kmeans16'] = kmeans.labels_.astype(str) - - # kmeans with k=20 - kmeans = KMeans(n_clusters=30, random_state=0).fit(X_pca) - adata.obs['kmeans30'] = kmeans.labels_.astype(str) - - # kmeans with k=67 - kmeans = KMeans(n_clusters=88, random_state=0).fit(X_pca) - adata.obs['kmeans88'] = kmeans.labels_.astype(str) - - sc.pl.umap(adata, color=['kmeans5', 'kmeans16', 'kmeans30', 'kmeans88'], return_fig=True).savefig('./figs/human_88/k-means.png') - - print(adata) - #print(adata.obs['kmeans88']) - - # Create the figure - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - - # Generate the values - x_vals = adata.obs['mri_voxel_x'].values - y_vals = adata.obs['mri_voxel_y'].values - z_vals = adata.obs['mri_voxel_z'].values - color_region = adata.obs['region_name'].values - - unique_region_list = list(set(color_region)) - color_list = [] - for region in color_region: - color_list.append(unique_region_list.index(region)) - # Plot the values - #ax.scatter(x_vals, y_vals, z_vals, c = 'b', marker='o') - #ax.set_xlabel('X-axis') - #ax.set_ylabel('Y-axis') - #ax.set_zlabel('Z-axis') - - #plt.savefig('./figs/human_88/mri_xyy.png') - - - - - # axes instance - fig = plt.figure(figsize=(10,10)) - ax = Axes3D(fig, auto_add_to_figure=False) - fig.add_axes(ax) - - # get colormap from seaborn - cmap = ListedColormap(sns.color_palette(n_colors=88, as_cmap=True)) - - # plot - sc = ax.scatter(x_vals, y_vals, z_vals, s=10, c=color_list, marker='o', cmap=cmap, alpha=1) - ax.set_xlabel('MRI X') - ax.set_ylabel('MRI Y') - ax.set_zlabel('MRI Z') - - # legend - plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2) - - # save - plt.savefig("./figs/human_88/scatter_xyz_region_name.png", bbox_inches='tight') - - - - - color_list_str = list(adata.obs['kmeans88'].values) - color_list = [int(x) for x in color_list_str] - - # axes instance - fig = plt.figure(figsize=(10,10)) - ax = Axes3D(fig, auto_add_to_figure=False) - fig.add_axes(ax) - - # get colormap from seaborn - cmap = ListedColormap(sns.color_palette(n_colors=88, as_cmap=True)) - - # plot - sc = ax.scatter(x_vals, y_vals, z_vals, s=10, c=color_list, marker='o', cmap=cmap, alpha=1) - ax.set_xlabel('MRI X') - ax.set_ylabel('MRI Y') - ax.set_zlabel('MRI Z') - - # legend - plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2) - - # save - plt.savefig("./figs/human_88/scatter_xyz_kmeans_88.png", bbox_inches='tight') - - - #sc.pl.scatter(adata, x=['mri_voxel_x'], y=['mri_voxel_y'], color=['region_name'], save='./figs/human_88/mri_xy_region.png') - #sc.pl.scatter(adata, x=['mri_voxel_x'], y=['mri_voxel_z'], color=['region_name'], save='./figs/human_88/mri_xz_region.png') - #sc.pl.scatter(adata, x=['mri_voxel_y'], y=['mri_voxel_z'], color=['region_name'], save='./figs/human_88/mri_yz_region.png') - - - - - - diff --git a/run_came/Clustering_script_human_3d_16.py b/run_came/Clustering_script_human_3d_16.py deleted file mode 100644 index 65dde86..0000000 --- a/run_came/Clustering_script_human_3d_16.py +++ /dev/null @@ -1,191 +0,0 @@ - - -import scanpy as sc, anndata as ad -import numpy as np -import pandas as pd -from sklearn.cluster import KMeans -from sklearn.metrics import adjusted_rand_score -import matplotlib.pyplot as plt - -import re, seaborn as sns - -from mpl_toolkits.mplot3d import Axes3D -from matplotlib.colors import ListedColormap - - - -if __name__ == "__main__": - - - path_rawdata_mouse = "./brain_human_mouse_16_11/human_brain_region_16_sparse_with3d.h5ad" - adata = sc.read_h5ad(path_rawdata_mouse) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - ''' - sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4'], return_fig=True).savefig('./figs/mouse_67/leiden.png') - sc.tl.dendrogram(adata, groupby = "leiden_0.6") - sc.pl.dendrogram(adata, groupby = "leiden_0.6", return_fig=True).savefig('./figs/mouse_67/leiden_groupby.png') - - - sc.tl.louvain(adata, key_added = "louvain_1.0") # default resolution in 1.0 - sc.tl.louvain(adata, resolution = 0.6, key_added = "louvain_0.6") - sc.tl.louvain(adata, resolution = 0.4, key_added = "louvain_0.4") - sc.tl.louvain(adata, resolution = 1.4, key_added = "louvain_1.4") - - sc.pl.umap(adata, color=['louvain_0.4', 'louvain_0.6', 'louvain_1.0','louvain_1.4'], return_fig=True).savefig('./figs/mouse_67/louvain.png') - - sc.tl.dendrogram(adata, groupby = "louvain_0.6") - sc.pl.dendrogram(adata, groupby = "louvain_0.6", return_fig=True).savefig('./figs/mouse_67/louvain_groupby.png') - ''' - - - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/human_16/pca.png') - - - - sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 20) - sc.tl.umap(adata) - - sc.pl.umap(adata, color=['region_name'], return_fig=True).savefig('./figs/human_16/umap.png') - - print(adata) - - - # extract pca coordinates - X_pca = adata.obsm['X_pca'] - - # kmeans with k=5 - kmeans = KMeans(n_clusters=5, random_state=0).fit(X_pca) - adata.obs['kmeans5'] = kmeans.labels_.astype(str) - - # kmeans with k=11 - kmeans = KMeans(n_clusters=16, random_state=0).fit(X_pca) - adata.obs['kmeans16'] = kmeans.labels_.astype(str) - - # kmeans with k=20 - kmeans = KMeans(n_clusters=30, random_state=0).fit(X_pca) - adata.obs['kmeans30'] = kmeans.labels_.astype(str) - - - - sc.pl.umap(adata, color=['kmeans5', 'kmeans16', 'kmeans30'], return_fig=True).savefig('./figs/human_16/k-means.png') - - print(adata) - #print(adata.obs['kmeans88']) - - # Create the figure - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - - # Generate the values - x_vals = adata.obs['mri_voxel_x'].values - y_vals = adata.obs['mri_voxel_y'].values - z_vals = adata.obs['mri_voxel_z'].values - color_region = adata.obs['region_name'].values - - unique_region_list = list(set(color_region)) - color_list = [] - for region in color_region: - color_list.append(unique_region_list.index(region)) - # Plot the values - #ax.scatter(x_vals, y_vals, z_vals, c = 'b', marker='o') - #ax.set_xlabel('X-axis') - #ax.set_ylabel('Y-axis') - #ax.set_zlabel('Z-axis') - - #plt.savefig('./figs/human_88/mri_xyy.png') - - - - - # axes instance - fig = plt.figure(figsize=(10,10)) - ax = Axes3D(fig, auto_add_to_figure=False) - fig.add_axes(ax) - - # get colormap from seaborn - cmap = ListedColormap(sns.color_palette(n_colors=16, as_cmap=True)) - - # plot - sc = ax.scatter(x_vals, y_vals, z_vals, s=10, c=color_list, marker='o', cmap=cmap, alpha=1) - ax.set_xlabel('MRI X') - ax.set_ylabel('MRI Y') - ax.set_zlabel('MRI Z') - - # legend - plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2) - - # save - plt.savefig("./figs/human_16/scatter_xyz_region_name.png", bbox_inches='tight') - - - - - color_list_str = list(adata.obs['kmeans16'].values) - color_list = [int(x) for x in color_list_str] - - # axes instance - fig = plt.figure(figsize=(10,10)) - ax = Axes3D(fig, auto_add_to_figure=False) - fig.add_axes(ax) - - # get colormap from seaborn - cmap = ListedColormap(sns.color_palette(n_colors=16, as_cmap=True)) - - # plot - sc = ax.scatter(x_vals, y_vals, z_vals, s=10, c=color_list, marker='o', cmap=cmap, alpha=1) - ax.set_xlabel('MRI X') - ax.set_ylabel('MRI Y') - ax.set_zlabel('MRI Z') - - # legend - plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2) - - # save - plt.savefig("./figs/human_16/scatter_xyz_kmeans_16.png", bbox_inches='tight') - - - #sc.pl.scatter(adata, x=['mri_voxel_x'], y=['mri_voxel_y'], color=['region_name'], save='./figs/human_88/mri_xy_region.png') - #sc.pl.scatter(adata, x=['mri_voxel_x'], y=['mri_voxel_z'], color=['region_name'], save='./figs/human_88/mri_xz_region.png') - #sc.pl.scatter(adata, x=['mri_voxel_y'], y=['mri_voxel_z'], color=['region_name'], save='./figs/human_88/mri_yz_region.png') - - - - - - diff --git a/run_came/Clustering_script_mouse.py b/run_came/Clustering_script_mouse.py deleted file mode 100644 index dffb29f..0000000 --- a/run_came/Clustering_script_mouse.py +++ /dev/null @@ -1,170 +0,0 @@ - - -import scanpy as sc, anndata as ad -import numpy as np -import pandas as pd -from sklearn.cluster import KMeans -from sklearn.metrics import adjusted_rand_score - - -if __name__ == "__main__": - - - path_rawdata_mouse = "./brain_human_mouse/mouse_brain_region_67_sparse.h5ad" - adata = sc.read_h5ad(path_rawdata_mouse) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - ''' - sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4'], return_fig=True).savefig('./figs/mouse_67/leiden.png') - sc.tl.dendrogram(adata, groupby = "leiden_0.6") - sc.pl.dendrogram(adata, groupby = "leiden_0.6", return_fig=True).savefig('./figs/mouse_67/leiden_groupby.png') - - - sc.tl.louvain(adata, key_added = "louvain_1.0") # default resolution in 1.0 - sc.tl.louvain(adata, resolution = 0.6, key_added = "louvain_0.6") - sc.tl.louvain(adata, resolution = 0.4, key_added = "louvain_0.4") - sc.tl.louvain(adata, resolution = 1.4, key_added = "louvain_1.4") - - sc.pl.umap(adata, color=['louvain_0.4', 'louvain_0.6', 'louvain_1.0','louvain_1.4'], return_fig=True).savefig('./figs/mouse_67/louvain.png') - - sc.tl.dendrogram(adata, groupby = "louvain_0.6") - sc.pl.dendrogram(adata, groupby = "louvain_0.6", return_fig=True).savefig('./figs/mouse_67/louvain_groupby.png') - ''' - - - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/mouse_67/pca.png') - - - - sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 20) - sc.tl.umap(adata) - - sc.pl.umap(adata, color=['region_name'], return_fig=True).savefig('./figs/mouse_67/umap.png') - - print(adata) - - - # extract pca coordinates - X_pca = adata.obsm['X_pca'] - - # kmeans with k=5 - kmeans = KMeans(n_clusters=5, random_state=0).fit(X_pca) - adata.obs['kmeans5'] = kmeans.labels_.astype(str) - - # kmeans with k=11 - kmeans = KMeans(n_clusters=11, random_state=0).fit(X_pca) - adata.obs['kmeans11'] = kmeans.labels_.astype(str) - - # kmeans with k=20 - kmeans = KMeans(n_clusters=20, random_state=0).fit(X_pca) - adata.obs['kmeans20'] = kmeans.labels_.astype(str) - - # kmeans with k=67 - kmeans = KMeans(n_clusters=67, random_state=0).fit(X_pca) - adata.obs['kmeans67'] = kmeans.labels_.astype(str) - - sc.pl.umap(adata, color=['kmeans5', 'kmeans11', 'kmeans20', 'kmeans67'], return_fig=True).savefig('./figs/mouse_67/k-means.png') - - - - - - path_rawdata_mouse = "./brain_human_mouse_16_11/mouse_brain_region_11_sparse.h5ad" - adata = sc.read_h5ad(path_rawdata_mouse) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/mouse_11/pca.png') - - - sc.pp.neighbors(adata, n_pcs = 30, n_neighbors = 20) - sc.tl.umap(adata) - - sc.pl.umap(adata, color=['region_name'], return_fig=True).savefig('./figs/mouse_11/umap.png') - - print(adata) - - - # extract pca coordinates - X_pca = adata.obsm['X_pca'] - - # kmeans with k=5 - kmeans = KMeans(n_clusters=5, random_state=0).fit(X_pca) - adata.obs['kmeans5'] = kmeans.labels_.astype(str) - - # kmeans with k=11 - kmeans = KMeans(n_clusters=11, random_state=0).fit(X_pca) - adata.obs['kmeans11'] = kmeans.labels_.astype(str) - - # kmeans with k=20 - kmeans = KMeans(n_clusters=20, random_state=0).fit(X_pca) - adata.obs['kmeans20'] = kmeans.labels_.astype(str) - - # kmeans with k=67 - #kmeans = KMeans(n_clusters=67, random_state=0).fit(X_pca) - #adata.obs['kmeans67'] = kmeans.labels_.astype(str) - - sc.pl.umap(adata, color=['kmeans5', 'kmeans11', 'kmeans20'], return_fig=True).savefig('./figs/mouse_11/k-means.png') diff --git a/run_came/brain_voxel_sample.py b/run_came/brain_voxel_sample.py deleted file mode 100644 index 2511482..0000000 --- a/run_came/brain_voxel_sample.py +++ /dev/null @@ -1,268 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_human', 'Baron_mouse') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = './brain_human_mouse/human_brain_region_88_imputed.h5ad' - path_rawdata2 = './brain_human_mouse/mouse_brain_region_67.h5ad' - - path_varmap = './came/sample_data/gene_matches_human2mouse.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_human2mouse.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_human_mouse/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsnames}-{time_tag}' # set by user - figdir = resdir + '/figs' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=3) - sc.pp.filter_genes(adata_raw2, min_cells=3) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 300 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 4096 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 50 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - - ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir / f'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir / 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir / 'obs.csv') - adt.write(resdir / 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - f_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir / 'adt_hidden_gene1.h5ad') - gadt2.write(resdir / 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir / f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir / f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'human', 'mouse' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir / f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir / 'abs_graph.pickle') - - - - - diff --git a/run_came/brain_voxel_sample_human_mouse.py b/run_came/brain_voxel_sample_human_mouse.py deleted file mode 100644 index c272571..0000000 --- a/run_came/brain_voxel_sample_human_mouse.py +++ /dev/null @@ -1,273 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_human', 'Baron_mouse') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = './brain_human_mouse/human_brain_region_88_sparse.h5ad' - path_rawdata2 = './brain_human_mouse/mouse_brain_region_67_sparse.h5ad' - - path_varmap = './came/sample_data/gene_matches_human2mouse.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_human2mouse.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_human_mouse/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsnames}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=3) - sc.pp.filter_genes(adata_raw2, min_cells=3) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 300 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 2048 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 50 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'human', 'mouse' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - - - - diff --git a/run_came/brain_voxel_sample_human_mouse_16_11.py b/run_came/brain_voxel_sample_human_mouse_16_11.py deleted file mode 100644 index ba30c9e..0000000 --- a/run_came/brain_voxel_sample_human_mouse_16_11.py +++ /dev/null @@ -1,273 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_human', 'Baron_mouse') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = './brain_human_mouse_16_11/human_brain_region_16_sparse.h5ad' - path_rawdata2 = './brain_human_mouse_16_11/mouse_brain_region_11_sparse.h5ad' - - path_varmap = './came/sample_data/gene_matches_human2mouse.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_human2mouse.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_human_mouse_16_11/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsnames}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=3) - sc.pp.filter_genes(adata_raw2, min_cells=3) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 300 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 2048 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 50 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'human', 'mouse' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - - - - diff --git a/run_came/brain_voxel_sample_mouse_human.py b/run_came/brain_voxel_sample_mouse_human.py deleted file mode 100644 index 7bcbfd0..0000000 --- a/run_came/brain_voxel_sample_mouse_human.py +++ /dev/null @@ -1,273 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_mouse', 'Baron_human') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = './brain_human_mouse/mouse_brain_region_67_sparse.h5ad' - path_rawdata2 = './brain_human_mouse/human_brain_region_88_sparse.h5ad' - - path_varmap = './came/sample_data/gene_matches_mouse2human.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_mouse2human.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_human_mouse/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsnames}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=3) - sc.pp.filter_genes(adata_raw2, min_cells=3) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 400 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 4096 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 50 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'mouse', 'human' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - - - - diff --git a/run_came/brain_voxel_sample_mouse_human_16_11.py b/run_came/brain_voxel_sample_mouse_human_16_11.py deleted file mode 100644 index aeca354..0000000 --- a/run_came/brain_voxel_sample_mouse_human_16_11.py +++ /dev/null @@ -1,273 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_mouse', 'Baron_human') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = './brain_human_mouse_16_11/mouse_brain_region_11_sparse.h5ad' - path_rawdata2 = './brain_human_mouse_16_11/human_brain_region_16_sparse.h5ad' - - path_varmap = './came/sample_data/gene_matches_mouse2human.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_mouse2human.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_human_mouse_16_11/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsnames}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=3) - sc.pp.filter_genes(adata_raw2, min_cells=3) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 300 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 4096 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 50 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'human', 'mouse' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - - - - diff --git a/run_came/brain_voxel_sample_mouse_human_16_11_no_threshold.py b/run_came/brain_voxel_sample_mouse_human_16_11_no_threshold.py deleted file mode 100644 index eb3dc24..0000000 --- a/run_came/brain_voxel_sample_mouse_human_16_11_no_threshold.py +++ /dev/null @@ -1,273 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_mouse', 'Baron_human') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = '../Brain_ST_human_mouse/data/mouse_brain_region_11_sparse_no_threshold.h5ad' - path_rawdata2 = './brain_human_mouse_16_11/human_brain_region_16_sparse.h5ad' - - path_varmap = './came/sample_data/gene_matches_mouse2human.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_mouse2human.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_human_mouse_16_11/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsnames}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=3) - sc.pp.filter_genes(adata_raw2, min_cells=3) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 300 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 4096 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 50 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'human', 'mouse' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - - - - diff --git a/run_came/brain_voxel_sample_mouse_human_no_threshold.py b/run_came/brain_voxel_sample_mouse_human_no_threshold.py deleted file mode 100644 index 8d1fbf8..0000000 --- a/run_came/brain_voxel_sample_mouse_human_no_threshold.py +++ /dev/null @@ -1,273 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_mouse', 'Baron_human') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = '../Brain_ST_human_mouse/data/mouse_brain_region_67_sparse_no_threshold.h5ad' - path_rawdata2 = './brain_human_mouse/human_brain_region_88_sparse.h5ad' - - path_varmap = './came/sample_data/gene_matches_mouse2human.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_mouse2human.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_mouse_human_no_threshold_500/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsnames}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=1) - sc.pp.filter_genes(adata_raw2, min_cells=1) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 400 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 4096 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 70 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'mouse', 'human' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - - - - diff --git a/run_came/brain_voxel_sample_mouse_human_no_threshold_dilution.py b/run_came/brain_voxel_sample_mouse_human_no_threshold_dilution.py deleted file mode 100644 index 099ef09..0000000 --- a/run_came/brain_voxel_sample_mouse_human_no_threshold_dilution.py +++ /dev/null @@ -1,288 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2022 - -@author: Biao Zhang -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch -from heco_utils import threshold_quantile - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_mouse', 'Baron_human') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = '../Brain_ST_human_mouse/data/mouse_brain_region_67_sparse_no_threshold.h5ad' - path_rawdata2 = './brain_human_mouse/human_brain_region_88_sparse.h5ad' - - path_varmap = './came/sample_data/gene_matches_mouse2human.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_mouse2human.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_mouse_human_no_threshold_sparse/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsn1}-{dsn2}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=1) - sc.pp.filter_genes(adata_raw2, min_cells=1) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 300 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 4096 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 70 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - print('came_results:', came_inputs) - - print('Mouse features: ', came_inputs['adatas'][0]) - X = came_inputs['adatas'][0].X.toarray() - print(type(X)) - came_inputs['adatas'][0].X = sparse.csr_matrix(threshold_quantile(X, quantile_gene=0.8, quantile_sample=0.99)) - - print('Human features: ', came_inputs['adatas'][1]) - X = came_inputs['adatas'][1].X.toarray() - print(type(X)) - came_inputs['adatas'][1].X = sparse.csr_matrix(threshold_quantile(X, quantile_gene=0.8, quantile_sample=0.99)) - - - - - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'mouse', 'human' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - - diff --git a/run_came/brain_voxel_sample_mouse_human_no_threshold_sagittal.py b/run_came/brain_voxel_sample_mouse_human_no_threshold_sagittal.py deleted file mode 100644 index 5ce6f9a..0000000 --- a/run_came/brain_voxel_sample_mouse_human_no_threshold_sagittal.py +++ /dev/null @@ -1,273 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -if __name__ == '__main__': - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_mouse', 'Baron_human') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = '../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad' - path_rawdata2 = './brain_human_mouse/human_brain_region_88_sparse.h5ad' - - path_varmap = './came/sample_data/gene_matches_mouse2human.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_mouse2human.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = '/home1/zhangbiao/CAME/brain_mouse_human_sagittal_1000/' - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsn1}-{dsn2}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=1) - sc.pp.filter_genes(adata_raw2, min_cells=1) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 400 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 4096 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 70 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'mouse', 'human' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - - - - diff --git a/run_came/pca_script_human.py b/run_came/pca_script_human.py deleted file mode 100644 index 02cdba3..0000000 --- a/run_came/pca_script_human.py +++ /dev/null @@ -1,85 +0,0 @@ - - -import scanpy as sc, anndata as ad -import numpy as np -import pandas as pd - - -if __name__ == "__main__": - - - path_rawdata_human = "./brain_human_mouse/human_brain_region_88_sparse.h5ad" - adata = sc.read_h5ad(path_rawdata_human) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/human_88/pca.png') - - - - - - - path_rawdata_human = "./brain_human_mouse_16_11/human_brain_region_16_sparse.h5ad" - adata = sc.read_h5ad(path_rawdata_human) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/human_16/pca.png') - diff --git a/run_came/pca_script_mouse.py b/run_came/pca_script_mouse.py deleted file mode 100644 index 622b5b7..0000000 --- a/run_came/pca_script_mouse.py +++ /dev/null @@ -1,81 +0,0 @@ - - -import scanpy as sc, anndata as ad -import numpy as np -import pandas as pd - - -if __name__ == "__main__": - - - path_rawdata_mouse = "./brain_human_mouse/mouse_brain_region_67_sparse.h5ad" - adata = sc.read_h5ad(path_rawdata_mouse) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/mouse_67/pca.png') - - - path_rawdata_mouse = "./brain_human_mouse_16_11/mouse_brain_region_11_sparse.h5ad" - adata = sc.read_h5ad(path_rawdata_mouse) - - sc.settings.verbosity = 3 - #sc.settings.set_figure_params(dpi=80) - - - #sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0 - #sc.tl.leiden(adata, resolution = 0.6, key_added = "leiden_0.6") - #sc.tl.leiden(adata, resolution = 0.4, key_added = "leiden_0.4") - #sc.tl.leiden(adata, resolution = 1.4, key_added = "leiden_1.4") - - #sc.pl.umap(adata, color=['leiden_0.4', 'leiden_0.6', 'leiden_1.0','leiden_1.4']) - - #sc.tl.dendrogram(adata, groupby = "leiden_0.6") - #sc.pl.dendrogram(adata, groupby = "leiden_0.6") - - - #sc.pp.log1p(adata) - # store normalized counts in the raw slot, - # we will subset adata.X for variable genes, but want to keep all genes matrix as well. - #adata.raw = adata - #adata.X = pd.df(adata.X).fillna(0) - #sc.pp.highly_variable_genes(adata, min_mean=0.001, max_mean=3, min_disp=0.1) - #print("Highly variable genes: %d"%sum(adata.var.highly_variable)) - - #plot variable genes - #sc.pl.highly_variable_genes(adata) - - # subset for variable genes in the dataset - #adata = adata[:, adata.var['highly_variable']] - - sc.tl.pca(adata, svd_solver='arpack') - sc.pl.pca(adata, color='region_name', components = ['1,2','3,4','5,6','7,8'], ncols=2, return_fig=True).savefig('./figs/mouse_11/pca.png') - diff --git a/run_came/run_came_plot.py b/run_came/run_came_plot.py deleted file mode 100644 index 4ee3915..0000000 --- a/run_came/run_came_plot.py +++ /dev/null @@ -1,281 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Wed May 5 20:08:52 2021 - -@author: Xingyan Liu -""" -import came -from came import pipeline, pp, pl - -import logging -import os -import sys -from pathlib import Path - -import numpy as np -import pandas as pd -from matplotlib import pyplot as plt # optional -import seaborn as sns # optional - -import scanpy as sc -from scipy import sparse - -import networkx as nx -import torch - - -try: - import matplotlib as mpl - mpl.use('agg') -except Exception as e: - print(f"An error occurred when setting matplotlib backend ({e})") - -def run_came(path_rawdata1 = '../Brain_ST_human_mouse/data/mouse_brain_region_67_sagittal.h5ad', - path_rawdata2 = './brain_human_mouse/human_brain_region_88_sparse.h5ad', - ROOT_path = '/home1/zhangbiao/CAME/brain_mouse_human_sagittal/'): - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s %(filename)s-%(lineno)d-%(funcName)s(): ' - '%(levelname)s\n %(message)s') - - #came.__test1__(10, batch_size=None, reverse=False) - #came.__test2__(10, batch_size=2048) - dsnames = ('Baron_mouse', 'Baron_human') # the dataset names, set by user - dsn1, dsn2 = dsnames - - path_rawdata1 = path_rawdata1 - path_rawdata2 = path_rawdata2 - - path_varmap = './came/sample_data/gene_matches_mouse2human.csv' - path_varmap_1v1 = './came/sample_data/gene_matches_1v1_mouse2human.csv' - - # ========= load data ========= - df_varmap = pd.read_csv(path_varmap) - df_varmap_1v1 = pd.read_csv(path_varmap_1v1) if path_varmap_1v1 else came.pp.take_1v1_matches(df_varmap) - - adata_raw1 = sc.read_h5ad(path_rawdata1) - adata_raw2 = sc.read_h5ad(path_rawdata2) - adatas = [adata_raw1, adata_raw2] - - key_class1 = 'region_name' # set by user - key_class2 = 'region_name' # set by user - - ROOT = ROOT_path - time_tag = came.make_nowtime_tag() - resdir = ROOT + f'{dsn1}-{dsn2}-{time_tag}/' # set by user - figdir = resdir + '/figs/' - came.check_dirs(figdir) # check and make the directory - - sc.pp.filter_genes(adata_raw1, min_cells=1) - sc.pp.filter_genes(adata_raw2, min_cells=1) - - # Inspect classes - if key_class2 is not None: - group_counts_ori = pd.concat([ - pd.value_counts(adata_raw1.obs[key_class1]), - pd.value_counts(adata_raw2.obs[key_class2]), - ], axis=1, keys=dsnames) - else: - group_counts_ori = pd.value_counts(adata_raw1.obs[key_class1]) - - print(group_counts_ori) - - - # The numer of training epochs - # (a recommended setting is 200-400 for whole-graph training, and 80-200 for sub-graph training) - n_epochs = 400 - - # The training batch size - # When the GPU memory is limited, set 4096 or more if possible. - batch_size = 4096 - - # The number of epochs to skip for checkpoint backup - n_pass = 100 - - # Whether to use the single-cell networks - use_scnets = True - - # The number of top DEGs to take as the node-features of each cells. - # You set it 70-100 for distant species pairs. - ntop_deg = 70 - - # The number of top DEGs to take as the graph nodes, which can be directly displayed on the UMAP plot. - ntop_deg_nodes = 50 - # The source of the node genes; use both DEGs and HVGs by default - node_source = 'deg,hvg' - - # Whether to take into account the non-1v1 variables as the node features. - keep_non1v1_feats = True - - came_inputs, (adata1, adata2) = pipeline.preprocess_unaligned( - adatas, - key_class=key_class1, - use_scnets=use_scnets, - ntop_deg=ntop_deg, - ntop_deg_nodes=ntop_deg_nodes, - node_source=node_source, - ) - - outputs = pipeline.main_for_unaligned( - **came_inputs, - df_varmap=df_varmap, - df_varmap_1v1=df_varmap_1v1, - dataset_names=dsnames, - key_class1=key_class1, - key_class2=key_class2, - do_normalize=True, - keep_non1v1_feats=keep_non1v1_feats, - n_epochs=n_epochs, - resdir=resdir, - n_pass=n_pass, - batch_size=batch_size, - plot_results=True, - device_id = '1', - ) - - dpair = outputs['dpair'] - trainer = outputs['trainer'] - h_dict = outputs['h_dict'] - out_cell = outputs['out_cell'] - predictor = outputs['predictor'] - - - obs_ids1, obs_ids2 = dpair.obs_ids1, dpair.obs_ids2 - obs = dpair.obs - classes = predictor.classes - - # plot figures - y_true = obs['celltype'][obs_ids2].values - y_pred = obs['predicted'][obs_ids2].values - print(y_true.shape) - print(y_pred.shape) - print(type(y_true[0])) - print(type(y_pred[0])) - - #ax, contmat = pl.plot_contingency_mat(y_true, y_pred, norm_axis=1, order_rows=False, order_cols=False,) - #pl._save_with_adjust(ax.figure, figdir / 'contingency_mat.png') - #ax.figure - - name_label = 'celltype' - cols_anno = ['celltype', 'predicted'][:] - df_probs = obs[list(classes)] - - - gs = pl.wrapper_heatmap_scores(df_probs.iloc[obs_ids2], obs.iloc[obs_ids2], ignore_index=True, - col_label='celltype', col_pred='predicted', - n_subsample=50, # sampling 50 cells for each group - cmap_heat='magma_r', - fp=figdir + 'heatmap_probas.pdf') - - gs.figure - - - - # further analysis - hidden_list = came.load_hidden_states(resdir + 'hidden_list.h5') - #hidden_list # a list of dicts - h_dict = hidden_list[-1] - # the last layer of hidden states - - adt = pp.make_adata(h_dict['cell'], obs=dpair.obs, assparse=False, ignore_index=True) - gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - - # UMAP of cell embeddings - sc.set_figure_params(dpi_save=200) - - sc.pp.neighbors(adt, n_neighbors=15, metric='cosine', use_rep='X') - sc.tl.umap(adt) - # sc.pl.umap(adt, color=['dataset', 'celltype'], ncols=1) - - ftype = ['.svg', ''][1] - sc.pl.umap(adt, color='dataset', save=f'-dataset{ftype}') - sc.pl.umap(adt, color='celltype', save=f'-ctype{ftype}') - - obs_umap = adt.obsm['X_umap'] - obs['UMAP1'] = obs_umap[:, 0] - obs['UMAP2'] = obs_umap[:, 1] - obs.to_csv(resdir + 'obs.csv') - adt.write(resdir + 'adt_hidden_cell.h5ad') - - adata1.obsm['X_umap'] = obs_umap[obs_ids1] - adata2.obsm['X_umap'] = obs_umap[obs_ids2] - - # Umap of genes - sc.pp.neighbors(gadt, n_neighbors=15, metric='cosine', use_rep='X') - - # gadt = pp.make_adata(h_dict['gene'], obs=dpair.var.iloc[:, :2], assparse=False, ignore_index=True) - sc.tl.umap(gadt) - sc.pl.umap(gadt, color='dataset', ) - - # joint gene module extraction - sc.tl.leiden(gadt, resolution=.8, key_added='module') - sc.pl.umap(gadt, color='module', ncols=1, palette='tab20b') - - # gadt.obs_names = gadt.obs_names.astype(str) - gadt1, gadt2 = pp.bisplit_adata(gadt, 'dataset', dsnames[0], reset_index_by='name') - - color_by = 'module' - palette = 'tab20b' - sc.pl.umap(gadt1, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - sc.pl.umap(gadt2, color=color_by, s=10, edges=True, edges_width=0.05, - palette=palette, - save=f'_{color_by}-{dsnames[0]}') - - df_var_links = came.weight_linked_vars( - gadt.X, dpair._vv_adj, names=dpair.get_vnode_names(), - matric='cosine', index_names=dsnames,) - - gadt1.write(resdir + 'adt_hidden_gene1.h5ad') - gadt2.write(resdir + 'adt_hidden_gene2.h5ad') - - # Gene-expression-profiles (for each cell type) on gene UMAP - # averaged expressions - avg_expr1 = pp.group_mean_adata(adatas[0], groupby=key_class1,features=dpair.vnode_names1, use_raw=True) - avg_expr2 = pp.group_mean_adata(adatas[1], groupby=key_class2 if key_class2 else 'predicted',features=dpair.vnode_names2, use_raw=True) - - plkwds = dict(cmap='RdYlBu_r', vmax=2.5, vmin=-1.5, do_zscore=True, - axscale=3, ncols=5, with_cbar=True) - - fig1, axs1 = pl.adata_embed_with_values(gadt1, avg_expr1, fp=figdir + f'umap_exprAvgs-{dsnames[0]}-all.png', **plkwds) - fig2, axs2 = pl.adata_embed_with_values(gadt2, avg_expr2, fp=figdir + f'umap_exprAvgs-{dsnames[1]}-all.png', **plkwds) - - # Abstracted graph - norm_ov = ['max', 'zs', None][1] - cut_ov = 0. - - groupby_var = 'module' - obs_labels1, obs_labels2 = adt.obs['celltype'][dpair.obs_ids1], \ - adt.obs['celltype'][dpair.obs_ids2] - var_labels1, var_labels2 = gadt1.obs[groupby_var], gadt2.obs[groupby_var] - - sp1, sp2 = 'mouse', 'human' - g = came.make_abstracted_graph( - obs_labels1, obs_labels2, - var_labels1, var_labels2, - avg_expr1, avg_expr2, - df_var_links, - tags_obs=(f'{sp1} ', f'{sp2} '), - tags_var=(f'{sp1} module ', f'{sp2} module '), - cut_ov=cut_ov, - norm_mtd_ov=norm_ov,) - - #visualization - fp_abs = figdir + f'abstracted_graph-{groupby_var}-cut{cut_ov}-{norm_ov}.pdf' - ax = pl.plot_multipartite_graph(g, edge_scale=10, figsize=(9, 7.5), alpha=0.5, fp=fp_abs) # nodelist=nodelist, - - ax.figure - - came.save_pickle(g, resdir + 'abs_graph.pickle') - - return resdir - - - - - - - - -