# -----------------------------------------------------------------------------
# Reconstructed from a whitespace-mangled `git format-patch` email:
#   commit  3bfc53289e9f16a0cd113fa9815d20750bc3e768
#   author  longyahui <60100581+longyahui@users.noreply.github.com>
#   date    Wed, 3 May 2023 17:37:08 +0800
#   subject [PATCH] Add files via upload
# The patch creates six files (inits.py, main.py, model.py, preprocess.py,
# projection.py, train.py; 758 insertions). They are reproduced below,
# separated by "=== file: <name> ===" banners, with these fixes applied:
#   * train.py: `self.adj_spatialomics2` typo meant the omics2 spatial
#     adjacency was never symmetrised; fixed to `self.adj_spatial_omics2`.
#   * model.py (AttentionLayer): deprecated `F.tanh` -> `torch.tanh`;
#     `F.softmax` given an explicit `dim=1` (matches the old implicit
#     behaviour for a 2-D input) instead of the deprecated implicit dim.
#   * projection.py: the final debug print now prints the *shape* of the
#     second integrated matrix instead of dumping the whole matrix.
#   * inits.py (fix_seed): duplicate PYTHONHASHSEED assignment removed.
#   * Non-English comments translated to English.
# -----------------------------------------------------------------------------

# ============================= file: inits.py ================================
import os
import torch
import random
import time
import numpy as np
import numpy
import pandas as pd
import scanpy as sc
import pickle
import scipy.sparse as sp
from torch.backends import cudnn
from preprocess import read_data
import pandas as pd


def load_data(args):
    """Load data, including ST and ATAC data.

    Delegates to :func:`preprocess.read_data`; returns its dict of AnnData
    objects (keys 'adata_omics1' and 'adata_omics2').
    """
    # read data
    data = read_data(args)

    print('Data reading finished!')
    return data


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)


# ====== Graph preprocessing
def preprocess_graph(adj):
    """Symmetrically normalize `adj + I` (D^-1/2 (A+I) D^-1/2) and return it
    as a torch sparse tensor. Adding the identity guarantees every row sum
    is >= 1, so the power(-0.5) below cannot divide by zero."""
    adj = sp.coo_matrix(adj)
    adj_ = adj + sp.eye(adj.shape[0])
    rowsum = np.array(adj_.sum(1))
    degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
    adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
    return sparse_mx_to_torch_sparse_tensor(adj_normalized)


def get_edge_index(matrix):
    """Collect the (row, col) indices of non-zero entries of a dense matrix
    as a 2 x n_edges LongTensor (COO edge-index format)."""
    edge_index = [[], []]
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            if matrix[i][j] != 0:
                edge_index[0].append(i)
                edge_index[1].append(j)
    return torch.LongTensor(edge_index)  # convert the nested list to a tensor


def fix_seed(seed):
    """Seed python/numpy/torch RNGs and force deterministic cuDNN behaviour."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    cudnn.deterministic = True
    cudnn.benchmark = False
    # required for deterministic cuBLAS matmuls on CUDA >= 10.2
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'


def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.  # isolated nodes: 0 degree -> 0 weight
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    adj = adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt)
    return adj.toarray()


def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
    adj_normalized = normalize_adj(adj) + np.eye(adj.shape[0])
    return adj_normalized


# ============================= file: main.py =================================
import os
import torch
import argparse
import warnings
import time
from train import Train
from inits import load_data, fix_seed
from utils import UMAP, plot_weight_value
import pickle

warnings.filterwarnings("ignore")
os.environ['R_HOME'] = '/scbio4/tools/R/R-4.0.3_openblas/R-4.0.3'

parser = argparse.ArgumentParser(description='PyTorch implementation of spatial multi-omics data integration')
parser.add_argument('--learning_rate', type=float, default=0.0001, help='Initial learning rate.')  # 0.0001
parser.add_argument('--epochs', type=int, default=1500, help='Number of epochs to train.')  # 1500 Mouse Brain #1000 SPOTS_spleen_rep1 #700 Thymus
parser.add_argument('--weight_decay', type=float, default=0.0000, help='Weight for L2 loss on embedding matrix.')  # 5e-4
parser.add_argument('--datatype', type=str, default='SPOTS', help='Data type.')
parser.add_argument('--input', type=str, default='/home/yahui/anaconda3/work/SpatialGlue_omics/data/', help='Input path.')
parser.add_argument('--output', type=str, default='/home/yahui/anaconda3/work/SpatialGlue_omics/output/', help='output path.')
parser.add_argument('--random_seed', type=int, default=2022, help='Random seed')  # 50
parser.add_argument('--dim_input', type=int, default=3000, help='Dimension of input features')  # 100
parser.add_argument('--dim_output', type=int, default=64, help='Dimension of output features')  # 64
parser.add_argument('--n_neighbors', type=int, default=6, help='Number of sampling neighbors')  # 6
parser.add_argument('--n_clusters', type=int, default=9, help='Number of clustering')  # mouse brain 15 thymus 9 spleen 5
args = parser.parse_args()

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
t = time.time()
fix_seed(args.random_seed)

args.dataset = 'SPOTS_spleen_rep1'

# dataset-specific overrides of cluster count / epoch budget
if args.datatype == 'Stereo-CITE-seq':
    args.n_clusters = 8
    args.epochs = 1500
elif args.datatype == 'Spatial-ATAC-RNA-seq':
    args.n_clusters = 15
    args.epochs = 1500
elif args.datatype == 'SPOTS':
    args.n_clusters = 6
    args.epochs = 900

print('>>>>>>>>>>>>>>>>> {} <<<<<<<<<<<<<<<<'.format(args.dataset))

data = load_data(args)
adata_omics1, adata_omics2 = data['adata_omics1'], data['adata_omics2']

# start to train the model
trainer = Train(args, device, data)
emb_omics1, emb_omics2, emb_combined, alpha = trainer.train()
print('time:', time.time() - t)

# store per-modality, combined embeddings and attention weights on the AnnData
adata_omics1.obsm['emb'] = emb_omics1
adata_omics2.obsm['emb'] = emb_omics2
adata_omics1.obsm['emb_combined'] = emb_combined
adata_omics2.obsm['emb_combined'] = emb_combined
adata_omics1.obsm['alpha'] = alpha

# umap
adata_combined = UMAP(adata_omics1, adata_omics2, args)


# ============================= file: model.py ================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import numpy as np
#from torch_geometric.nn import GCNConv, GATConv


class Encoder_omics(Module):
    """Dual-modality GCN auto-encoder with attention-based fusion.

    Each modality has its own GCN encoder/decoder; spatial-graph and
    feature-graph embeddings are fused within each modality, then across
    modalities, by AttentionLayer instances. A shared discriminator scores
    the two latent spaces for adversarial alignment.
    """

    def __init__(self, dim_in_feat_omics1, dim_out_feat_omics1, dim_in_feat_omics2, dim_out_feat_omics2, dropout=0.0, act=F.relu):
        super(Encoder_omics, self).__init__()
        self.dim_in_feat_omics1 = dim_in_feat_omics1
        self.dim_in_feat_omics2 = dim_in_feat_omics2
        self.dim_out_feat_omics1 = dim_out_feat_omics1
        self.dim_out_feat_omics2 = dim_out_feat_omics2
        self.dropout = dropout
        self.act = act

        self.encoder_omics1 = Encoder(self.dim_in_feat_omics1, self.dim_out_feat_omics1)
        self.decoder_omics1 = Decoder(self.dim_out_feat_omics1, self.dim_in_feat_omics1)
        self.encoder_omics2 = Encoder(self.dim_in_feat_omics2, self.dim_out_feat_omics2)
        self.decoder_omics2 = Decoder(self.dim_out_feat_omics2, self.dim_in_feat_omics2)

        self.atten_omics1 = AttentionLayer(self.dim_out_feat_omics1, self.dim_out_feat_omics1)
        self.atten_omics2 = AttentionLayer(self.dim_out_feat_omics2, self.dim_out_feat_omics2)
        self.atten_cross = AttentionLayer(self.dim_out_feat_omics1, self.dim_out_feat_omics2)

        self.discriminator = Discriminator(self.dim_out_feat_omics1)

    def forward(self, features_omics1, features_omics2, adj_spatial_omics1, adj_feature_omics1, adj_spatial_omics2, adj_feature_omics2):
        """Run both modalities through both graphs and return a dict of
        latent embeddings, reconstructions, cross-reconstructions,
        attention weights and discriminator scores."""
        # graph1: spatial neighbour graphs
        emb_latent_spatial_omics1 = self.encoder_omics1(features_omics1, adj_spatial_omics1)
        emb_latent_spatial_omics2 = self.encoder_omics2(features_omics2, adj_spatial_omics2)

        # graph2: feature-similarity graphs
        emb_latent_feature_omics1 = self.encoder_omics1(features_omics1, adj_feature_omics1)
        emb_latent_feature_omics2 = self.encoder_omics2(features_omics2, adj_feature_omics2)

        # within-modality attention aggregation
        emb_latent_omics1, alpha_omics1 = self.atten_omics1(emb_latent_spatial_omics1, emb_latent_feature_omics1)
        emb_latent_omics2, alpha_omics2 = self.atten_omics2(emb_latent_spatial_omics2, emb_latent_feature_omics2)

        # between-modality attention aggregation
        emb_latent_combined, alpha_omics_1_2 = self.atten_cross(emb_latent_omics1, emb_latent_omics2)

        # reconstruct expression matrix using two modality-specific decoders, respectively
        emb_recon_omics1 = self.decoder_omics1(emb_latent_combined, adj_spatial_omics1)
        emb_recon_omics2 = self.decoder_omics2(emb_latent_combined, adj_spatial_omics2)

        # cycle: decode one modality's latent with the *other* modality's
        # decoder, then re-encode — consistent encoding (dim=64)
        emb_latent_omics1_across_recon = self.encoder_omics2(self.decoder_omics2(emb_latent_omics1, adj_spatial_omics2), adj_spatial_omics2)
        emb_latent_omics2_across_recon = self.encoder_omics1(self.decoder_omics1(emb_latent_omics2, adj_spatial_omics1), adj_spatial_omics1)

        # adversarial scores for latent-space alignment
        score_omics1 = self.discriminator(emb_latent_omics1)
        score_omics2 = self.discriminator(emb_latent_omics2)
        score_omics1 = torch.squeeze(score_omics1, dim=1)
        score_omics2 = torch.squeeze(score_omics2, dim=1)

        results = {'emb_latent_omics1': emb_latent_omics1,
                   'emb_latent_omics2': emb_latent_omics2,
                   'emb_latent_combined': emb_latent_combined,
                   'emb_recon_omics1': emb_recon_omics1,
                   'emb_recon_omics2': emb_recon_omics2,
                   'emb_latent_omics1_across_recon': emb_latent_omics1_across_recon,
                   'emb_latent_omics2_across_recon': emb_latent_omics2_across_recon,
                   'alpha': alpha_omics_1_2,
                   'score_omics1': score_omics1,
                   'score_omics2': score_omics2}

        return results


class Encoder(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """

    def __init__(self, in_feat, out_feat, dropout=0.0, act=F.relu):
        super(Encoder, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.dropout = dropout
        self.act = act

        self.weight = Parameter(torch.FloatTensor(self.in_feat, self.out_feat))

        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.weight)

    def forward(self, feat, adj):
        # X' = A (X W); adj may be a torch sparse tensor, hence spmm
        x = torch.mm(feat, self.weight)
        x = torch.spmm(adj, x)

        return x


class Decoder(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """

    def __init__(self, in_feat, out_feat, dropout=0.0, act=F.relu):
        super(Decoder, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.dropout = dropout
        self.act = act

        self.weight = Parameter(torch.FloatTensor(self.in_feat, self.out_feat))

        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.weight)

    def forward(self, feat, adj):
        x = torch.mm(feat, self.weight)
        x = torch.spmm(adj, x)

        return x


class Discriminator(nn.Module):
    """Latent space discriminator"""

    def __init__(self, dim_input, n_hidden=50, n_out=1):
        super(Discriminator, self).__init__()
        self.dim_input = dim_input
        self.n_hidden = n_hidden
        self.n_out = n_out

        self.net = nn.Sequential(
            nn.Linear(dim_input, n_hidden),
            nn.LeakyReLU(inplace=True),
            nn.Linear(n_hidden, 2 * n_hidden),
            nn.LeakyReLU(inplace=True),
            nn.Linear(2 * n_hidden, n_out),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.net(x)


class AttentionLayer(Module):
    """Soft attention over two embeddings of the same set of cells.

    Stacks emb1/emb2 into (n, 2, d), scores each view with a small
    tanh-projection + context vector, and returns the alpha-weighted
    combination (n, d) together with the weights alpha (n, 2).
    """

    def __init__(self, in_feat, out_feat, dropout=0.0, act=F.relu):
        super(AttentionLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat

        self.w_omega = Parameter(torch.FloatTensor(in_feat, out_feat))
        self.u_omega = Parameter(torch.FloatTensor(out_feat, 1))

        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self.w_omega)
        torch.nn.init.xavier_uniform_(self.u_omega)

    def forward(self, emb1, emb2):
        emb = []
        emb.append(torch.unsqueeze(torch.squeeze(emb1), dim=1))
        emb.append(torch.unsqueeze(torch.squeeze(emb2), dim=1))
        self.emb = torch.cat(emb, dim=1)  # (n, 2, d)

        # F.tanh is deprecated -> torch.tanh; explicit dim=1 matches the
        # previous implicit-softmax behaviour on the 2-D (n, 2) scores
        self.v = torch.tanh(torch.matmul(self.emb, self.w_omega))
        self.vu = torch.matmul(self.v, self.u_omega)
        self.alpha = F.softmax(torch.squeeze(self.vu) + 1e-6, dim=1)

        emb_combined = torch.matmul(torch.transpose(self.emb, 1, 2), torch.unsqueeze(self.alpha, -1))

        return torch.squeeze(emb_combined), self.alpha


# =========================== file: preprocess.py =============================
import scipy
import anndata
import sklearn
import numpy as np
import scanpy as sc
import pandas as pd
import pickle
from typing import Optional, Union
from scipy.sparse import coo_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import kneighbors_graph


def read_data(args):
    """
    Reading adata
    """
    # read adata
    ## mouse spleen (SPOTS)
    adata_omics1 = sc.read_h5ad('/home/yahui/anaconda3/work/SpatialGlue_omics/data/SPOTS/Mouse_Spleen/Replicate1/adata_RNA.h5ad')
    adata_omics2 = sc.read_h5ad('/home/yahui/anaconda3/work/SpatialGlue_omics/data/SPOTS/Mouse_Spleen/Replicate1/adata_Pro.h5ad')

    ## mouse brain (Spatial-ATAC-RNA-seq)
    #adata_omics1 = sc.read_h5ad('/home/yahui/anaconda3/work/SpatialGlue_omics/data/Rong_Fan/Mouse_Brain/Mouse_Brain_P22/Mouse_Brain_P22/adata_RNA.h5ad')
    #adata_omics2 = sc.read_h5ad('/home/yahui/anaconda3/work/SpatialGlue_omics/data/Rong_Fan/Mouse_Brain/Mouse_Brain_P22/Mouse_Brain_P22/adata_peaks_normalized.h5ad')

    # make feature names unique
    adata_omics1.var_names_make_unique()
    adata_omics2.var_names_make_unique()

    # data pre-processing
    if args.datatype == 'SPOTS':  # mouse spleen
        # RNA
        sc.pp.filter_genes(adata_omics1, min_cells=10)
        sc.pp.highly_variable_genes(adata_omics1, flavor="seurat_v3", n_top_genes=3000)
        sc.pp.normalize_total(adata_omics1, target_sum=1e4)
        sc.pp.log1p(adata_omics1)
        sc.pp.pca(adata_omics1, use_highly_variable=True, n_comps=22)  # 22
        adata_omics1.obsm['feat'] = adata_omics1.obsm['X_pca'].copy()

        # Protein
        adata_omics2 = clr_normalize_each_cell(adata_omics2)
        sc.pp.pca(adata_omics2, n_comps=20)
        adata_omics2.obsm['feat'] = adata_omics2.obsm['X_pca'].copy()

    elif args.datatype == 'Stereo-CITE-seq':
        # RNA
        sc.pp.filter_genes(adata_omics1, min_cells=10)
        sc.pp.filter_cells(adata_omics1, min_genes=80)

        sc.pp.highly_variable_genes(adata_omics1, flavor="seurat_v3", n_top_genes=3000)
        sc.pp.normalize_total(adata_omics1, target_sum=1e4)
        sc.pp.log1p(adata_omics1)
        sc.pp.pca(adata_omics1, use_highly_variable=True, n_comps=50)
        adata_omics1.obsm['feat'] = adata_omics1.obsm['X_pca'].copy()

        # Protein
        sc.pp.filter_genes(adata_omics2, min_cells=50)
        proteins_to_remove = ['Mouse-IgD', 'Mouse-CD140a']
        proteins_to_keep = ~adata_omics2.var_names.isin(proteins_to_remove)
        adata_omics2 = adata_omics2[:, proteins_to_keep]

        # keep only cells surviving the RNA filters
        adata_omics2 = adata_omics2[adata_omics1.obs_names].copy()

        adata_omics2 = clr_normalize_each_cell(adata_omics2)
        sc.pp.pca(adata_omics2, n_comps=50)  # 50
        adata_omics2.obsm['feat'] = adata_omics2.obsm['X_pca'].copy()

    elif args.datatype == 'Spatial-ATAC-RNA-seq':
        # RNA
        sc.pp.filter_genes(adata_omics1, min_cells=10)
        sc.pp.filter_cells(adata_omics1, min_genes=200)

        sc.pp.highly_variable_genes(adata_omics1, flavor="seurat_v3", n_top_genes=3000)
        sc.pp.normalize_total(adata_omics1, target_sum=1e4)
        sc.pp.log1p(adata_omics1)
        sc.pp.pca(adata_omics1, use_highly_variable=True, n_comps=50)
        adata_omics1.obsm['feat'] = adata_omics1.obsm['X_pca'].copy()

        # ATAC
        adata_omics2 = adata_omics2[adata_omics1.obs_names].copy()  # .obsm['X_lsi'] represents the dimension reduced feature
        adata_omics2.obsm['feat'] = adata_omics2.obsm['X_lsi'].copy()

    # construct cell neighbor graphs
    ################# spatial graph #################
    # omics1
    cell_position_omics1 = adata_omics1.obsm['spatial']
    adj_omics1 = build_network(args, cell_position_omics1)
    adata_omics1.uns['adj_spatial'] = adj_omics1

    # omics2
    cell_position_omics2 = adata_omics2.obsm['spatial']
    adj_omics2 = build_network(args, cell_position_omics2)
    adata_omics2.uns['adj_spatial'] = adj_omics2

    ################# feature graph #################
    feature_graph_omics1, feature_graph_omics2 = construct_graph_by_feature(adata_omics1, adata_omics2)
    adata_omics1.obsm['adj_feature'], adata_omics2.obsm['adj_feature'] = feature_graph_omics1, feature_graph_omics2

    data = {'adata_omics1': adata_omics1, 'adata_omics2': adata_omics2}

    return data


def clr_normalize_each_cell(adata, inplace=True):
    """Normalize count vector for each cell, i.e. for each row of .X"""

    import numpy as np
    import scipy

    def seurat_clr(x):
        # TODO: support sparseness
        s = np.sum(np.log1p(x[x > 0]))
        exp = np.exp(s / len(x))
        return np.log1p(x / exp)

    if not inplace:
        adata = adata.copy()

    # apply to dense or sparse matrix, along axis. returns dense matrix
    adata.X = np.apply_along_axis(
        seurat_clr, 1, (adata.X.A if scipy.sparse.issparse(adata.X) else np.array(adata.X))
    )
    return adata


def construct_graph_by_feature(adata_omics1, adata_omics2, k=20, mode="connectivity", metric="correlation", include_self=False):
    """Build kNN graphs (sparse matrices) in feature space for both modalities."""
    feature_graph_omics1 = kneighbors_graph(adata_omics1.obsm['feat'], k, mode=mode, metric=metric, include_self=include_self)
    feature_graph_omics2 = kneighbors_graph(adata_omics2.obsm['feat'], k, mode=mode, metric=metric, include_self=include_self)

    return feature_graph_omics1, feature_graph_omics2


def build_network(args, cell_position):
    """kNN graph over spatial coordinates as an edge-list DataFrame
    with columns x (source), y (neighbour) and value (=1)."""
    nbrs = NearestNeighbors(n_neighbors=args.n_neighbors + 1).fit(cell_position)
    _, indices = nbrs.kneighbors(cell_position)
    x = indices[:, 0].repeat(args.n_neighbors)  # each spot repeated per neighbour
    y = indices[:, 1:].flatten()                # drop self (column 0)
    adj = pd.DataFrame(columns=['x', 'y', 'value'])
    adj['x'] = x
    adj['y'] = y
    adj['value'] = np.ones(x.size)
    return adj


def construct_graph(adjacent):
    """Convert the edge-list DataFrame from build_network into a square
    scipy COO adjacency matrix."""
    n_spot = adjacent['x'].max() + 1
    adj = coo_matrix((adjacent['value'], (adjacent['x'], adjacent['y'])), shape=(n_spot, n_spot))
    return adj


def lsi(
        adata: anndata.AnnData, n_components: int = 20,
        use_highly_variable: Optional[bool] = None, **kwargs
) -> None:
    r"""
    LSI analysis (following the Seurat v3 approach)
    """
    if use_highly_variable is None:
        use_highly_variable = "highly_variable" in adata.var
    adata_use = adata[:, adata.var["highly_variable"]] if use_highly_variable else adata
    X = tfidf(adata_use.X)
    #X = adata_use.X
    X_norm = sklearn.preprocessing.Normalizer(norm="l1").fit_transform(X)
    X_norm = np.log1p(X_norm * 1e4)
    X_lsi = sklearn.utils.extmath.randomized_svd(X_norm, n_components, **kwargs)[0]
    X_lsi -= X_lsi.mean(axis=1, keepdims=True)
    X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True)
    # first component mostly tracks sequencing depth — drop it
    adata.obsm["X_lsi"] = X_lsi[:, 1:]


def tfidf(X):
    r"""
    TF-IDF normalization (following the Seurat v3 approach)
    """
    idf = X.shape[0] / X.sum(axis=0)
    if scipy.sparse.issparse(X):
        tf = X.multiply(1 / X.sum(axis=1))
        return tf.multiply(idf)
    else:
        tf = X / X.sum(axis=1, keepdims=True)
        return tf * idf


# =========================== file: projection.py =============================
import numpy as np
from sklearn.neighbors import kneighbors_graph
from numpy import linalg as la


def project_func(data, Gc, n_neighbors=10, output_dim=30, Lambda=1.0):  # [1919, 50], [1989, 50]
    """Project the datasets in `data` into a shared `output_dim`-dimensional
    space using the cross-dataset correspondence matrices `Gc` (manifold
    alignment via a generalized eigenproblem / SVD). Returns a list of
    projected arrays, one per dataset."""
    n_datasets = len(data)  # 2
    H0 = []
    L = []
    for i in range(n_datasets - 1):
        Gc[i] = Gc[i] * np.shape(data[i])[0]

    # graph Laplacian of each dataset's kNN (distance) graph, symmetrised
    for i in range(n_datasets):
        graph_data = kneighbors_graph(data[i], n_neighbors, mode="distance")
        graph_data = graph_data + graph_data.T.multiply(graph_data.T > graph_data) - \
                     graph_data.multiply(graph_data.T > graph_data)
        W = np.array(graph_data.todense())
        index_pos = np.where(W > 0)
        W[index_pos] = 1 / W[index_pos]  # similarity = inverse distance
        D = np.diag(np.dot(W, np.ones(np.shape(W)[1])))
        L.append(D - W)

    Sigma_x = []
    Sigma_y = []
    for i in range(n_datasets - 1):
        Sigma_y.append(np.diag(np.dot(np.transpose(np.ones(np.shape(Gc[i])[0])), Gc[i])))
        Sigma_x.append(np.diag(np.dot(Gc[i], np.ones(np.shape(Gc[i])[1]))))

    S_xy = Gc[0]                        # 1919 x 1989
    S_xx = L[0] + Lambda * Sigma_x[0]   # 1919 x 1919
    S_yy = L[-1] + Lambda * Sigma_y[0]  # 1989 x 1989
    # NOTE: with only two datasets the original multi-dataset stacking loop
    # (vstack/block_diag over datasets 1..n-2) never runs; kept out as in
    # the original commented-out code.

    # whitening transforms H_x = S_xx^{-1/2}, H_y = S_yy^{-1/2}
    v, Q = la.eig(S_xx)
    v = v + 1e-12  # guard against zero eigenvalues
    V = np.diag(v ** (-0.5))
    H_x = np.dot(Q, np.dot(V, np.transpose(Q)))  # new latent representation

    v, Q = la.eig(S_yy)
    v = v + 1e-12
    V = np.diag(v ** (-0.5))
    H_y = np.dot(Q, np.dot(V, np.transpose(Q)))  # new latent representation

    H = np.dot(H_x, np.dot(S_xy, H_y))
    U, sigma, V = la.svd(H)  # singular value decomposition into three factors

    num = [0]
    for i in range(n_datasets - 1):
        num.append(num[i] + len(data[i]))

    U, V = U[:, :output_dim], np.transpose(V)[:, :output_dim]

    fx = np.dot(H_x, U)
    fy = np.dot(H_y, V)

    integrated_data = []
    for i in range(n_datasets - 1):
        integrated_data.append(fx[num[i]:num[i + 1]])

    integrated_data.append(fy)

    # fixed: print the shape, not the entire matrix
    print(integrated_data[0].shape, integrated_data[1].shape)  # [1919, 30], [1989, 30]

    return integrated_data


# ============================== file: train.py ===============================
import torch
from model import Encoder_omics
from inits import preprocess_adj, preprocess_graph
from preprocess import construct_graph
import torch.nn.functional as F
from utils import plot_hist, plot_hist_multiple
from tqdm import tqdm
import numpy as np
from scipy.sparse import coo_matrix


class Train:
    """Training driver: builds the spatial/feature graphs for both omics
    layers, trains Encoder_omics, and returns the learned embeddings."""

    def __init__(self, args, device, data):
        self.args = args
        self.device = device
        self.data = data.copy()
        self.adata_omics1 = self.data['adata_omics1']
        self.adata_omics2 = self.data['adata_omics2']

        self.n_cell_omics1 = self.adata_omics1.n_obs
        self.n_cell_omics2 = self.adata_omics2.n_obs

        # feature
        self.features_omics1 = torch.FloatTensor(self.adata_omics1.obsm['feat'].copy()).to(self.device)
        self.features_omics2 = torch.FloatTensor(self.adata_omics2.obsm['feat'].copy()).to(self.device)

        # dimension of input feature
        self.args.dim_input1 = self.features_omics1.shape[1]
        self.args.dim_input2 = self.features_omics2.shape[1]
        self.args.dim_output1 = self.args.dim_output
        self.args.dim_output2 = self.args.dim_output

        ######################################## construct spatial graph ########################################
        self.adj_spatial_omics1 = self.adata_omics1.uns['adj_spatial']
        self.adj_spatial_omics1 = construct_graph(self.adj_spatial_omics1)
        self.adj_spatial_omics2 = self.adata_omics2.uns['adj_spatial']
        self.adj_spatial_omics2 = construct_graph(self.adj_spatial_omics2)

        self.adj_spatial_omics1 = self.adj_spatial_omics1.toarray()  # To ensure that adjacent matrix is symmetric
        self.adj_spatial_omics2 = self.adj_spatial_omics2.toarray()

        # symmetrise and clip to {0,1}
        self.adj_spatial_omics1 = self.adj_spatial_omics1 + self.adj_spatial_omics1.T
        self.adj_spatial_omics1 = np.where(self.adj_spatial_omics1 > 1, 1, self.adj_spatial_omics1)
        # fixed: was assigned to misspelled `self.adj_spatialomics2`, so the
        # omics2 spatial graph was never actually symmetrised
        self.adj_spatial_omics2 = self.adj_spatial_omics2 + self.adj_spatial_omics2.T
        self.adj_spatial_omics2 = np.where(self.adj_spatial_omics2 > 1, 1, self.adj_spatial_omics2)

        # convert dense matrix to sparse matrix
        self.adj_spatial_omics1 = preprocess_graph(self.adj_spatial_omics1).to(self.device)  # sparse adjacent matrix corresponding to spatial graph
        self.adj_spatial_omics2 = preprocess_graph(self.adj_spatial_omics2).to(self.device)

        ######################################## construct feature graph ########################################
        self.adj_feature_omics1 = torch.FloatTensor(self.adata_omics1.obsm['adj_feature'].copy().toarray())
        self.adj_feature_omics2 = torch.FloatTensor(self.adata_omics2.obsm['adj_feature'].copy().toarray())

        self.adj_feature_omics1 = self.adj_feature_omics1 + self.adj_feature_omics1.T
        self.adj_feature_omics1 = np.where(self.adj_feature_omics1 > 1, 1, self.adj_feature_omics1)
        self.adj_feature_omics2 = self.adj_feature_omics2 + self.adj_feature_omics2.T
        self.adj_feature_omics2 = np.where(self.adj_feature_omics2 > 1, 1, self.adj_feature_omics2)

        # convert dense matrix to sparse matrix
        self.adj_feature_omics1 = preprocess_graph(self.adj_feature_omics1).to(self.device)  # sparse adjacent matrix corresponding to feature graph
        self.adj_feature_omics2 = preprocess_graph(self.adj_feature_omics2).to(self.device)

    def train(self):
        """Train the model and return (emb_omics1, emb_omics2, emb_combined,
        alpha) as numpy arrays; the embeddings are L2-normalised per cell."""
        self.model = Encoder_omics(self.args.dim_input1, self.args.dim_output1, self.args.dim_input2, self.args.dim_output2).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), self.args.learning_rate,
                                          weight_decay=self.args.weight_decay)
        for epoch in tqdm(range(self.args.epochs)):
            self.model.train()
            results = self.model(self.features_omics1, self.features_omics2, self.adj_spatial_omics1, self.adj_feature_omics1, self.adj_spatial_omics2, self.adj_feature_omics2)

            # reconstruction loss
            self.loss_recon_omics1 = F.mse_loss(self.features_omics1, results['emb_recon_omics1'])
            self.loss_recon_omics2 = F.mse_loss(self.features_omics2, results['emb_recon_omics2'])

            # correspondence loss
            self.loss_corre_omics1 = F.mse_loss(results['emb_latent_omics1'], results['emb_latent_omics1_across_recon'])
            self.loss_corre_omics2 = F.mse_loss(results['emb_latent_omics2'], results['emb_latent_omics2_across_recon'])

            # adversial loss (computed but currently excluded from `loss` below)
            self.label1 = torch.zeros(results['score_omics1'].size(0),).float().to(self.device)
            self.label2 = torch.ones(results['score_omics2'].size(0),).float().to(self.device)
            self.adversial_loss1 = F.mse_loss(results['score_omics1'], self.label1)
            self.adversial_loss2 = F.mse_loss(results['score_omics2'], self.label2)
            self.loss_ad = self.adversial_loss1 + self.adversial_loss2

            # dataset-specific loss weighting
            if self.args.datatype == 'Spatial-ATAC-RNA-seq':
                loss = self.loss_recon_omics1 + 2.5 * self.loss_recon_omics2 + self.loss_corre_omics1 + self.loss_corre_omics2  #+ self.loss_ad
                print('self.loss_recon_omics1:', self.loss_recon_omics1)
                print('self.loss_recon_omics2:', 2.5 * self.loss_recon_omics2)
                print('self.loss_corre_omics1:', self.loss_corre_omics1)
                print('self.loss_corre_omics2:', self.loss_corre_omics2)
                print('loss:', loss)
            elif self.args.datatype == 'SPOTS':
                loss = self.loss_recon_omics1 + 50 * self.loss_recon_omics2 + self.loss_corre_omics1 + 5 * self.loss_corre_omics2  #+ self.loss_ad
                print('self.loss_recon_omics1:', self.loss_recon_omics1)
                print('self.loss_recon_omics2:', 50 * self.loss_recon_omics2)
                print('self.loss_corre_omics1:', self.loss_corre_omics1)
                print('self.loss_corre_omics2:', 5 * self.loss_corre_omics2)
                print('loss:', loss)
            elif self.args.datatype == 'Stereo-CITE-seq':
                loss = self.loss_recon_omics1 + 10 * self.loss_recon_omics2 + self.loss_corre_omics1 + 10 * self.loss_corre_omics2  #+ 10*self.loss_ad
                print('self.loss_recon_omics1:', self.loss_recon_omics1)
                print('self.loss_recon_omics2:', 10 * self.loss_recon_omics2)
                print('self.loss_corre_omics1:', self.loss_corre_omics1)
                print('self.loss_corre_omics2:', 10 * self.loss_corre_omics2)
                print('loss:', loss)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        print("Model training finished!\n")

        # final forward pass in eval mode, no gradients
        with torch.no_grad():
            self.model.eval()
            results = self.model(self.features_omics1, self.features_omics2, self.adj_spatial_omics1, self.adj_feature_omics1, self.adj_spatial_omics2, self.adj_feature_omics2)

        emb_omics1 = F.normalize(results['emb_recon_omics1'], p=2, eps=1e-12, dim=1)
        emb_omics2 = F.normalize(results['emb_recon_omics2'], p=2, eps=1e-12, dim=1)
        emb_combined = F.normalize(results['emb_latent_combined'], p=2, eps=1e-12, dim=1)

        return emb_omics1.detach().cpu().numpy(), emb_omics2.detach().cpu().numpy(), emb_combined.detach().cpu().numpy(), results['alpha'].detach().cpu().numpy()