Merge branch 'v1.4.0'

wbaopaul · Jun 29, 2021 · a08a2b2 · a08a2b2
2 parents c8d777e + 33fcc1e
commit a08a2b2
Show file tree

Hide file tree

Showing 11 changed files with 266 additions and 110 deletions.
diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ Installation
 ------------
 
 -   Note: It is not necessary to install scATAC-pro from scratch. You can use the docker or singularity version if you prefer (see [Run scATAC-pro through docker or singularity](#run-scATAC-pro-through-docker-or-singularity) )
--   Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.3.1
+-   Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.4.0
 
 <!-- -->
 
@@ -49,8 +49,9 @@ Installation
 Updates
 ------------
 - Now provide [scATAC-pro tutorial in R](https://scatacpro-in-r.netlify.app/index.html) for access QC metrics and perform downstream analysis
-- Current version: 1.3.1
+- Current version: 1.4.0
 - Recent updates
+    * *labelTransfer*: new module added, to do label trasfer (for cell annotation) from scRNA-seq annotation. First construct a gene by cell activity matrix, then use *FindTransferAnchors* and *TransferData* function from Seurat R package to predicted cell type annotation from the cell annotaiton in scRNA-seq data.
     * *rmDoublets*: new module added, to remove potential doublets using [DoubletFinder](https://github.com/chris-mcginnis-ucsf/DoubletFinder) algorithm.
     * *qc_per_barcode*: add tss enrichment score per cell into the QC metrics
     * *call_cell*: enable filtering barcodes with minimal tss enrichment score cutoff (parameter **min_tss_escore** in the updated [configure_user.txt](configure_user.txt) file)
@@ -100,7 +101,7 @@ Dependencies
 -   trim\_galore (&gt;=0.6.3), Trimmomatic (&gt;=0.6.3)
 -   Regulratory Genomics Toolbox (RGT, for footprinting analysis)
 -   g++ compiler, bzip2, ncurses-devel
--   R packaages: devtools, flexdashboard, png, data.table, Matirx, Rcpp, ggplot2, flexmix, optparse, magrittr, readr, Seurat, bedr, gridExtra, ggrepel, kableExtra, viridis, xlsx, RColorBrewer,pheatmap,motifmatchr, chromVAR, chromVARmotifs, SummarizedExperiment, BiocParallel, DESeq2, clusterProfiler, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Mmusculus.UCSC.mm10, VisCello.atac
+-   R packaages: devtools, flexdashboard, png, data.table, Matirx, Rcpp, ggplot2, flexmix, optparse, magrittr, readr, Seurat, bedr, gridExtra, ggrepel, kableExtra, viridis, xlsx, RColorBrewer,pheatmap,motifmatchr, chromVAR, chromVARmotifs, SummarizedExperiment, BiocParallel, DESeq2, clusterProfiler, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Mmusculus.UCSC.mm10, EnsDb.Hsapiens.v86, EnsDb.Mmusculus.v79, VisCello.atac
 
 Quick start guide
 -----------
@@ -146,7 +147,7 @@ Step by step guide to running scATAC-pro
 
 -   **IMPORTANT**: you can run scATAC-pro sequentially. The input of a later analysis module is the output of the previous analysis modules. The following tutorial uses fastq files downloaded from [PBMC10k 10X Genomics](https://support.10xgenomics.com/single-cell-atac/datasets/1.1.0/atac_v1_pbmc_10k?) 
 
--   *Combine data from different sequencing lanes*
+-   *Combine data from different sequencing lanes* (not needed for 10x genomics data since v1.1.4)
 
     $ cat pbmc_fastqs/atac_pbmc_10k_v1_S1_L001_R1_001.fastq.gz pbmc_fastqs/atac_pbmc_10k_v1_S1_L002_R1_001.fastq.gz > pe1_fastq.gz
 
@@ -158,11 +159,11 @@ Step by step guide to running scATAC-pro
 
 ```
     $ scATAC-pro -s demplx_fastq 
-                 -i pe1_fastq.gz,pe2_fastq.gz,index_fastq.gz 
+                 -i pe1_fastq.gz,pe2_fastq.gz,index_fastq.gz(,other_index_fastq.gz, ...) 
                  -c configure_user.txt 
-    # or
+    # or for 10x data
     $ scATAC-pro -s demplx_fastq 
-                 -i pbmc_fastqs/ 
+                 -i pbmc_10x_fastqs/ 
                  -c configure_user.txt 
 
     $ scATAC-pro -s trimming 
@@ -219,8 +220,8 @@ Step by step guide to running scATAC-pro
                  -i output/downstream_analysis/PEAK_CALLER/CELL_CALLER/cell_cluster_table.tsv
                  -c configure_user.txt
 
-    $ scATAC-pro -s footprint ## supporting comparison two clusters, and one-vs-rest
-                 -i 0,1  ## or '0,rest' (means cluster1 vs rest) or 'one,rest' (all one-vs-rest)
+    $ scATAC-pro -s footprint ## supporting comparison two groups of cell clusters, and one-vs-rest
+                 -i 0,1  ## or '0:3,1:2' (group1 consist of cluster0,3, and group2 for cluster1,2)) or 'one,rest' (all one-vs-rest comparison)
                  -c configure_user.txt
                  
     $ scATAC-pro -s runCicero
@@ -264,6 +265,14 @@ Step by step guide to running scATAC-pro
     $ scATAC-pro -s integrate_mtx
                  -i reconstructed_mtx_file1,reconstructed_mtx_file2,(reconstructed_mtx_file3...)   
                  -c configure_user.txt
+
+
+    ## label transfer (cell annotation) from scRNA-seq
+    ## cell annotated with metadata 'Cell_Type' in seurat obj of scRNA-seq data
+    ## the gtf_file is optional
+    $ scATAC-pro -s labelTransfer
+                 -i seurat_obj_atac.rds,seurat_obj_rna.rds(,gtf_file)   
+                 -c configure_user.txt
 ```
 
 
@@ -292,7 +301,7 @@ Detailed Usage
     usage : scATAC-pro -s STEP -i INPUT -c CONFIG [-o] [-h] [-v]
     Use option -h|--help for more information
 
-    scATAC-pro 1.3.1
+    scATAC-pro 1.4.0
     ---------------
     OPTIONS
 
@@ -429,7 +438,17 @@ Detailed Usage
                          input: a bam file generated by scATAC-pro
                          output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as
                                  the input bam file)
-
+      
+          labelTransfer: label transfer (cell annotation) from scRNA-seq data
+                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format,
+                                and an optional .gtf file for gene annotation, separated by a comma. 
+                         output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and
+                                 an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac
+                                 seurat object.
+                         *Note*: the cell annotation should be given as a metadata of the seurat object of
+                               scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction 
+                               done.
+        
        -i|--input INPUT : input data, different types of input data are required for different analysis
        -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module
        [-o|--output_dir : folder to save results, default output/ under the current directory; sub-folder will be created automatically for each analysis

diff --git a/complete_update_history.md b/complete_update_history.md
@@ -1,4 +1,6 @@
 ## Complete Update History
+- Version 1.4.0 released 
+    * new module added: labelTransfer (for cell annotation) from scRNA-seq
 - Version 1.3.1 released
     * *rmDoublets*: a new module added, to remove doublets
     * *clustering*: accepts seurat obj (in .rds format) as input as well

diff --git a/configure_user.txt b/configure_user.txt
@@ -147,8 +147,10 @@ pvalue_fp = 0.05
 ## cicero cis-interaction ##
 ###########################################################################
 RUN_Cicero = TRUE
-## plot interactions within Cicero_Plot_Region on the summary report  
-Cicero_Plot_Region = chr5:140610000-140640000 ## you can also specify a gene name 
+
+## plot interactions within Cicero_Plot_Region on the summary report
+## you can also specify it as a genomic location, a gene name or none (skip the plot) 
+Cicero_Plot_Region = chr5:140610000-140640000 
 
 
 ############################################################################

diff --git a/scATAC-pro b/scATAC-pro
@@ -9,7 +9,7 @@
 #########################                                                                   
 
 SOFT="scATAC-pro"
-VERSION="1.3.1"
+VERSION="1.4.0"
 
 function usage {
     echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]"
@@ -148,6 +148,16 @@ function help {
                          input: a bam file generated by scATAC-pro
                          output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as
                                  the input bam file)"
+
+   echo "       label transfer (cell annotation) from scRNA-seq data
+                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format,                                   and an optional .gtf file for gene annotation, separated by a comma.
+                         output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and
+                                 an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac
+                                 seurat object.
+                         *Note*: the cell annotation should be given as a metadata of the seurat object of
+                                 scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction
+                                 done."
+
     echo "   -i|--input INPUT : input data, different types of input data are required for different analysis;"
     echo "   -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module"
     echo "   [-o|--output_dir : folder to save results, default 'output/' under current directory. sub-folder will be created automatically for each module"
@@ -250,7 +260,7 @@ fi
 
 ################################ check valid STEPs #####
 ############################
-AVAILABLE_STEP_ARRAY=("demplx_fastq" "trimming" "mapping" "mapping_qc" "aggr_signal" "call_peak" "recall_peak" "get_mtx" "qc_per_barcode" "call_cell" "get_bam4Cells" "clustering" "motif_analysis" "call_cnv" "runDA" "runGO" "runCicero" "split_bam" "footprint" "report" "process" "process_no_dex" "process_with_bam" "integrate" "downstream" "all" "integrate_seu" "integrate_mtx" "convert10xbam" "mergePeaks" "reConstMtx" "visualize" "addCB2bam" "rmDoublets")
+AVAILABLE_STEP_ARRAY=("demplx_fastq" "trimming" "mapping" "mapping_qc" "aggr_signal" "call_peak" "recall_peak" "get_mtx" "qc_per_barcode" "call_cell" "get_bam4Cells" "clustering" "motif_analysis" "call_cnv" "runDA" "runGO" "runCicero" "split_bam" "footprint" "report" "process" "process_no_dex" "process_with_bam" "integrate" "downstream" "all" "integrate_seu" "integrate_mtx" "convert10xbam" "mergePeaks" "reConstMtx" "visualize" "addCB2bam" "rmDoublets" "labelTransfer")
 
 
 check_s=0

diff --git a/scripts/Makefile b/scripts/Makefile
@@ -363,6 +363,16 @@ rmDoublets: config_check
 	@$(SCRIPTS)/rmDoublets.sh $(INPUT_FILE) $(CONFIG_FILE) $(CONFIG_SYS)
 	@echo 
 
+#######################################
+## labelTransfer
+#######################################
+labelTransfer: config_check
+	@echo "--------------------------------------------" 
+	@date 
+	@echo "label transfer from scRNA-seq..." 
+	@$(SCRIPTS)/labelTransfer.sh $(INPUT_FILE) $(CONFIG_FILE) $(CONFIG_SYS)
+	@echo 
+
 #######################################
 ## all
 #######################################

diff --git a/scripts/install/install_Rpackages.R b/scripts/install/install_Rpackages.R
@@ -18,7 +18,7 @@ for(pk in pks){
 }
 
 bioc.pks = c('RColorBrewer','pheatmap','motifmatchr', 'chromVAR', 'SummarizedExperiment', 'BiocParallel', 'DESeq2', 'edgeR', 'matrixStats', 'cicero', 'farver', 'BSgenome.Hsapiens.UCSC.hg38', 'BSgenome.Mmusculus.UCSC.mm10',  'clusterProfiler',
- 'DropletUtils')
+ 'DropletUtils', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79')
 
 for(pk in bioc.pks){
     if(!require(pk, character.only = T)) {  

diff --git a/scripts/labelTransfer.sh b/scripts/labelTransfer.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+
+inputs=$1  ## seurat obj,and expeted doublet rate,
+
+# reading configure file
+curr_dir=`dirname $0`
+source ${curr_dir}/read_conf.sh
+read_conf "$2"
+read_conf "$3"
+
+
+inputs=(${inputs//,/ })
+seuratObj_atac_file=${inputs[0]}
+seuratObj_rna_file=${inputs[1]}
+gtf_file=${inputs[2]}
+
+curr_dir=`dirname $0`
+
+${R_PATH}/Rscript --vanilla ${curr_dir}/src/labelTransfer.R $seuratObj_atac_file $seuratObj_rna_file $GENOME_NAME $gtf_file
+
+echo "Transfer Cell_Type label from scRNA-seq done!"
+
diff --git a/scripts/src/clustering.R b/scripts/src/clustering.R
@@ -49,15 +49,16 @@ seurat.obj = runSeurat_Atac(mtx, npc = nREDUCTION, norm_by = norm_by,
                                top_variable_features = top_variable_features, reg.var = 'nCount_ATAC')
 
 ## add qc stat to each cell
-qc_singlecell = fread(qc_stat_file)
-qc_singlecell = qc_singlecell[bc %in% colnames(seurat.obj)]
-qc_singlecell = data.frame(qc_singlecell)
-rownames(qc_singlecell) = qc_singlecell$bc
-qc_singlecell$bc = NULL
-names(qc_singlecell) =  c("total.unique.frags", "frac.mito",  "frac.peak",
-                         "frac.promoter", "frac.tss", "frac.enhancer", "tss_enrich_score")
-seurat.obj <- AddMetaData(seurat.obj, metadata = qc_singlecell)
-
+if(file.exists(qc_stat_file)){
+    qc_singlecell = fread(qc_stat_file)
+    qc_singlecell = qc_singlecell[bc %in% colnames(seurat.obj)]
+    qc_singlecell = data.frame(qc_singlecell)
+    rownames(qc_singlecell) = qc_singlecell$bc
+    qc_singlecell$bc = NULL
+    names(qc_singlecell) =  c("total.unique.frags", "frac.mito",  "frac.peak",
+                             "frac.promoter", "frac.tss", "frac.enhancer", "tss_enrich_score")
+    seurat.obj <- AddMetaData(seurat.obj, metadata = qc_singlecell)
+}
 
 
 

diff --git a/scripts/src/dsAnalysis_utilities.R b/scripts/src/dsAnalysis_utilities.R
@@ -566,7 +566,7 @@ do_DA <- function(mtx_score, clusters, test = 'wilcox',
   }
   return(res)
 }
-
+do_DA = cmpfun(do_DA)
 
 #fg_genes: vector of forground genes
 #bg_genes: background genes
@@ -827,6 +827,7 @@ normalize_gene_activities.corrected <- function (activity_matrices, cell_num_gen
   }
   return(ret)
 }
+normalize_gene_activities.corrected = cmpfun(normalize_gene_activities.corrected)
 
 # do cicero given a Seurat object, output gene activity score
 doCicero_gascore <- function(seurat.obj, reduction = 'tsne', chr_sizes,
@@ -899,7 +900,7 @@ doCicero_gascore <- function(seurat.obj, reduction = 'tsne', chr_sizes,
   res = list('conns' = conns, 'ga_score' = cicero_gene_activities)
   return(res)  
 }
-
+doCicero_gascore = cmpfun(doCicero_gascore)
 
 # do cicero given a Seurat object, just return the connection 
 doCicero_conn <- function(seurat.obj, reduction = 'tsne', 
@@ -948,7 +949,7 @@ doCicero_conn <- function(seurat.obj, reduction = 'tsne',
   conns = conns[coaccess > coaccess_thr, ]
   return(conns)
 }
-
+doCicero_conn = cmpfun(doCicero_conn)
 
 ## change rowname of zscores (tf name) from chromvar to be readable
 readable_tf <- function(sele.zscores, GENOME_NAME){
@@ -1095,6 +1096,7 @@ runDiffMotifEnrich <- function(mtx_score, clusters, test = 'wilcox',
   }
   return(res)
 }
+runDiffMotifEnrich = cmpfun(runDiffMotifEnrich)
 
 ## mtx_objs: a list of matrix 
 run_integration <- function(mtx_list, integrate_by = 'VFACS',
@@ -1208,7 +1210,7 @@ run_integration <- function(mtx_list, integrate_by = 'VFACS',
 
   return(seurat.obj)
 }
-
+run_integration = cmpfun(run_integration)   
 
 # Find doublets
 FindDoublets_Atac <- function(seurat.atac, PCs = 1:50, 
@@ -1258,4 +1260,61 @@ FindDoublets_Atac <- function(seurat.atac, PCs = 1:50,
   seurat.atac$Doublet_Singlet = seurat.rna$Doublet_Singlet
   return(seurat.atac)
 }
+FindDoublets_Atac = cmpfun(FindDoublets_Atac)
+
+# generate gene activity matrix for labelTransfer
+generate_gene_cisActivity <- function(gene_ann, mtx, include_body = T,
+                                      dist_to_tss = 2000){
+  ## generating gene cis activity score
+  ## input: gene_ann (data table with chr, gene_start, gene_end, strand, gene_name), 
+  ##        and atac matrix file
+
+  ## 1. select gene up/down-stream regions (promoter with/without gene_body) ##
+
+
+  if(!include_body){
+    gene_ann[, 'start' := ifelse(strand == '+', gene_start - dist_to_tss, gene_end - dist_to_tss)]
+    gene_ann[, 'end' := ifelse(strand == '+', gene_start + dist_to_tss, gene_end + dist_to_tss)]
+  }else{
+    gene_ann[, 'start' := ifelse(strand == '+', gene_start - dist_to_tss, gene_start)]
+    gene_ann[, 'end' := ifelse(strand == '+', gene_end, gene_end + dist_to_tss)]
+
+  }
+
+  gene_ann = subset(gene_ann, select = c(chr, start, end, gene_name))
+
+
+  ## 2. read mtx file ##
+
+  rnames = rownames(mtx)
+  chrs = sapply(rnames, function(x) unlist(strsplit(x, '-'))[1])
+  starts = sapply(rnames, function(x) unlist(strsplit(x, '-'))[2])
+  ends = sapply(rnames, function(x) unlist(strsplit(x, '-'))[3])
+
+  peaks = data.table('chr' = chrs, 'start' = as.integer(starts), 
+                     'end' = as.integer(ends))
+  setkey(peaks, chr, start, end)
+  peaks[, 'pname' := paste(chr, start, end, sep = '-')]
+  over.ids = foverlaps(gene_ann, peaks, by.x = c('chr', 'start', 'end'),
+                       by.y = c('chr', 'start', 'end'), which = T)
+  over.ids[, 'gene_name' := gene_ann[xid, gene_name]]
+  over.ids[, 'pname' := peaks[yid, pname]]
+  over.ids = over.ids[complete.cases(over.ids)]
+
+
+  smtx = sparseMatrix(i = over.ids$xid, j = over.ids$yid,
+                      dimnames = list(gene_ann$gene_name[1:max(over.ids$xid)],
+                                      peaks$pname[1:max(over.ids$yid)]))
+
+  mtx = mtx[rownames(mtx) %in% colnames(smtx), ]
+  smtx = smtx[, rownames(mtx)]
+
+  activity.matrix = smtx %*% mtx
+  rs = Matrix::rowSums(activity.matrix)
+  activity.matrix = activity.matrix[rs > 10, ]
+
+  return(activity.matrix)
+
+}
 
+generate_gene_cisActivity = cmpfun(generate_gene_cisActivity)