From 7e4412f09ff9cc8dbe4b466ccade4df8de5657d7 Mon Sep 17 00:00:00 2001
From: wbaopaul <yuw1@email.chop.edu>
Date: Wed, 26 May 2021 09:45:39 -0400
Subject: [PATCH 1/8] v1.4.0 planned

---
 README.md                  | 6 +++---
 complete_update_history.md | 2 ++
 scATAC-pro                 | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index dcf9363..b6013e4 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ Installation
 ------------
 
 -   Note: It is not necessary to install scATAC-pro from scratch. You can use the docker or singularity version if you prefer (see [Run scATAC-pro through docker or singularity](#run-scATAC-pro-through-docker-or-singularity) )
--   Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.3.1
+-   Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.4.0
 
 <!-- -->
 
@@ -49,7 +49,7 @@ Installation
 Updates
 ------------
 - Now provide [scATAC-pro tutorial in R](https://scatacpro-in-r.netlify.app/index.html) for access QC metrics and perform downstream analysis
-- Current version: 1.3.1
+- Current version: 1.4.0
 - Recent updates
     * *rmDoublets*: new module added, to remove potential doublets using [DoubletFinder](https://github.com/chris-mcginnis-ucsf/DoubletFinder) algorithm.
     * *qc_per_barcode*: add tss enrichment score per cell into the QC metrics
@@ -292,7 +292,7 @@ Detailed Usage
     usage : scATAC-pro -s STEP -i INPUT -c CONFIG [-o] [-h] [-v]
     Use option -h|--help for more information
 
-    scATAC-pro 1.3.1
+    scATAC-pro 1.4.0
     ---------------
     OPTIONS
 
diff --git a/complete_update_history.md b/complete_update_history.md
index 991502a..d8b5374 100644
--- a/complete_update_history.md
+++ b/complete_update_history.md
@@ -1,4 +1,6 @@
 ## Complete Update History
+- Version 1.4.0 
+    * new module planned: cellAnnotate from singleR or from scRNA-seq
 - Version 1.3.1 released
     * *rmDoublets*: a new module added, to remove doublets
     * *clustering*: accepts seurat obj (in .rds format) as input as well
diff --git a/scATAC-pro b/scATAC-pro
index 0b43c00..77055a3 100755
--- a/scATAC-pro
+++ b/scATAC-pro
@@ -9,7 +9,7 @@
 #########################                                                                   
 
 SOFT="scATAC-pro"
-VERSION="1.3.1"
+VERSION="1.4.0"
 
 function usage {
     echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]"

From 979f7d1704982f5c5a3e3308992c107d93b0ece6 Mon Sep 17 00:00:00 2001
From: wbaopaul <yuw1@email.chop.edu>
Date: Wed, 26 May 2021 09:54:17 -0400
Subject: [PATCH 2/8] v1.4.0 planned

---
 scATAC-pro | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scATAC-pro b/scATAC-pro
index 77055a3..0b43c00 100755
--- a/scATAC-pro
+++ b/scATAC-pro
@@ -9,7 +9,7 @@
 #########################                                                                   
 
 SOFT="scATAC-pro"
-VERSION="1.4.0"
+VERSION="1.3.1"
 
 function usage {
     echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]"

From b1d474ce48cb36097cd9e63cedae02207a3b16de Mon Sep 17 00:00:00 2001
From: wbaopaul <yuw1@email.chop.edu>
Date: Thu, 10 Jun 2021 09:48:50 -0400
Subject: [PATCH 3/8] make add qc metrics to seurat obj optional

---
 scripts/src/clustering.R | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/scripts/src/clustering.R b/scripts/src/clustering.R
index c2e182e..0b36ea9 100644
--- a/scripts/src/clustering.R
+++ b/scripts/src/clustering.R
@@ -49,15 +49,16 @@ seurat.obj = runSeurat_Atac(mtx, npc = nREDUCTION, norm_by = norm_by,
                                top_variable_features = top_variable_features, reg.var = 'nCount_ATAC')
 
 ## add qc stat to each cell
-qc_singlecell = fread(qc_stat_file)
-qc_singlecell = qc_singlecell[bc %in% colnames(seurat.obj)]
-qc_singlecell = data.frame(qc_singlecell)
-rownames(qc_singlecell) = qc_singlecell$bc
-qc_singlecell$bc = NULL
-names(qc_singlecell) =  c("total.unique.frags", "frac.mito",  "frac.peak",
-                         "frac.promoter", "frac.tss", "frac.enhancer", "tss_enrich_score")
-seurat.obj <- AddMetaData(seurat.obj, metadata = qc_singlecell)
-
+if(file.exists(qc_stat_file)){
+    qc_singlecell = fread(qc_stat_file)
+    qc_singlecell = qc_singlecell[bc %in% colnames(seurat.obj)]
+    qc_singlecell = data.frame(qc_singlecell)
+    rownames(qc_singlecell) = qc_singlecell$bc
+    qc_singlecell$bc = NULL
+    names(qc_singlecell) =  c("total.unique.frags", "frac.mito",  "frac.peak",
+                             "frac.promoter", "frac.tss", "frac.enhancer", "tss_enrich_score")
+    seurat.obj <- AddMetaData(seurat.obj, metadata = qc_singlecell)
+}
 
 
 

From 2e0aa7c253c08b75cc632753446279fee5258f6c Mon Sep 17 00:00:00 2001
From: wbaopaul <yuw1@email.chop.edu>
Date: Thu, 24 Jun 2021 09:52:09 -0400
Subject: [PATCH 4/8] make promoter/enhance ann optional

---
 configure_user.txt                |   6 +-
 scATAC-pro                        |   2 +-
 scripts/src/get_qc_per_barcode.R  |  68 ++++++++++-----
 scripts/src/scATAC-pro_report.Rmd | 138 ++++++++++++++++--------------
 4 files changed, 126 insertions(+), 88 deletions(-)

diff --git a/configure_user.txt b/configure_user.txt
index a39ffec..a5cbe84 100644
--- a/configure_user.txt
+++ b/configure_user.txt
@@ -147,8 +147,10 @@ pvalue_fp = 0.05
 ## cicero cis-interaction ##
 ###########################################################################
 RUN_Cicero = TRUE
-## plot interactions within Cicero_Plot_Region on the summary report  
-Cicero_Plot_Region = chr5:140610000-140640000 ## you can also specify a gene name 
+
+## plot interactions within Cicero_Plot_Region on the summary report
+## you can also specify it as a genomic location, a gene name or none (skip the plot) 
+Cicero_Plot_Region = chr5:140610000-140640000 
 
 
 ############################################################################
diff --git a/scATAC-pro b/scATAC-pro
index 0b43c00..77055a3 100755
--- a/scATAC-pro
+++ b/scATAC-pro
@@ -9,7 +9,7 @@
 #########################                                                                   
 
 SOFT="scATAC-pro"
-VERSION="1.3.1"
+VERSION="1.4.0"
 
 function usage {
     echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]"
diff --git a/scripts/src/get_qc_per_barcode.R b/scripts/src/get_qc_per_barcode.R
index 308d6c7..48a951e 100644
--- a/scripts/src/get_qc_per_barcode.R
+++ b/scripts/src/get_qc_per_barcode.R
@@ -109,6 +109,7 @@ sourceCpp(code='
 )
 
 
+
 args = commandArgs(T)
 frags.file = args[1]
 peaks.file = args[2]
@@ -128,17 +129,23 @@ frags = frags[!grepl(chr, pattern = 'random', ignore.case = T)]
 frags = frags[!grepl(chr, pattern ='un', ignore.case = T)]
 
 peaks = fread(peaks.file, select=1:3, header = F)
-promoters = fread(promoters.file, select=1:3, header = F)
 tss = fread(tss.file, select=1:3, header = F)
-enhs = fread(enhs.file, select=1:3, header = F)
-names(peaks) = names(promoters) = names(tss) =
-  names(enhs) = c('chr', 'start', 'end')
+names(peaks) = names(tss) = c('chr', 'start', 'end')
 
+## promoters.file and enhs.file are optional
+if(file.exists(promoters.file)) {
+    promoters = fread(promoters.file, select=1:3, header = F)
+    names(promoters) =  c('chr', 'start', 'end')
+    setkey(promoters, chr, start)
+}
+if(file.exists(enhs.file)) {
+    enhs = fread(enhs.file, select=1:3, header = F)
+    names(enhs) =  c('chr', 'start', 'end')
+    setkey(enhs, chr, start)
+}
 
 setkey(peaks, chr, start)
-setkey(promoters, chr, start)
 setkey(tss, chr, start)
-setkey(enhs, chr, start)
 
 chrs = unique(frags$chr)
 
@@ -187,10 +194,8 @@ tss[, 'end' := end + 1000]
 fragsInRegion = NULL
 for(chr0 in chrs){
   peaks0 = peaks[chr == chr0]
-  promoters0 = promoters[chr == chr0]
 
   tss0 = tss[chr == chr0]
-  enhs0 = enhs[chr == chr0]
   frags0 = frags[chr == chr0]
   frags = frags[chr != chr0]
   if(nrow(peaks0) == 0){
@@ -199,23 +204,30 @@ for(chr0 in chrs){
     frags0[, 'peaks' := getOverlaps_read2AnyRegion(frags0, peaks0)]
   }
   
-  if(nrow(promoters0) == 0){
-    frags0[, 'promoters' := 0]
-  }else{
-    frags0[, 'promoters' := getOverlaps_read2AnyRegion(frags0, promoters0)]
-  }
-  
   if(nrow(tss0) == 0){
     frags0[, 'tss' := 0]
   }else{
     frags0[, 'tss' := getOverlaps_read2AnyRegion(frags0, tss0)]
   }
-  
-  if(nrow(enhs0) == 0){
-    frags0[, 'enhs' := 0]
-  }else{
-    frags0[, 'enhs' := getOverlaps_read2AnyRegion(frags0, enhs0)]
+
+  if(file.exists(promoters.file)) {
+    promoters0 = promoters[chr == chr0]
+    if(nrow(promoters0) == 0){
+        frags0[, 'promoters' := 0]
+    }else{
+        frags0[, 'promoters' := getOverlaps_read2AnyRegion(frags0, promoters0)]
+    }
   }
+  
+  
+  if(file.exists(enhs.file)){ 
+    enhs0 = enhs[chr == chr0]
+    if(nrow(enhs0) == 0){
+        frags0[, 'enhs' := 0]
+    }else{
+        frags0[, 'enhs' := getOverlaps_read2AnyRegion(frags0, enhs0)]
+    }
+  } 
  
   
   fragsInRegion = rbind(fragsInRegion, frags0)
@@ -230,12 +242,22 @@ fragsInRegion[, 'frac_mito' := sum(isMito)/total_frags, by = bc]
 fragsInRegion[, 'isMito' := NULL]
 fragsInRegion[, 'frac_peak' := sum(peaks)/total_frags, by = bc]
 fragsInRegion[, 'peaks' := NULL]
-fragsInRegion[, 'frac_promoter' := sum(promoters)/total_frags, by = bc]
-fragsInRegion[, 'promoters' := NULL]
 fragsInRegion[, 'frac_tss' := sum(tss)/total_frags, by = bc]
 fragsInRegion[, 'tss' := NULL]
-fragsInRegion[, 'frac_enhancer' := sum(enhs)/total_frags, by = bc]
-fragsInRegion[, 'enhs' := NULL]
+
+if(file.exists(promoters.file)) {
+    fragsInRegion[, 'frac_promoter' := sum(promoters)/total_frags, by = bc]
+    fragsInRegion[, 'promoters' := NULL]
+}else{
+    fragsInRegion[, 'frac_promoter' := 0]
+}
+
+if(file.exists(enhs.file)) {
+    fragsInRegion[, 'frac_enhancer' := sum(enhs)/total_frags, by = bc]
+    fragsInRegion[, 'enhs' := NULL]
+}else{
+    fragsInRegion[, 'frac_enhancer' := 0]
+}
 fragsInRegion = unique(fragsInRegion)
 fragsInRegion$tss_enrich_score = escores[fragsInRegion$bc]
 
diff --git a/scripts/src/scATAC-pro_report.Rmd b/scripts/src/scATAC-pro_report.Rmd
index cc9398a..fbc2f4c 100644
--- a/scripts/src/scATAC-pro_report.Rmd
+++ b/scripts/src/scATAC-pro_report.Rmd
@@ -710,80 +710,94 @@ if(file.exists(footprint_stats.file)){
 ### Predicted Interactions at a locus of interest
 ```{r, out.width='\\textwidth'}
 cicero_conn.file = paste0(down.dir, '/cicero_interactions.txt')
-
-if(file.exists(cicero_conn.file)){
+err = 0
+if(Cicero_Plot_Region != 'none' & file.exists(cicero_conn.file)){
  conns = fread(cicero_conn.file)
  temp <- tempfile()
 if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {
-    download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp)
- }
+    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp),
+    error = function(e) {
+                    print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz!")
+      return(1)})
+}
 
 if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
-    download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp)
- }
+    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp),
+    error = function(e) {
+                    print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!")
+      return(1)})
+}
 
 if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
-    download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp)
+    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp),
+    error = function(e) {
+                    print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!")
+      return(1)})
 }
 
 if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
-    download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp)
+    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp),
+    error = function(e) {
+                    print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!")
+      return(1)})
 }
-
-gene_anno <- rtracklayer::readGFF(temp)
- unlink(temp)
  
- # rename some columns to match requirements
- gene_anno$chromosome <- paste0("chr", gene_anno$seqid)
- gene_anno$gene <- gene_anno$gene_id
- gene_anno$transcript <- gene_anno$transcript_id
- gene_anno$symbol <- gene_anno$gene_name
- gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, 
-                                          transcript, gene, symbol))
- gene_anno = gene_anno[complete.cases(gene_anno), ]
- if(Cicero_Plot_Region %in% gene_anno$symbol){
-    tmp_anno = gene_anno[gene_anno$symbol == Cicero_Plot_Region, ]
-    chr0 = tmp_anno$chr[1]
-    start0 = min(tmp_anno$start)
-    end0 = max(tmp_anno$end)
- }else{
-    chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000
-    region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2]
-    start0 = as.integer(unlist(strsplit(region0, '-'))[1])
-    end0 = as.integer(unlist(strsplit(region0, '-'))[2])
- }
-
-if(end0 < start0 + 100000){
-    start0 = floor((start0+end0)/2) - 50000
-    end0 = start0 + 100000
-}
- conns[, 'chr1' := unlist(strsplit(Peak1, '_'))[1], by = Peak1] 
- conns[, 'start1' := as.integer(unlist(strsplit(Peak1, '_'))[2]), by = Peak1] 
- conns[, 'end2' := as.integer(unlist(strsplit(Peak2, '_'))[3]), by = Peak2] 
-conns = conns[start1 <= end2]
-conns0 = conns[chr1==chr0 & start1 >= start0 & end2 <= end0]
-
-conns[, c('chr1', 'start1', 'end2') := NULL]
-conns = data.frame(conns)
-if(nrow(conns0)==0) cat('No interaction found in the griven region!')
-
- if(plotEPS & nrow(conns0) >0 ){
-    pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans')
-    plot_connections(conns, chr0, start0, end0,
-                 gene_model = gene_anno, 
-                 coaccess_cutoff = .3, 
-                 connection_width = 1, 
-                 collapseTranscripts = "longest",
-                 viewpoint_alpha = 0)
-   dev.off()
-  
- }
-  if(nrow(conns0) > 0)  plot_connections(conns, chr0, start0, end0,
-                 gene_model = gene_anno, 
-                 coaccess_cutoff = .3, 
-                 connection_width = 1, 
-                 collapseTranscripts = "longest",
-                 viewpoint_alpha = 0)
+if(err == 0){
+    gene_anno <- rtracklayer::readGFF(temp)
+     unlink(temp)
+     
+     # rename some columns to match requirements
+     gene_anno$chromosome <- paste0("chr", gene_anno$seqid)
+     gene_anno$gene <- gene_anno$gene_id
+     gene_anno$transcript <- gene_anno$transcript_id
+     gene_anno$symbol <- gene_anno$gene_name
+     gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, 
+                                              transcript, gene, symbol))
+     gene_anno = gene_anno[complete.cases(gene_anno), ]
+     if(Cicero_Plot_Region %in% gene_anno$symbol){
+        tmp_anno = gene_anno[gene_anno$symbol == Cicero_Plot_Region, ]
+        chr0 = tmp_anno$chr[1]
+        start0 = min(tmp_anno$start)
+        end0 = max(tmp_anno$end)
+     }else{
+        chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000
+        region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2]
+        start0 = as.integer(unlist(strsplit(region0, '-'))[1])
+        end0 = as.integer(unlist(strsplit(region0, '-'))[2])
+     }
+
+    if(end0 < start0 + 100000){
+        start0 = floor((start0+end0)/2) - 50000
+        end0 = start0 + 100000
+    }
+     conns[, 'chr1' := unlist(strsplit(Peak1, '_'))[1], by = Peak1] 
+     conns[, 'start1' := as.integer(unlist(strsplit(Peak1, '_'))[2]), by = Peak1] 
+     conns[, 'end2' := as.integer(unlist(strsplit(Peak2, '_'))[3]), by = Peak2] 
+    conns = conns[start1 <= end2]
+    conns0 = conns[chr1==chr0 & start1 >= start0 & end2 <= end0]
+
+    conns[, c('chr1', 'start1', 'end2') := NULL]
+    conns = data.frame(conns)
+    if(nrow(conns0)==0) cat('No interaction found in the griven region!')
+
+     if(plotEPS & nrow(conns0) >0 ){
+        pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans')
+        plot_connections(conns, chr0, start0, end0,
+                     gene_model = gene_anno, 
+                     coaccess_cutoff = .3, 
+                     connection_width = 1, 
+                     collapseTranscripts = "longest",
+                     viewpoint_alpha = 0)
+       dev.off()
+      
+     }
+      if(nrow(conns0) > 0)  plot_connections(conns, chr0, start0, end0,
+                     gene_model = gene_anno, 
+                     coaccess_cutoff = .3, 
+                     connection_width = 1, 
+                     collapseTranscripts = "longest",
+                     viewpoint_alpha = 0)
+    }
 }
 
 ```

From f1d10500dc8f0ae275cadaab8305167b7f3429c0 Mon Sep 17 00:00:00 2001
From: wbaopaul <yuw1@email.chop.edu>
Date: Thu, 24 Jun 2021 23:14:50 -0400
Subject: [PATCH 5/8] add labelTransfer module for cell annotation

---
 README.md                          | 16 ++++-
 complete_update_history.md         |  2 +-
 scATAC-pro                         | 11 +++-
 scripts/Makefile                   | 10 +++
 scripts/labelTransfer.sh           | 23 +++++++
 scripts/src/dsAnalysis_utilities.R | 81 ++++++++++++++++++++++--
 scripts/src/labelTransfer.R        | 98 ++++++++++++++++++++++++++++++
 7 files changed, 234 insertions(+), 7 deletions(-)
 create mode 100755 scripts/labelTransfer.sh
 create mode 100644 scripts/src/labelTransfer.R

diff --git a/README.md b/README.md
index b6013e4..21de7d2 100644
--- a/README.md
+++ b/README.md
@@ -264,6 +264,12 @@ Step by step guide to running scATAC-pro
     $ scATAC-pro -s integrate_mtx
                  -i reconstructed_mtx_file1,reconstructed_mtx_file2,(reconstructed_mtx_file3...)   
                  -c configure_user.txt
+
+    ## label transfer (cell annotation) from scRNA-seq
+    ## cell annotated with metadata 'Cell_Type' in seurat obj of scRNA-seq data
+    $ scATAC-pro -s labelTransfer
+                 -i seurat_obj_atac.rds,seurat_obj_rna.rds(,gtf_file)   
+                 -c configure_user.txt
 ```
 
 
@@ -429,7 +435,15 @@ Detailed Usage
                          input: a bam file generated by scATAC-pro
                          output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as
                                  the input bam file)
-
+      
+          labelTransfer: label transfer (cell annotation) from scRNA-seq data
+                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data,                                and an optional gtf file for gene annotation, separated by a comma. 
+                         output: a updated seurat object added with the Predicted_Cell_Type as a metadat,
+                                 saved in the same directory as the input atac seurat object.
+                         Note: the cell annotation should be given as a metadata of the seurat object of
+                               scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction 
+                               done.
+        
        -i|--input INPUT : input data, different types of input data are required for different analysis
        -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module
        [-o|--output_dir : folder to save results, default output/ under the current directory; sub-folder will be created automatically for each analysis
diff --git a/complete_update_history.md b/complete_update_history.md
index d8b5374..2ed1945 100644
--- a/complete_update_history.md
+++ b/complete_update_history.md
@@ -1,6 +1,6 @@
 ## Complete Update History
 - Version 1.4.0 
-    * new module planned: cellAnnotate from singleR or from scRNA-seq
+    * new module planned: labelTransfer from scRNA-seq
 - Version 1.3.1 released
     * *rmDoublets*: a new module added, to remove doublets
     * *clustering*: accepts seurat obj (in .rds format) as input as well
diff --git a/scATAC-pro b/scATAC-pro
index 77055a3..31e777a 100755
--- a/scATAC-pro
+++ b/scATAC-pro
@@ -148,6 +148,15 @@ function help {
                          input: a bam file generated by scATAC-pro
                          output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as
                                  the input bam file)"
+
+    echo "      labelTransfer: label transfer (cell annotation) from scRNA-seq data
+                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data,                                and an optional gtf file for gene annotation, separated by a comma. 
+                         output: a updated seurat object added with the Predicted_Cell_Type as a metadat,
+                                 saved in the same directory as the input atac seurat object.
+                         Note: the cell annotation should be given as a metadata of the seurat object of
+                               scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction 
+                               done."
+
     echo "   -i|--input INPUT : input data, different types of input data are required for different analysis;"
     echo "   -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module"
     echo "   [-o|--output_dir : folder to save results, default 'output/' under current directory. sub-folder will be created automatically for each module"
@@ -250,7 +259,7 @@ fi
 
 ################################ check valid STEPs #####
 ############################
-AVAILABLE_STEP_ARRAY=("demplx_fastq" "trimming" "mapping" "mapping_qc" "aggr_signal" "call_peak" "recall_peak" "get_mtx" "qc_per_barcode" "call_cell" "get_bam4Cells" "clustering" "motif_analysis" "call_cnv" "runDA" "runGO" "runCicero" "split_bam" "footprint" "report" "process" "process_no_dex" "process_with_bam" "integrate" "downstream" "all" "integrate_seu" "integrate_mtx" "convert10xbam" "mergePeaks" "reConstMtx" "visualize" "addCB2bam" "rmDoublets")
+AVAILABLE_STEP_ARRAY=("demplx_fastq" "trimming" "mapping" "mapping_qc" "aggr_signal" "call_peak" "recall_peak" "get_mtx" "qc_per_barcode" "call_cell" "get_bam4Cells" "clustering" "motif_analysis" "call_cnv" "runDA" "runGO" "runCicero" "split_bam" "footprint" "report" "process" "process_no_dex" "process_with_bam" "integrate" "downstream" "all" "integrate_seu" "integrate_mtx" "convert10xbam" "mergePeaks" "reConstMtx" "visualize" "addCB2bam" "rmDoublets" "labelTransfer")
 
 
 check_s=0
diff --git a/scripts/Makefile b/scripts/Makefile
index 2310003..c13b50d 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -363,6 +363,16 @@ rmDoublets: config_check
 	@$(SCRIPTS)/rmDoublets.sh $(INPUT_FILE) $(CONFIG_FILE) $(CONFIG_SYS)
 	@echo 
 
+#######################################
+## labelTransfer
+#######################################
+labelTransfer: config_check
+	@echo "--------------------------------------------" 
+	@date 
+	@echo "label transfer from scRNA-seq..." 
+	@$(SCRIPTS)/labelTransfer.sh $(INPUT_FILE) $(CONFIG_FILE) $(CONFIG_SYS)
+	@echo 
+
 #######################################
 ## all
 #######################################
diff --git a/scripts/labelTransfer.sh b/scripts/labelTransfer.sh
new file mode 100755
index 0000000..73075dd
--- /dev/null
+++ b/scripts/labelTransfer.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+
+inputs=$1  ## seurat obj,and expeted doublet rate,
+ 
+# reading configure file
+curr_dir=`dirname $0`
+source ${curr_dir}/read_conf.sh
+read_conf "$2"
+read_conf "$3"
+
+
+inputs=(${inputs//,/ })
+seuratObj_atac_file=${inputs[0]}
+seuratObj_rna_file=${inputs[1]}
+gtf_file=${inputs[2]}
+
+curr_dir=`dirname $0`
+
+${R_PATH}/Rscript --vanilla ${curr_dir}/src/labelTransfer.R $seuratObj_atac_file $seuratObj_rna_file $GENOME_NAME $gtf_file
+
+echo "Transfer Cell_Type label from scRNA-seq done!"
+
diff --git a/scripts/src/dsAnalysis_utilities.R b/scripts/src/dsAnalysis_utilities.R
index 792bf2e..0d097f5 100644
--- a/scripts/src/dsAnalysis_utilities.R
+++ b/scripts/src/dsAnalysis_utilities.R
@@ -566,7 +566,7 @@ do_DA <- function(mtx_score, clusters, test = 'wilcox',
   }
   return(res)
 }
-
+do_DA = cmpfun(do_DA)
 
 #fg_genes: vector of forground genes
 #bg_genes: background genes
@@ -827,6 +827,7 @@ normalize_gene_activities.corrected <- function (activity_matrices, cell_num_gen
   }
   return(ret)
 }
+normalize_gene_activities.corrected = cmpfun(normalize_gene_activities.corrected)
 
 # do cicero given a Seurat object, output gene activity score
 doCicero_gascore <- function(seurat.obj, reduction = 'tsne', chr_sizes,
@@ -899,7 +900,7 @@ doCicero_gascore <- function(seurat.obj, reduction = 'tsne', chr_sizes,
   res = list('conns' = conns, 'ga_score' = cicero_gene_activities)
   return(res)  
 }
-
+doCicero_gascore = cmpfun(doCicero_gascore)
 
 # do cicero given a Seurat object, just return the connection 
 doCicero_conn <- function(seurat.obj, reduction = 'tsne', 
@@ -948,7 +949,7 @@ doCicero_conn <- function(seurat.obj, reduction = 'tsne',
   conns = conns[coaccess > coaccess_thr, ]
   return(conns)
 }
-
+doCicero_conn = cmpfun(doCicero_conn)
 
 ## change rowname of zscores (tf name) from chromvar to be readable
 readable_tf <- function(sele.zscores, GENOME_NAME){
@@ -1095,6 +1096,7 @@ runDiffMotifEnrich <- function(mtx_score, clusters, test = 'wilcox',
   }
   return(res)
 }
+runDiffMotifEnrich = cmpfun(runDiffMotifEnrich)
 
 ## mtx_objs: a list of matrix 
 run_integration <- function(mtx_list, integrate_by = 'VFACS',
@@ -1208,7 +1210,7 @@ run_integration <- function(mtx_list, integrate_by = 'VFACS',
   
   return(seurat.obj)
 }
-   
+run_integration = cmpfun(run_integration)   
 
 # Find doublets
 FindDoublets_Atac <- function(seurat.atac, PCs = 1:50, 
@@ -1258,4 +1260,75 @@ FindDoublets_Atac <- function(seurat.atac, PCs = 1:50,
   seurat.atac$Doublet_Singlet = seurat.rna$Doublet_Singlet
   return(seurat.atac)
 }
+FindDoublets_Atac = cmpfun(FindDoublets_Atac)
+
+# generate gene activity matrix for labelTransfer
+generate_gene_cisActivity <- function(gene_gtf, mtx, include_body = T,
+                                      dist_to_tss = 2000){
+  ## generating gene cis activity score
+  ## input: gene gtf file, and atac matrix file
+  
+  ## 1. select gene up/down-stream regions (promoter with/without gene_body) ##
+  
+  gene_ann = fread(gene_gtf, sep = '\t')
+  gene_ann = gene_ann[V3 == 'gene']
+  gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9]
+  gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name]
+  gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name]
+  names(gene_ann)[1] = 'chr'
+  gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name))
+  chrs = 1:22
+  chrs = c(chrs, 'X', 'Y')
+  gene_ann = gene_ann[chr %in% chrs]
+  gene_ann = gene_ann[!duplicated(gene_name)]
+  
+  
+  names(gene_ann)[2:4] = c('ss', 'ee', 'strand')
+  if(!include_body){
+    gene_ann[, 'start' := ifelse(strand == '+', ss - dist_to_tss, ee - dist_to_tss)]
+    gene_ann[, 'end' := ifelse(strand == '+', ss + dist_to_tss, ee + dist_to_tss)]
+  }else{
+    gene_ann[, 'start' := ifelse(strand == '+', ss - dist_to_tss, ss)]
+    gene_ann[, 'end' := ifelse(strand == '+', ee, ee + dist_to_tss)]
+    
+  }
+  
+  gene_ann[, 'chr' := paste0('chr', chr)]
+  
+  gene_ann = subset(gene_ann, select = c(chr, start, end, gene_name))
+  
+  
+  ## 2. read mtx file ##
+  
+  rnames = rownames(mtx)
+  chrs = sapply(rnames, function(x) unlist(strsplit(x, '-'))[1])
+  starts = sapply(rnames, function(x) unlist(strsplit(x, '-'))[2])
+  ends = sapply(rnames, function(x) unlist(strsplit(x, '-'))[3])
+  
+  peaks = data.table('chr' = chrs, 'start' = as.integer(starts), 
+                     'end' = as.integer(ends))
+  setkey(peaks, chr, start, end)
+  peaks[, 'pname' := paste(chr, start, end, sep = '-')]
+  over.ids = foverlaps(gene_ann, peaks, by.x = c('chr', 'start', 'end'),
+                       by.y = c('chr', 'start', 'end'), which = T)
+  over.ids[, 'gene_name' := gene_ann[xid, gene_name]]
+  over.ids[, 'pname' := peaks[yid, pname]]
+  over.ids = over.ids[complete.cases(over.ids)]
+  
+  
+  smtx = sparseMatrix(i = over.ids$xid, j = over.ids$yid,
+                      dimnames = list(gene_ann$gene_name[1:max(over.ids$xid)],
+                                      peaks$pname[1:max(over.ids$yid)]))
+  
+  mtx = mtx[rownames(mtx) %in% colnames(smtx), ]
+  smtx = smtx[, rownames(mtx)]
+  
+  activity.matrix = smtx %*% mtx
+  rs = Matrix::rowSums(activity.matrix)
+  activity.matrix = activity.matrix[rs > 10, ]
+  
+  return(activity.matrix)
+  
+}
 
+generate_gene_cisActivity = cmpfun(generate_gene_cisActivity)
\ No newline at end of file
diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R
new file mode 100644
index 0000000..375dccd
--- /dev/null
+++ b/scripts/src/labelTransfer.R
@@ -0,0 +1,98 @@
+source_local <- function(fname){
+  argv <- commandArgs(trailingOnly = FALSE)
+  base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
+  source(paste(base_dir, fname, sep="/"))
+}
+
+source_local('dsAnalysis_utilities.R')
+
+args = commandArgs(T)
+inputSeurat_atac = args[1] ## a seurat.rds file, with pca conducted and metadata Cell_Type 
+inputSeurat_rna = args[2] ## a seurat.rds file, with pca conducted
+GENOME_NAME = args[3]
+gene_gtf_file = args[4]
+
+## download gtf file if not provided
+if(!file.exists(gene_gtf_file)){
+  print('gene annotation gtf file not provided, I will try to download one:')
+  err = 0
+  if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
+    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp),
+                    error = function(e) {
+                      print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!")
+                      return(1)})
+  }
+  
+  if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
+    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp),
+                    error = function(e) {
+                      print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!")
+                      return(1)})
+  }
+  
+  if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
+    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp),
+                    error = function(e) {
+                      print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!")
+                      return(1)})
+  }
+  if(err == 1) stop('Download failed! Please provide a gtf file to run this module!')
+}
+
+seurat.rna = readRDS(inputSeurat_rna)
+seurat.atac = readRDS(inputSeurat_atac)
+
+atac.mtx = seurat.atac@assays$ATAC@counts
+rn = rownames(atac.mtx)
+rownames(atac.mtx) <- sapply(rn, function(x) unlist(strsplit(x, ','))[1])
+activity.matrix = generate_gene_cisActivity(gene_gtf = gene_gtf_file,
+                                            atac.mtx, 
+                                            include_body = T)
+
+seurat.atac[["ACTIVITY"]] <- CreateAssayObject(counts = activity.matrix)
+
+DefaultAssay(seurat.atac) <- "ACTIVITY"
+seurat.atac <- NormalizeData(seurat.atac)
+seurat.atac <- ScaleData(seurat.atac, features = rownames(seurat.atac))
+seurat.atac <- FindVariableFeatures(seurat.atac)
+
+DefaultAssay(seurat.atac) <- "ATAC"
+seurat.atac$tech = 'ATAC'
+seurat.rna$tech = 'RNA'
+
+## transfer label 
+genes4anchors = VariableFeatures(object = seurat.rna)
+#genes4anchors = NULL
+transfer.anchors <- FindTransferAnchors(reference = seurat.rna,
+                                        query = seurat.atac,
+                                        features = genes4anchors,
+                                        reference.assay = "RNA",
+                                        query.assay = "ACTIVITY",
+                                        reduction = "cca",
+                                        k.anchor = 5)
+
+
+celltype.predictions <- TransferData(anchorset = transfer.anchors,
+                                     refdata = seurat.rna$Cell_Type,
+                                     weight.reduction = seurat.atac[["pca"]],
+                                     dims = 1:ncol(seurat.atac[["pca"]]),
+                                     k.weight = 50)
+celltype.predictions = subset(celltype.predictions, select = c('predicted.id', 'prediction.score.max'))
+names(celltype.predictions)[1] = 'Predicted_Cell_Type'
+seurat.atac <- AddMetaData(seurat.atac, metadata = celltype.predictions)
+rm(transfer.anchors)
+
+p1 <- DimPlot(seurat.atac, group.by = "Predicted_Cell_Type",
+              label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") 
+
+
+seurat.atac[["ACTIVITY"]] <- NULL ## don't save activity assay
+
+outputPath = dirname(inputSeurat_atac)
+outputName = basename(inputSeurat_atac)
+saveRDS(seurat.atac, file = paste0(outputPath, '/updated_', outputName))
+
+ggsave(p1, filename = paste0(outputPath, '/umap_with_predicted_cell_type.eps'),
+       device = 'eps', height = 6, width = 7)
+
+

From 89ebe82727201317417be8c6b0c29de1398fd6c5 Mon Sep 17 00:00:00 2001
From: wbaopaul <yuw1@email.chop.edu>
Date: Fri, 25 Jun 2021 15:06:22 -0400
Subject: [PATCH 6/8] update readme for v1.4.0

---
 README.md                          | 24 ++++++++++++++----------
 complete_update_history.md         |  4 ++--
 scATAC-pro                         | 15 ++++++++-------
 scripts/src/dsAnalysis_utilities.R |  2 +-
 scripts/src/labelTransfer.R        | 10 +++++++---
 5 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 21de7d2..3c653dc 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@ Updates
 - Now provide [scATAC-pro tutorial in R](https://scatacpro-in-r.netlify.app/index.html) for access QC metrics and perform downstream analysis
 - Current version: 1.4.0
 - Recent updates
+    * *labelTransfer*: new module added, to do label trasfer (for cell annotation) from scRNA-seq annotation. First construct a gene by cell activity matrix, then use *FindTransferAnchors* and *TransferData* function from Seurat R package to predicted cell type annotation from the cell annotaiton in scRNA-seq data.
     * *rmDoublets*: new module added, to remove potential doublets using [DoubletFinder](https://github.com/chris-mcginnis-ucsf/DoubletFinder) algorithm.
     * *qc_per_barcode*: add tss enrichment score per cell into the QC metrics
     * *call_cell*: enable filtering barcodes with minimal tss enrichment score cutoff (parameter **min_tss_escore** in the updated [configure_user.txt](configure_user.txt) file)
@@ -146,7 +147,7 @@ Step by step guide to running scATAC-pro
 
 -   **IMPORTANT**: you can run scATAC-pro sequentially. The input of a later analysis module is the output of the previous analysis modules. The following tutorial uses fastq files downloaded from [PBMC10k 10X Genomics](https://support.10xgenomics.com/single-cell-atac/datasets/1.1.0/atac_v1_pbmc_10k?) 
 
--   *Combine data from different sequencing lanes*
+-   *Combine data from different sequencing lanes* (not needed for 10x genomics data since v1.1.4)
 
     $ cat pbmc_fastqs/atac_pbmc_10k_v1_S1_L001_R1_001.fastq.gz pbmc_fastqs/atac_pbmc_10k_v1_S1_L002_R1_001.fastq.gz > pe1_fastq.gz
 
@@ -158,11 +159,11 @@ Step by step guide to running scATAC-pro
 
 ```
     $ scATAC-pro -s demplx_fastq 
-                 -i pe1_fastq.gz,pe2_fastq.gz,index_fastq.gz 
+                 -i pe1_fastq.gz,pe2_fastq.gz,index_fastq.gz(,other_index_fastq.gz, ...) 
                  -c configure_user.txt 
-    # or
+    # or for 10x data
     $ scATAC-pro -s demplx_fastq 
-                 -i pbmc_fastqs/ 
+                 -i pbmc_10x_fastqs/ 
                  -c configure_user.txt 
 
     $ scATAC-pro -s trimming 
@@ -219,8 +220,8 @@ Step by step guide to running scATAC-pro
                  -i output/downstream_analysis/PEAK_CALLER/CELL_CALLER/cell_cluster_table.tsv
                  -c configure_user.txt
 
-    $ scATAC-pro -s footprint ## supporting comparison two clusters, and one-vs-rest
-                 -i 0,1  ## or '0,rest' (means cluster1 vs rest) or 'one,rest' (all one-vs-rest)
+    $ scATAC-pro -s footprint ## supporting comparison two groups of cell clusters, and one-vs-rest
+                 -i 0,1  ## or '0:3,1:2' (group1 consist of cluster0,3, and group2 for cluster1,2)) or 'one,rest' (all one-vs-rest comparison)
                  -c configure_user.txt
                  
     $ scATAC-pro -s runCicero
@@ -265,8 +266,10 @@ Step by step guide to running scATAC-pro
                  -i reconstructed_mtx_file1,reconstructed_mtx_file2,(reconstructed_mtx_file3...)   
                  -c configure_user.txt
 
+
     ## label transfer (cell annotation) from scRNA-seq
     ## cell annotated with metadata 'Cell_Type' in seurat obj of scRNA-seq data
+    ## the gtf_file is optional
     $ scATAC-pro -s labelTransfer
                  -i seurat_obj_atac.rds,seurat_obj_rna.rds(,gtf_file)   
                  -c configure_user.txt
@@ -437,10 +440,11 @@ Detailed Usage
                                  the input bam file)
       
           labelTransfer: label transfer (cell annotation) from scRNA-seq data
-                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data,                                and an optional gtf file for gene annotation, separated by a comma. 
-                         output: a updated seurat object added with the Predicted_Cell_Type as a metadat,
-                                 saved in the same directory as the input atac seurat object.
-                         Note: the cell annotation should be given as a metadata of the seurat object of
+                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format,                                   and an optional .gtf file for gene annotation, separated by a comma. 
+                         output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and
+                                 an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac
+                                 seurat object.
+                         *Note*: the cell annotation should be given as a metadata of the seurat object of
                                scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction 
                                done.
         
diff --git a/complete_update_history.md b/complete_update_history.md
index 2ed1945..2119089 100644
--- a/complete_update_history.md
+++ b/complete_update_history.md
@@ -1,6 +1,6 @@
 ## Complete Update History
-- Version 1.4.0 
-    * new module planned: labelTransfer from scRNA-seq
+- Version 1.4.0 released 
+    * new module added: labelTransfer (for cell annotation) from scRNA-seq
 - Version 1.3.1 released
     * *rmDoublets*: a new module added, to remove doublets
     * *clustering*: accepts seurat obj (in .rds format) as input as well
diff --git a/scATAC-pro b/scATAC-pro
index 31e777a..0175a62 100755
--- a/scATAC-pro
+++ b/scATAC-pro
@@ -149,13 +149,14 @@ function help {
                          output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as
                                  the input bam file)"
 
-    echo "      labelTransfer: label transfer (cell annotation) from scRNA-seq data
-                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data,                                and an optional gtf file for gene annotation, separated by a comma. 
-                         output: a updated seurat object added with the Predicted_Cell_Type as a metadat,
-                                 saved in the same directory as the input atac seurat object.
-                         Note: the cell annotation should be given as a metadata of the seurat object of
-                               scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction 
-                               done."
+   echo "       label transfer (cell annotation) from scRNA-seq data
+                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format,                                   and an optional .gtf file for gene annotation, separated by a comma.
+                         output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and
+                                 an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac
+                                 seurat object.
+                         *Note*: the cell annotation should be given as a metadata of the seurat object of
+                                 scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction
+                                 done."
 
     echo "   -i|--input INPUT : input data, different types of input data are required for different analysis;"
     echo "   -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module"
diff --git a/scripts/src/dsAnalysis_utilities.R b/scripts/src/dsAnalysis_utilities.R
index 0d097f5..f3fa66b 100644
--- a/scripts/src/dsAnalysis_utilities.R
+++ b/scripts/src/dsAnalysis_utilities.R
@@ -1331,4 +1331,4 @@ generate_gene_cisActivity <- function(gene_gtf, mtx, include_body = T,
   
 }
 
-generate_gene_cisActivity = cmpfun(generate_gene_cisActivity)
\ No newline at end of file
+generate_gene_cisActivity = cmpfun(generate_gene_cisActivity)
diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R
index 375dccd..c1f0050 100644
--- a/scripts/src/labelTransfer.R
+++ b/scripts/src/labelTransfer.R
@@ -83,8 +83,10 @@ seurat.atac <- AddMetaData(seurat.atac, metadata = celltype.predictions)
 rm(transfer.anchors)
 
 p1 <- DimPlot(seurat.atac, group.by = "Predicted_Cell_Type",
-              label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") 
+              label = TRUE, repel = TRUE) + ggtitle("scATAC-seq") 
 
+p2 <- DimPlot(seurat.rna, group.by = "Cell_Type",
+              label = TRUE, repel = TRUE) + ggtitle("scRNA-seq") 
 
 seurat.atac[["ACTIVITY"]] <- NULL ## don't save activity assay
 
@@ -92,7 +94,9 @@ outputPath = dirname(inputSeurat_atac)
 outputName = basename(inputSeurat_atac)
 saveRDS(seurat.atac, file = paste0(outputPath, '/updated_', outputName))
 
-ggsave(p1, filename = paste0(outputPath, '/umap_with_predicted_cell_type.eps'),
-       device = 'eps', height = 6, width = 7)
+pcomb = gridExtra::grid.arrange(p1, p2, nrow = 1)
+
+ggsave(pcomb, filename = paste0(outputPath, '/umap_with_predicted_cell_type.eps'),
+       device = 'eps', height = 6, width = 13)
 
 

From 217d296ab03b596ceaf08e405c8f1ac9a26a2261 Mon Sep 17 00:00:00 2001
From: wbaopaul <yuw1@email.chop.edu>
Date: Tue, 29 Jun 2021 11:31:57 -0400
Subject: [PATCH 7/8] test v1.4.0

---
 README.md                          |   3 +-
 scripts/src/dsAnalysis_utilities.R |  28 ++----
 scripts/src/labelTransfer.R        | 116 ++++++++++++++++++----
 scripts/src/scATAC-pro_report.Rmd  | 152 ++++++++++++++---------------
 4 files changed, 181 insertions(+), 118 deletions(-)

diff --git a/README.md b/README.md
index 3c653dc..ef28ddf 100644
--- a/README.md
+++ b/README.md
@@ -440,7 +440,8 @@ Detailed Usage
                                  the input bam file)
       
           labelTransfer: label transfer (cell annotation) from scRNA-seq data
-                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format,                                   and an optional .gtf file for gene annotation, separated by a comma. 
+                         input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format,
+                                and an optional .gtf file for gene annotation, separated by a comma. 
                          output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and
                                  an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac
                                  seurat object.
diff --git a/scripts/src/dsAnalysis_utilities.R b/scripts/src/dsAnalysis_utilities.R
index f3fa66b..ea68aaf 100644
--- a/scripts/src/dsAnalysis_utilities.R
+++ b/scripts/src/dsAnalysis_utilities.R
@@ -1263,38 +1263,24 @@ FindDoublets_Atac <- function(seurat.atac, PCs = 1:50,
 FindDoublets_Atac = cmpfun(FindDoublets_Atac)
 
 # generate gene activity matrix for labelTransfer
-generate_gene_cisActivity <- function(gene_gtf, mtx, include_body = T,
+generate_gene_cisActivity <- function(gene_ann, mtx, include_body = T,
                                       dist_to_tss = 2000){
   ## generating gene cis activity score
-  ## input: gene gtf file, and atac matrix file
+  ## input: gene_ann (data table with chr, gene_start, gene_end, strand, gene_name), 
+  ##        and atac matrix file
   
   ## 1. select gene up/down-stream regions (promoter with/without gene_body) ##
   
-  gene_ann = fread(gene_gtf, sep = '\t')
-  gene_ann = gene_ann[V3 == 'gene']
-  gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9]
-  gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name]
-  gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name]
-  names(gene_ann)[1] = 'chr'
-  gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name))
-  chrs = 1:22
-  chrs = c(chrs, 'X', 'Y')
-  gene_ann = gene_ann[chr %in% chrs]
-  gene_ann = gene_ann[!duplicated(gene_name)]
   
-  
-  names(gene_ann)[2:4] = c('ss', 'ee', 'strand')
   if(!include_body){
-    gene_ann[, 'start' := ifelse(strand == '+', ss - dist_to_tss, ee - dist_to_tss)]
-    gene_ann[, 'end' := ifelse(strand == '+', ss + dist_to_tss, ee + dist_to_tss)]
+    gene_ann[, 'start' := ifelse(strand == '+', gene_start - dist_to_tss, gene_end - dist_to_tss)]
+    gene_ann[, 'end' := ifelse(strand == '+', gene_start + dist_to_tss, gene_end + dist_to_tss)]
   }else{
-    gene_ann[, 'start' := ifelse(strand == '+', ss - dist_to_tss, ss)]
-    gene_ann[, 'end' := ifelse(strand == '+', ee, ee + dist_to_tss)]
+    gene_ann[, 'start' := ifelse(strand == '+', gene_start - dist_to_tss, gene_start)]
+    gene_ann[, 'end' := ifelse(strand == '+', gene_end, gene_end + dist_to_tss)]
     
   }
   
-  gene_ann[, 'chr' := paste0('chr', chr)]
-  
   gene_ann = subset(gene_ann, select = c(chr, start, end, gene_name))
   
   
diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R
index c1f0050..7ce81bf 100644
--- a/scripts/src/labelTransfer.R
+++ b/scripts/src/labelTransfer.R
@@ -12,31 +12,113 @@ inputSeurat_rna = args[2] ## a seurat.rds file, with pca conducted
 GENOME_NAME = args[3]
 gene_gtf_file = args[4]
 
-## download gtf file if not provided
+## if gtf file not provided, using R bioconductor packages for gene annotation
 if(!file.exists(gene_gtf_file)){
-  print('gene annotation gtf file not provided, I will try to download one:')
-  err = 0
+  if(!grepl(GENOME_NAME, pattern = 'hg19|hg38|mm10|mm9', ignore.case = T)){
+    stop('Genome is not belong to hg19,hg38,mm9 and mm10, 
+         please provide .gtf file for gene annotation!')
+  }
+  if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {
+    library(EnsDb.Mmusculus.v79)
+    ens.ann <- EnsDb.Mmusculus.v79
+  }
   if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
-    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp),
-                    error = function(e) {
-                      print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!")
-                      return(1)})
+    library(EnsDb.Mmusculus.v75)
+    ens.ann <- EnsDb.Mmusculus.v75
   }
   
   if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
-    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp),
-                    error = function(e) {
-                      print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!")
-                      return(1)})
+    library(EnsDb.Hsapiens.v86)
+    ens.ann <- EnsDb.Hsapiens.v86
   }
   
   if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
-    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp),
-                    error = function(e) {
-                      print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!")
-                      return(1)})
+    library(EnsDb.Hsapiens.v75)
+    ens.ann <- EnsDb.Hsapiens.v75
   }
-  if(err == 1) stop('Download failed! Please provide a gtf file to run this module!')
+  
+  gene_ann <- ensembldb::genes(ens.ann)
+  gene_ann <- keepStandardChromosomes(gene_ann, pruning.mode = 'coarse')
+  gene_ann = data.frame(gene_ann)
+  gene_ann = subset(gene_ann, gene_biotype %in% c('protein_coding', 'miRNA', 'lincRNA'),
+                    select = c('seqnames', 'start', 'end',
+                               'strand', 'gene_biotype', 'gene_name'))
+  
+  names(gene_ann)[1:3] = c('chr', 'gene_start', 'gene_end')
+  gene_ann = data.table(gene_ann)
+  gene_ann = gene_ann[!duplicated(gene_name)]
+  gene_ann$chr = paste0('chr', gene_ann$chr)
+  gene_ann = subset(gene_ann, select = c('chr', 'gene_start', 'gene_end',
+                                         'strand', 'gene_name'))
+  
+  gene_ann[chr == 'chrMT']$chr = 'chrM'
+  
+}else{
+  gene_ann = fread(gene_gtf_file, sep = '\t')
+  gene_ann = gene_ann[V3 == 'gene']
+  gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9]
+  gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name]
+  gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name]
+  names(gene_ann)[1] = 'chr'
+  gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name))
+  chrs = 1:22
+  chrs = c(chrs, 'X', 'Y', 'M')
+  gene_ann = gene_ann[chr %in% chrs]
+  gene_ann = gene_ann[!duplicated(gene_name)]
+  names(gene_ann)[2:4] = c('gene_start', 'gene_end', 'strand')
+  gene_ann[, 'chr' := paste0('chr', chr)]
+  
+}
+
+
+if(F){
+  ## download gtf file if not provided
+  if(!file.exists(gene_gtf_file)){
+    print('gene annotation gtf file not provided, I will try to download one:')
+    err = 0
+    if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
+      err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp),
+                      error = function(e) {
+                        print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!")
+                        return(1)})
+    }
+    if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {
+      err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp),
+                             error = function(e) {
+                               print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz!")
+                               return(1)})
+    }
+    if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
+      err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp),
+                      error = function(e) {
+                        print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!")
+                        return(1)})
+    }
+    
+    if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
+      err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp),
+                      error = function(e) {
+                        print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!")
+                        return(1)})
+    }
+    
+    if(err == 1) stop('Download failed! Please provide a gtf file to run this module!')
+  }
+  
+  gene_ann = fread(gene_gtf_file, sep = '\t')
+  gene_ann = gene_ann[V3 == 'gene']
+  gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9]
+  gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name]
+  gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name]
+  names(gene_ann)[1] = 'chr'
+  gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name))
+  chrs = 1:22
+  chrs = c(chrs, 'X', 'Y')
+  gene_ann = gene_ann[chr %in% chrs]
+  gene_ann = gene_ann[!duplicated(gene_name)]
+  names(gene_ann)[2:4] = c('gene_start', 'gene_end', 'strand')
+  gene_ann[, 'chr' := paste0('chr', chr)]
+  
 }
 
 seurat.rna = readRDS(inputSeurat_rna)
@@ -45,7 +127,7 @@ seurat.atac = readRDS(inputSeurat_atac)
 atac.mtx = seurat.atac@assays$ATAC@counts
 rn = rownames(atac.mtx)
 rownames(atac.mtx) <- sapply(rn, function(x) unlist(strsplit(x, ','))[1])
-activity.matrix = generate_gene_cisActivity(gene_gtf = gene_gtf_file,
+activity.matrix = generate_gene_cisActivity(gene_ann = gene_ann,
                                             atac.mtx, 
                                             include_body = T)
 
diff --git a/scripts/src/scATAC-pro_report.Rmd b/scripts/src/scATAC-pro_report.Rmd
index fbc2f4c..13cec8a 100644
--- a/scripts/src/scATAC-pro_report.Rmd
+++ b/scripts/src/scATAC-pro_report.Rmd
@@ -710,94 +710,88 @@ if(file.exists(footprint_stats.file)){
 ### Predicted Interactions at a locus of interest
 ```{r, out.width='\\textwidth'}
 cicero_conn.file = paste0(down.dir, '/cicero_interactions.txt')
-err = 0
 if(Cicero_Plot_Region != 'none' & file.exists(cicero_conn.file)){
  conns = fread(cicero_conn.file)
  temp <- tempfile()
-if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {
-    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp),
-    error = function(e) {
-                    print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz!")
-      return(1)})
-}
-
-if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
-    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp),
-    error = function(e) {
-                    print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!")
-      return(1)})
-}
-
-if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
-    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp),
-    error = function(e) {
-                    print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!")
-      return(1)})
-}
-
-if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
-    err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp),
-    error = function(e) {
-                    print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!")
-      return(1)})
-}
+  if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {
+    library(EnsDb.Mmusculus.v79)
+    ens.ann <- EnsDb.Mmusculus.v79
+  }
+  if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
+    library(EnsDb.Mmusculus.v75)
+    ens.ann <- EnsDb.Mmusculus.v75
+  }
+  
+  if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
+    library(EnsDb.Hsapiens.v86)
+    ens.ann <- EnsDb.Hsapiens.v86
+  }
+  
+  if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
+    library(EnsDb.Hsapiens.v75)
+    ens.ann <- EnsDb.Hsapiens.v75
+  }
+  
+  gene_anno <- transcripts(ens.ann,
+                          columns = c("gene_id", "gene_name",  
+                                      "tx_biotype", "tx_id"))
  
-if(err == 0){
-    gene_anno <- rtracklayer::readGFF(temp)
-     unlink(temp)
-     
-     # rename some columns to match requirements
-     gene_anno$chromosome <- paste0("chr", gene_anno$seqid)
-     gene_anno$gene <- gene_anno$gene_id
-     gene_anno$transcript <- gene_anno$transcript_id
-     gene_anno$symbol <- gene_anno$gene_name
-     gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, 
-                                              transcript, gene, symbol))
-     gene_anno = gene_anno[complete.cases(gene_anno), ]
-     if(Cicero_Plot_Region %in% gene_anno$symbol){
-        tmp_anno = gene_anno[gene_anno$symbol == Cicero_Plot_Region, ]
-        chr0 = tmp_anno$chr[1]
-        start0 = min(tmp_anno$start)
-        end0 = max(tmp_anno$end)
-     }else{
-        chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000
-        region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2]
-        start0 = as.integer(unlist(strsplit(region0, '-'))[1])
-        end0 = as.integer(unlist(strsplit(region0, '-'))[2])
-     }
-
-    if(end0 < start0 + 100000){
-        start0 = floor((start0+end0)/2) - 50000
-        end0 = start0 + 100000
-    }
-     conns[, 'chr1' := unlist(strsplit(Peak1, '_'))[1], by = Peak1] 
-     conns[, 'start1' := as.integer(unlist(strsplit(Peak1, '_'))[2]), by = Peak1] 
-     conns[, 'end2' := as.integer(unlist(strsplit(Peak2, '_'))[3]), by = Peak2] 
-    conns = conns[start1 <= end2]
-    conns0 = conns[chr1==chr0 & start1 >= start0 & end2 <= end0]
-
-    conns[, c('chr1', 'start1', 'end2') := NULL]
-    conns = data.frame(conns)
-    if(nrow(conns0)==0) cat('No interaction found in the griven region!')
-
-     if(plotEPS & nrow(conns0) >0 ){
-        pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans')
-        plot_connections(conns, chr0, start0, end0,
-                     gene_model = gene_anno, 
-                     coaccess_cutoff = .3, 
-                     connection_width = 1, 
-                     collapseTranscripts = "longest",
-                     viewpoint_alpha = 0)
-       dev.off()
-      
-     }
-      if(nrow(conns0) > 0)  plot_connections(conns, chr0, start0, end0,
+  gene_anno = keepStandardChromosomes(gene_anno,pruning.mode = 'coarse')
+ 
+  gene_anno = data.table(data.frame(gene_anno))
+  gene_anno = subset(gene_anno, tx_biotype %in% c('protein_coding', 'miRNA', 'lincRNA'), select = c('seqnames', 'start', 'end', 'tx_id',
+                       'strand', 'gene_id', 'gene_name'))
+  #rename some columns to match requirements
+  names(gene_anno)[c(1, 4, 6:7)] = c('chromosome', 'transcript', 'gene', 'symbol')
+  gene_anno$chromosome = paste0('chr', gene_anno$chromosome)
+
+  gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, 
+                                           transcript, gene, symbol))
+  gene_anno = gene_anno[complete.cases(gene_anno), ]
+  if(Cicero_Plot_Region %in% gene_anno$symbol){
+    tmp_anno = gene_anno[gene_anno$symbol == Cicero_Plot_Region, ]
+    chr0 = tmp_anno$chr[1]
+    start0 = min(tmp_anno$start)
+    end0 = max(tmp_anno$end)
+  }else{
+    chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000
+    region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2]
+    start0 = as.integer(unlist(strsplit(region0, '-'))[1])
+    end0 = as.integer(unlist(strsplit(region0, '-'))[2])
+  }
+  
+  if(end0 < start0 + 100000){
+    start0 = floor((start0+end0)/2) - 50000
+    end0 = start0 + 100000
+  }
+  conns[, 'chr1' := unlist(strsplit(Peak1, '_'))[1], by = Peak1] 
+  conns[, 'start1' := as.integer(unlist(strsplit(Peak1, '_'))[2]), by = Peak1] 
+  conns[, 'end2' := as.integer(unlist(strsplit(Peak2, '_'))[3]), by = Peak2] 
+  conns = conns[start1 <= end2]
+  conns0 = conns[chr1==chr0 & start1 >= start0 & end2 <= end0]
+  
+  conns[, c('chr1', 'start1', 'end2') := NULL]
+  conns = data.frame(conns)
+  if(nrow(conns0)==0) cat('No interaction found in the griven region!')
+  
+  if(plotEPS & nrow(conns0) >0 ){
+    pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans')
+    plot_connections(conns, chr0, start0, end0,
                      gene_model = gene_anno, 
                      coaccess_cutoff = .3, 
                      connection_width = 1, 
                      collapseTranscripts = "longest",
                      viewpoint_alpha = 0)
-    }
+    dev.off()
+    
+  }
+  if(nrow(conns0) > 0)  plot_connections(conns, chr0, start0, end0,
+                                         gene_model = gene_anno, 
+                                         coaccess_cutoff = .3, 
+                                         connection_width = 1, 
+                                         collapseTranscripts = "longest",
+                                         viewpoint_alpha = 0)
 }
 
+
 ```

From 33fcc1e62609406f9ba51ae1645ae943b01661e6 Mon Sep 17 00:00:00 2001
From: wbaopaul <yuw1@email.chop.edu>
Date: Tue, 29 Jun 2021 11:52:00 -0400
Subject: [PATCH 8/8] tested v1.4.0

---
 README.md                           | 2 +-
 scripts/install/install_Rpackages.R | 2 +-
 scripts/src/labelTransfer.R         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index ef28ddf..7654fc1 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ Dependencies
 -   trim\_galore (&gt;=0.6.3), Trimmomatic (&gt;=0.6.3)
 -   Regulratory Genomics Toolbox (RGT, for footprinting analysis)
 -   g++ compiler, bzip2, ncurses-devel
--   R packaages: devtools, flexdashboard, png, data.table, Matirx, Rcpp, ggplot2, flexmix, optparse, magrittr, readr, Seurat, bedr, gridExtra, ggrepel, kableExtra, viridis, xlsx, RColorBrewer,pheatmap,motifmatchr, chromVAR, chromVARmotifs, SummarizedExperiment, BiocParallel, DESeq2, clusterProfiler, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Mmusculus.UCSC.mm10, VisCello.atac
+-   R packaages: devtools, flexdashboard, png, data.table, Matirx, Rcpp, ggplot2, flexmix, optparse, magrittr, readr, Seurat, bedr, gridExtra, ggrepel, kableExtra, viridis, xlsx, RColorBrewer,pheatmap,motifmatchr, chromVAR, chromVARmotifs, SummarizedExperiment, BiocParallel, DESeq2, clusterProfiler, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Mmusculus.UCSC.mm10, EnsDb.Hsapiens.v86, EnsDb.Mmusculus.v79, VisCello.atac
 
 Quick start guide
 -----------
diff --git a/scripts/install/install_Rpackages.R b/scripts/install/install_Rpackages.R
index 9b91bff..7955ec4 100644
--- a/scripts/install/install_Rpackages.R
+++ b/scripts/install/install_Rpackages.R
@@ -18,7 +18,7 @@ for(pk in pks){
 }
 
 bioc.pks = c('RColorBrewer','pheatmap','motifmatchr', 'chromVAR', 'SummarizedExperiment', 'BiocParallel', 'DESeq2', 'edgeR', 'matrixStats', 'cicero', 'farver', 'BSgenome.Hsapiens.UCSC.hg38', 'BSgenome.Mmusculus.UCSC.mm10',  'clusterProfiler',
- 'DropletUtils')
+ 'DropletUtils', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79')
 
 for(pk in bioc.pks){
     if(!require(pk, character.only = T)) {  
diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R
index 7ce81bf..69ec8fd 100644
--- a/scripts/src/labelTransfer.R
+++ b/scripts/src/labelTransfer.R
@@ -15,7 +15,7 @@ gene_gtf_file = args[4]
 ## if gtf file not provided, using R bioconductor packages for gene annotation
 if(!file.exists(gene_gtf_file)){
   if(!grepl(GENOME_NAME, pattern = 'hg19|hg38|mm10|mm9', ignore.case = T)){
-    stop('Genome is not belong to hg19,hg38,mm9 and mm10, 
+    stop('Genome is not belong to any of hg19,hg38,mm9 or mm10, 
          please provide .gtf file for gene annotation!')
   }
   if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {