From 7e4412f09ff9cc8dbe4b466ccade4df8de5657d7 Mon Sep 17 00:00:00 2001 From: wbaopaul Date: Wed, 26 May 2021 09:45:39 -0400 Subject: [PATCH 1/8] v1.4.0 planned --- README.md | 6 +++--- complete_update_history.md | 2 ++ scATAC-pro | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index dcf9363..b6013e4 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Installation ------------ - Note: It is not necessary to install scATAC-pro from scratch. You can use the docker or singularity version if you prefer (see [Run scATAC-pro through docker or singularity](#run-scATAC-pro-through-docker-or-singularity) ) -- Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.3.1 +- Run the following command in your terminal, scATAC-pro will be installed in YOUR\_INSTALL\_PATH/scATAC-pro\_1.4.0 @@ -49,7 +49,7 @@ Installation Updates ------------ - Now provide [scATAC-pro tutorial in R](https://scatacpro-in-r.netlify.app/index.html) for access QC metrics and perform downstream analysis -- Current version: 1.3.1 +- Current version: 1.4.0 - Recent updates * *rmDoublets*: new module added, to remove potential doublets using [DoubletFinder](https://github.com/chris-mcginnis-ucsf/DoubletFinder) algorithm. * *qc_per_barcode*: add tss enrichment score per cell into the QC metrics @@ -292,7 +292,7 @@ Detailed Usage usage : scATAC-pro -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] Use option -h|--help for more information - scATAC-pro 1.3.1 + scATAC-pro 1.4.0 --------------- OPTIONS diff --git a/complete_update_history.md b/complete_update_history.md index 991502a..d8b5374 100644 --- a/complete_update_history.md +++ b/complete_update_history.md @@ -1,4 +1,6 @@ ## Complete Update History +- Version 1.4.0 + * new module planned: cellAnnotate from singleR or from scRNA-seq - Version 1.3.1 released * *rmDoublets*: a new module added, to remove doublets * *clustering*: accepts seurat obj (in .rds format) as input as well diff --git a/scATAC-pro b/scATAC-pro index 0b43c00..77055a3 100755 --- a/scATAC-pro +++ b/scATAC-pro @@ -9,7 +9,7 @@ ######################### SOFT="scATAC-pro" -VERSION="1.3.1" +VERSION="1.4.0" function usage { echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]" From 979f7d1704982f5c5a3e3308992c107d93b0ece6 Mon Sep 17 00:00:00 2001 From: wbaopaul Date: Wed, 26 May 2021 09:54:17 -0400 Subject: [PATCH 2/8] v1.4.0 planned --- scATAC-pro | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scATAC-pro b/scATAC-pro index 77055a3..0b43c00 100755 --- a/scATAC-pro +++ b/scATAC-pro @@ -9,7 +9,7 @@ ######################### SOFT="scATAC-pro" -VERSION="1.4.0" +VERSION="1.3.1" function usage { echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]" From b1d474ce48cb36097cd9e63cedae02207a3b16de Mon Sep 17 00:00:00 2001 From: wbaopaul Date: Thu, 10 Jun 2021 09:48:50 -0400 Subject: [PATCH 3/8] make add qc metrics to seurat obj optional --- scripts/src/clustering.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/src/clustering.R b/scripts/src/clustering.R index c2e182e..0b36ea9 100644 --- a/scripts/src/clustering.R +++ b/scripts/src/clustering.R @@ -49,15 +49,16 @@ seurat.obj = runSeurat_Atac(mtx, npc = nREDUCTION, norm_by = norm_by, top_variable_features = top_variable_features, reg.var = 'nCount_ATAC') ## add qc stat to each cell -qc_singlecell = fread(qc_stat_file) -qc_singlecell = qc_singlecell[bc %in% colnames(seurat.obj)] -qc_singlecell = data.frame(qc_singlecell) -rownames(qc_singlecell) = qc_singlecell$bc -qc_singlecell$bc = NULL -names(qc_singlecell) = c("total.unique.frags", "frac.mito", "frac.peak", - "frac.promoter", "frac.tss", "frac.enhancer", "tss_enrich_score") -seurat.obj <- AddMetaData(seurat.obj, metadata = qc_singlecell) - +if(file.exists(qc_stat_file)){ + qc_singlecell = fread(qc_stat_file) + qc_singlecell = qc_singlecell[bc %in% colnames(seurat.obj)] + qc_singlecell = data.frame(qc_singlecell) + rownames(qc_singlecell) = qc_singlecell$bc + qc_singlecell$bc = NULL + names(qc_singlecell) = c("total.unique.frags", "frac.mito", "frac.peak", + "frac.promoter", "frac.tss", "frac.enhancer", "tss_enrich_score") + seurat.obj <- AddMetaData(seurat.obj, metadata = qc_singlecell) +} From 2e0aa7c253c08b75cc632753446279fee5258f6c Mon Sep 17 00:00:00 2001 From: wbaopaul Date: Thu, 24 Jun 2021 09:52:09 -0400 Subject: [PATCH 4/8] make promoter/enhance ann optional --- configure_user.txt | 6 +- scATAC-pro | 2 +- scripts/src/get_qc_per_barcode.R | 68 ++++++++++----- scripts/src/scATAC-pro_report.Rmd | 138 ++++++++++++++++-------------- 4 files changed, 126 insertions(+), 88 deletions(-) diff --git a/configure_user.txt b/configure_user.txt index a39ffec..a5cbe84 100644 --- a/configure_user.txt +++ b/configure_user.txt @@ -147,8 +147,10 @@ pvalue_fp = 0.05 ## cicero cis-interaction ## ########################################################################### RUN_Cicero = TRUE -## plot interactions within Cicero_Plot_Region on the summary report -Cicero_Plot_Region = chr5:140610000-140640000 ## you can also specify a gene name + +## plot interactions within Cicero_Plot_Region on the summary report +## you can also specify it as a genomic location, a gene name or none (skip the plot) +Cicero_Plot_Region = chr5:140610000-140640000 ############################################################################ diff --git a/scATAC-pro b/scATAC-pro index 0b43c00..77055a3 100755 --- a/scATAC-pro +++ b/scATAC-pro @@ -9,7 +9,7 @@ ######################### SOFT="scATAC-pro" -VERSION="1.3.1" +VERSION="1.4.0" function usage { echo -e "usage : $SOFT -s STEP -i INPUT -c CONFIG [-o] [-h] [-v] [-b]" diff --git a/scripts/src/get_qc_per_barcode.R b/scripts/src/get_qc_per_barcode.R index 308d6c7..48a951e 100644 --- a/scripts/src/get_qc_per_barcode.R +++ b/scripts/src/get_qc_per_barcode.R @@ -109,6 +109,7 @@ sourceCpp(code=' ) + args = commandArgs(T) frags.file = args[1] peaks.file = args[2] @@ -128,17 +129,23 @@ frags = frags[!grepl(chr, pattern = 'random', ignore.case = T)] frags = frags[!grepl(chr, pattern ='un', ignore.case = T)] peaks = fread(peaks.file, select=1:3, header = F) -promoters = fread(promoters.file, select=1:3, header = F) tss = fread(tss.file, select=1:3, header = F) -enhs = fread(enhs.file, select=1:3, header = F) -names(peaks) = names(promoters) = names(tss) = - names(enhs) = c('chr', 'start', 'end') +names(peaks) = names(tss) = c('chr', 'start', 'end') +## promoters.file and enhs.file are optional +if(file.exists(promoters.file)) { + promoters = fread(promoters.file, select=1:3, header = F) + names(promoters) = c('chr', 'start', 'end') + setkey(promoters, chr, start) +} +if(file.exists(enhs.file)) { + enhs = fread(enhs.file, select=1:3, header = F) + names(enhs) = c('chr', 'start', 'end') + setkey(enhs, chr, start) +} setkey(peaks, chr, start) -setkey(promoters, chr, start) setkey(tss, chr, start) -setkey(enhs, chr, start) chrs = unique(frags$chr) @@ -187,10 +194,8 @@ tss[, 'end' := end + 1000] fragsInRegion = NULL for(chr0 in chrs){ peaks0 = peaks[chr == chr0] - promoters0 = promoters[chr == chr0] tss0 = tss[chr == chr0] - enhs0 = enhs[chr == chr0] frags0 = frags[chr == chr0] frags = frags[chr != chr0] if(nrow(peaks0) == 0){ @@ -199,23 +204,30 @@ for(chr0 in chrs){ frags0[, 'peaks' := getOverlaps_read2AnyRegion(frags0, peaks0)] } - if(nrow(promoters0) == 0){ - frags0[, 'promoters' := 0] - }else{ - frags0[, 'promoters' := getOverlaps_read2AnyRegion(frags0, promoters0)] - } - if(nrow(tss0) == 0){ frags0[, 'tss' := 0] }else{ frags0[, 'tss' := getOverlaps_read2AnyRegion(frags0, tss0)] } - - if(nrow(enhs0) == 0){ - frags0[, 'enhs' := 0] - }else{ - frags0[, 'enhs' := getOverlaps_read2AnyRegion(frags0, enhs0)] + + if(file.exists(promoters.file)) { + promoters0 = promoters[chr == chr0] + if(nrow(promoters0) == 0){ + frags0[, 'promoters' := 0] + }else{ + frags0[, 'promoters' := getOverlaps_read2AnyRegion(frags0, promoters0)] + } } + + + if(file.exists(enhs.file)){ + enhs0 = enhs[chr == chr0] + if(nrow(enhs0) == 0){ + frags0[, 'enhs' := 0] + }else{ + frags0[, 'enhs' := getOverlaps_read2AnyRegion(frags0, enhs0)] + } + } fragsInRegion = rbind(fragsInRegion, frags0) @@ -230,12 +242,22 @@ fragsInRegion[, 'frac_mito' := sum(isMito)/total_frags, by = bc] fragsInRegion[, 'isMito' := NULL] fragsInRegion[, 'frac_peak' := sum(peaks)/total_frags, by = bc] fragsInRegion[, 'peaks' := NULL] -fragsInRegion[, 'frac_promoter' := sum(promoters)/total_frags, by = bc] -fragsInRegion[, 'promoters' := NULL] fragsInRegion[, 'frac_tss' := sum(tss)/total_frags, by = bc] fragsInRegion[, 'tss' := NULL] -fragsInRegion[, 'frac_enhancer' := sum(enhs)/total_frags, by = bc] -fragsInRegion[, 'enhs' := NULL] + +if(file.exists(promoters.file)) { + fragsInRegion[, 'frac_promoter' := sum(promoters)/total_frags, by = bc] + fragsInRegion[, 'promoters' := NULL] +}else{ + fragsInRegion[, 'frac_promoter' := 0] +} + +if(file.exists(enhs.file)) { + fragsInRegion[, 'frac_enhancer' := sum(enhs)/total_frags, by = bc] + fragsInRegion[, 'enhs' := NULL] +}else{ + fragsInRegion[, 'frac_enhancer' := 0] +} fragsInRegion = unique(fragsInRegion) fragsInRegion$tss_enrich_score = escores[fragsInRegion$bc] diff --git a/scripts/src/scATAC-pro_report.Rmd b/scripts/src/scATAC-pro_report.Rmd index cc9398a..fbc2f4c 100644 --- a/scripts/src/scATAC-pro_report.Rmd +++ b/scripts/src/scATAC-pro_report.Rmd @@ -710,80 +710,94 @@ if(file.exists(footprint_stats.file)){ ### Predicted Interactions at a locus of interest ```{r, out.width='\\textwidth'} cicero_conn.file = paste0(down.dir, '/cicero_interactions.txt') - -if(file.exists(cicero_conn.file)){ +err = 0 +if(Cicero_Plot_Region != 'none' & file.exists(cicero_conn.file)){ conns = fread(cicero_conn.file) temp <- tempfile() if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) { - download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp) - } + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz!") + return(1)}) +} if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) { - download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp) - } + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!") + return(1)}) +} if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) { - download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp) + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!") + return(1)}) } if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) { - download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp) + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!") + return(1)}) } - -gene_anno <- rtracklayer::readGFF(temp) - unlink(temp) - # rename some columns to match requirements - gene_anno$chromosome <- paste0("chr", gene_anno$seqid) - gene_anno$gene <- gene_anno$gene_id - gene_anno$transcript <- gene_anno$transcript_id - gene_anno$symbol <- gene_anno$gene_name - gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, - transcript, gene, symbol)) - gene_anno = gene_anno[complete.cases(gene_anno), ] - if(Cicero_Plot_Region %in% gene_anno$symbol){ - tmp_anno = gene_anno[gene_anno$symbol == Cicero_Plot_Region, ] - chr0 = tmp_anno$chr[1] - start0 = min(tmp_anno$start) - end0 = max(tmp_anno$end) - }else{ - chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000 - region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2] - start0 = as.integer(unlist(strsplit(region0, '-'))[1]) - end0 = as.integer(unlist(strsplit(region0, '-'))[2]) - } - -if(end0 < start0 + 100000){ - start0 = floor((start0+end0)/2) - 50000 - end0 = start0 + 100000 -} - conns[, 'chr1' := unlist(strsplit(Peak1, '_'))[1], by = Peak1] - conns[, 'start1' := as.integer(unlist(strsplit(Peak1, '_'))[2]), by = Peak1] - conns[, 'end2' := as.integer(unlist(strsplit(Peak2, '_'))[3]), by = Peak2] -conns = conns[start1 <= end2] -conns0 = conns[chr1==chr0 & start1 >= start0 & end2 <= end0] - -conns[, c('chr1', 'start1', 'end2') := NULL] -conns = data.frame(conns) -if(nrow(conns0)==0) cat('No interaction found in the griven region!') - - if(plotEPS & nrow(conns0) >0 ){ - pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans') - plot_connections(conns, chr0, start0, end0, - gene_model = gene_anno, - coaccess_cutoff = .3, - connection_width = 1, - collapseTranscripts = "longest", - viewpoint_alpha = 0) - dev.off() - - } - if(nrow(conns0) > 0) plot_connections(conns, chr0, start0, end0, - gene_model = gene_anno, - coaccess_cutoff = .3, - connection_width = 1, - collapseTranscripts = "longest", - viewpoint_alpha = 0) +if(err == 0){ + gene_anno <- rtracklayer::readGFF(temp) + unlink(temp) + + # rename some columns to match requirements + gene_anno$chromosome <- paste0("chr", gene_anno$seqid) + gene_anno$gene <- gene_anno$gene_id + gene_anno$transcript <- gene_anno$transcript_id + gene_anno$symbol <- gene_anno$gene_name + gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, + transcript, gene, symbol)) + gene_anno = gene_anno[complete.cases(gene_anno), ] + if(Cicero_Plot_Region %in% gene_anno$symbol){ + tmp_anno = gene_anno[gene_anno$symbol == Cicero_Plot_Region, ] + chr0 = tmp_anno$chr[1] + start0 = min(tmp_anno$start) + end0 = max(tmp_anno$end) + }else{ + chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000 + region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2] + start0 = as.integer(unlist(strsplit(region0, '-'))[1]) + end0 = as.integer(unlist(strsplit(region0, '-'))[2]) + } + + if(end0 < start0 + 100000){ + start0 = floor((start0+end0)/2) - 50000 + end0 = start0 + 100000 + } + conns[, 'chr1' := unlist(strsplit(Peak1, '_'))[1], by = Peak1] + conns[, 'start1' := as.integer(unlist(strsplit(Peak1, '_'))[2]), by = Peak1] + conns[, 'end2' := as.integer(unlist(strsplit(Peak2, '_'))[3]), by = Peak2] + conns = conns[start1 <= end2] + conns0 = conns[chr1==chr0 & start1 >= start0 & end2 <= end0] + + conns[, c('chr1', 'start1', 'end2') := NULL] + conns = data.frame(conns) + if(nrow(conns0)==0) cat('No interaction found in the griven region!') + + if(plotEPS & nrow(conns0) >0 ){ + pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans') + plot_connections(conns, chr0, start0, end0, + gene_model = gene_anno, + coaccess_cutoff = .3, + connection_width = 1, + collapseTranscripts = "longest", + viewpoint_alpha = 0) + dev.off() + + } + if(nrow(conns0) > 0) plot_connections(conns, chr0, start0, end0, + gene_model = gene_anno, + coaccess_cutoff = .3, + connection_width = 1, + collapseTranscripts = "longest", + viewpoint_alpha = 0) + } } ``` From f1d10500dc8f0ae275cadaab8305167b7f3429c0 Mon Sep 17 00:00:00 2001 From: wbaopaul Date: Thu, 24 Jun 2021 23:14:50 -0400 Subject: [PATCH 5/8] add labelTransfer module for cell annotation --- README.md | 16 ++++- complete_update_history.md | 2 +- scATAC-pro | 11 +++- scripts/Makefile | 10 +++ scripts/labelTransfer.sh | 23 +++++++ scripts/src/dsAnalysis_utilities.R | 81 ++++++++++++++++++++++-- scripts/src/labelTransfer.R | 98 ++++++++++++++++++++++++++++++ 7 files changed, 234 insertions(+), 7 deletions(-) create mode 100755 scripts/labelTransfer.sh create mode 100644 scripts/src/labelTransfer.R diff --git a/README.md b/README.md index b6013e4..21de7d2 100644 --- a/README.md +++ b/README.md @@ -264,6 +264,12 @@ Step by step guide to running scATAC-pro $ scATAC-pro -s integrate_mtx -i reconstructed_mtx_file1,reconstructed_mtx_file2,(reconstructed_mtx_file3...) -c configure_user.txt + + ## label transfer (cell annotation) from scRNA-seq + ## cell annotated with metadata 'Cell_Type' in seurat obj of scRNA-seq data + $ scATAC-pro -s labelTransfer + -i seurat_obj_atac.rds,seurat_obj_rna.rds(,gtf_file) + -c configure_user.txt ``` @@ -429,7 +435,15 @@ Detailed Usage input: a bam file generated by scATAC-pro output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as the input bam file) - + + labelTransfer: label transfer (cell annotation) from scRNA-seq data + input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data, and an optional gtf file for gene annotation, separated by a comma. + output: a updated seurat object added with the Predicted_Cell_Type as a metadat, + saved in the same directory as the input atac seurat object. + Note: the cell annotation should be given as a metadata of the seurat object of + scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction + done. + -i|--input INPUT : input data, different types of input data are required for different analysis -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module [-o|--output_dir : folder to save results, default output/ under the current directory; sub-folder will be created automatically for each analysis diff --git a/complete_update_history.md b/complete_update_history.md index d8b5374..2ed1945 100644 --- a/complete_update_history.md +++ b/complete_update_history.md @@ -1,6 +1,6 @@ ## Complete Update History - Version 1.4.0 - * new module planned: cellAnnotate from singleR or from scRNA-seq + * new module planned: labelTransfer from scRNA-seq - Version 1.3.1 released * *rmDoublets*: a new module added, to remove doublets * *clustering*: accepts seurat obj (in .rds format) as input as well diff --git a/scATAC-pro b/scATAC-pro index 77055a3..31e777a 100755 --- a/scATAC-pro +++ b/scATAC-pro @@ -148,6 +148,15 @@ function help { input: a bam file generated by scATAC-pro output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as the input bam file)" + + echo " labelTransfer: label transfer (cell annotation) from scRNA-seq data + input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data, and an optional gtf file for gene annotation, separated by a comma. + output: a updated seurat object added with the Predicted_Cell_Type as a metadat, + saved in the same directory as the input atac seurat object. + Note: the cell annotation should be given as a metadata of the seurat object of + scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction + done." + echo " -i|--input INPUT : input data, different types of input data are required for different analysis;" echo " -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module" echo " [-o|--output_dir : folder to save results, default 'output/' under current directory. sub-folder will be created automatically for each module" @@ -250,7 +259,7 @@ fi ################################ check valid STEPs ##### ############################ -AVAILABLE_STEP_ARRAY=("demplx_fastq" "trimming" "mapping" "mapping_qc" "aggr_signal" "call_peak" "recall_peak" "get_mtx" "qc_per_barcode" "call_cell" "get_bam4Cells" "clustering" "motif_analysis" "call_cnv" "runDA" "runGO" "runCicero" "split_bam" "footprint" "report" "process" "process_no_dex" "process_with_bam" "integrate" "downstream" "all" "integrate_seu" "integrate_mtx" "convert10xbam" "mergePeaks" "reConstMtx" "visualize" "addCB2bam" "rmDoublets") +AVAILABLE_STEP_ARRAY=("demplx_fastq" "trimming" "mapping" "mapping_qc" "aggr_signal" "call_peak" "recall_peak" "get_mtx" "qc_per_barcode" "call_cell" "get_bam4Cells" "clustering" "motif_analysis" "call_cnv" "runDA" "runGO" "runCicero" "split_bam" "footprint" "report" "process" "process_no_dex" "process_with_bam" "integrate" "downstream" "all" "integrate_seu" "integrate_mtx" "convert10xbam" "mergePeaks" "reConstMtx" "visualize" "addCB2bam" "rmDoublets" "labelTransfer") check_s=0 diff --git a/scripts/Makefile b/scripts/Makefile index 2310003..c13b50d 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -363,6 +363,16 @@ rmDoublets: config_check @$(SCRIPTS)/rmDoublets.sh $(INPUT_FILE) $(CONFIG_FILE) $(CONFIG_SYS) @echo +####################################### +## labelTransfer +####################################### +labelTransfer: config_check + @echo "--------------------------------------------" + @date + @echo "label transfer from scRNA-seq..." + @$(SCRIPTS)/labelTransfer.sh $(INPUT_FILE) $(CONFIG_FILE) $(CONFIG_SYS) + @echo + ####################################### ## all ####################################### diff --git a/scripts/labelTransfer.sh b/scripts/labelTransfer.sh new file mode 100755 index 0000000..73075dd --- /dev/null +++ b/scripts/labelTransfer.sh @@ -0,0 +1,23 @@ +#!/bin/bash + + +inputs=$1 ## seurat obj,and expeted doublet rate, + +# reading configure file +curr_dir=`dirname $0` +source ${curr_dir}/read_conf.sh +read_conf "$2" +read_conf "$3" + + +inputs=(${inputs//,/ }) +seuratObj_atac_file=${inputs[0]} +seuratObj_rna_file=${inputs[1]} +gtf_file=${inputs[2]} + +curr_dir=`dirname $0` + +${R_PATH}/Rscript --vanilla ${curr_dir}/src/labelTransfer.R $seuratObj_atac_file $seuratObj_rna_file $GENOME_NAME $gtf_file + +echo "Transfer Cell_Type label from scRNA-seq done!" + diff --git a/scripts/src/dsAnalysis_utilities.R b/scripts/src/dsAnalysis_utilities.R index 792bf2e..0d097f5 100644 --- a/scripts/src/dsAnalysis_utilities.R +++ b/scripts/src/dsAnalysis_utilities.R @@ -566,7 +566,7 @@ do_DA <- function(mtx_score, clusters, test = 'wilcox', } return(res) } - +do_DA = cmpfun(do_DA) #fg_genes: vector of forground genes #bg_genes: background genes @@ -827,6 +827,7 @@ normalize_gene_activities.corrected <- function (activity_matrices, cell_num_gen } return(ret) } +normalize_gene_activities.corrected = cmpfun(normalize_gene_activities.corrected) # do cicero given a Seurat object, output gene activity score doCicero_gascore <- function(seurat.obj, reduction = 'tsne', chr_sizes, @@ -899,7 +900,7 @@ doCicero_gascore <- function(seurat.obj, reduction = 'tsne', chr_sizes, res = list('conns' = conns, 'ga_score' = cicero_gene_activities) return(res) } - +doCicero_gascore = cmpfun(doCicero_gascore) # do cicero given a Seurat object, just return the connection doCicero_conn <- function(seurat.obj, reduction = 'tsne', @@ -948,7 +949,7 @@ doCicero_conn <- function(seurat.obj, reduction = 'tsne', conns = conns[coaccess > coaccess_thr, ] return(conns) } - +doCicero_conn = cmpfun(doCicero_conn) ## change rowname of zscores (tf name) from chromvar to be readable readable_tf <- function(sele.zscores, GENOME_NAME){ @@ -1095,6 +1096,7 @@ runDiffMotifEnrich <- function(mtx_score, clusters, test = 'wilcox', } return(res) } +runDiffMotifEnrich = cmpfun(runDiffMotifEnrich) ## mtx_objs: a list of matrix run_integration <- function(mtx_list, integrate_by = 'VFACS', @@ -1208,7 +1210,7 @@ run_integration <- function(mtx_list, integrate_by = 'VFACS', return(seurat.obj) } - +run_integration = cmpfun(run_integration) # Find doublets FindDoublets_Atac <- function(seurat.atac, PCs = 1:50, @@ -1258,4 +1260,75 @@ FindDoublets_Atac <- function(seurat.atac, PCs = 1:50, seurat.atac$Doublet_Singlet = seurat.rna$Doublet_Singlet return(seurat.atac) } +FindDoublets_Atac = cmpfun(FindDoublets_Atac) + +# generate gene activity matrix for labelTransfer +generate_gene_cisActivity <- function(gene_gtf, mtx, include_body = T, + dist_to_tss = 2000){ + ## generating gene cis activity score + ## input: gene gtf file, and atac matrix file + + ## 1. select gene up/down-stream regions (promoter with/without gene_body) ## + + gene_ann = fread(gene_gtf, sep = '\t') + gene_ann = gene_ann[V3 == 'gene'] + gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9] + gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name] + gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name] + names(gene_ann)[1] = 'chr' + gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name)) + chrs = 1:22 + chrs = c(chrs, 'X', 'Y') + gene_ann = gene_ann[chr %in% chrs] + gene_ann = gene_ann[!duplicated(gene_name)] + + + names(gene_ann)[2:4] = c('ss', 'ee', 'strand') + if(!include_body){ + gene_ann[, 'start' := ifelse(strand == '+', ss - dist_to_tss, ee - dist_to_tss)] + gene_ann[, 'end' := ifelse(strand == '+', ss + dist_to_tss, ee + dist_to_tss)] + }else{ + gene_ann[, 'start' := ifelse(strand == '+', ss - dist_to_tss, ss)] + gene_ann[, 'end' := ifelse(strand == '+', ee, ee + dist_to_tss)] + + } + + gene_ann[, 'chr' := paste0('chr', chr)] + + gene_ann = subset(gene_ann, select = c(chr, start, end, gene_name)) + + + ## 2. read mtx file ## + + rnames = rownames(mtx) + chrs = sapply(rnames, function(x) unlist(strsplit(x, '-'))[1]) + starts = sapply(rnames, function(x) unlist(strsplit(x, '-'))[2]) + ends = sapply(rnames, function(x) unlist(strsplit(x, '-'))[3]) + + peaks = data.table('chr' = chrs, 'start' = as.integer(starts), + 'end' = as.integer(ends)) + setkey(peaks, chr, start, end) + peaks[, 'pname' := paste(chr, start, end, sep = '-')] + over.ids = foverlaps(gene_ann, peaks, by.x = c('chr', 'start', 'end'), + by.y = c('chr', 'start', 'end'), which = T) + over.ids[, 'gene_name' := gene_ann[xid, gene_name]] + over.ids[, 'pname' := peaks[yid, pname]] + over.ids = over.ids[complete.cases(over.ids)] + + + smtx = sparseMatrix(i = over.ids$xid, j = over.ids$yid, + dimnames = list(gene_ann$gene_name[1:max(over.ids$xid)], + peaks$pname[1:max(over.ids$yid)])) + + mtx = mtx[rownames(mtx) %in% colnames(smtx), ] + smtx = smtx[, rownames(mtx)] + + activity.matrix = smtx %*% mtx + rs = Matrix::rowSums(activity.matrix) + activity.matrix = activity.matrix[rs > 10, ] + + return(activity.matrix) + +} +generate_gene_cisActivity = cmpfun(generate_gene_cisActivity) \ No newline at end of file diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R new file mode 100644 index 0000000..375dccd --- /dev/null +++ b/scripts/src/labelTransfer.R @@ -0,0 +1,98 @@ +source_local <- function(fname){ + argv <- commandArgs(trailingOnly = FALSE) + base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) + source(paste(base_dir, fname, sep="/")) +} + +source_local('dsAnalysis_utilities.R') + +args = commandArgs(T) +inputSeurat_atac = args[1] ## a seurat.rds file, with pca conducted and metadata Cell_Type +inputSeurat_rna = args[2] ## a seurat.rds file, with pca conducted +GENOME_NAME = args[3] +gene_gtf_file = args[4] + +## download gtf file if not provided +if(!file.exists(gene_gtf_file)){ + print('gene annotation gtf file not provided, I will try to download one:') + err = 0 + if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) { + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!") + return(1)}) + } + + if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) { + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!") + return(1)}) + } + + if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) { + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!") + return(1)}) + } + if(err == 1) stop('Download failed! Please provide a gtf file to run this module!') +} + +seurat.rna = readRDS(inputSeurat_rna) +seurat.atac = readRDS(inputSeurat_atac) + +atac.mtx = seurat.atac@assays$ATAC@counts +rn = rownames(atac.mtx) +rownames(atac.mtx) <- sapply(rn, function(x) unlist(strsplit(x, ','))[1]) +activity.matrix = generate_gene_cisActivity(gene_gtf = gene_gtf_file, + atac.mtx, + include_body = T) + +seurat.atac[["ACTIVITY"]] <- CreateAssayObject(counts = activity.matrix) + +DefaultAssay(seurat.atac) <- "ACTIVITY" +seurat.atac <- NormalizeData(seurat.atac) +seurat.atac <- ScaleData(seurat.atac, features = rownames(seurat.atac)) +seurat.atac <- FindVariableFeatures(seurat.atac) + +DefaultAssay(seurat.atac) <- "ATAC" +seurat.atac$tech = 'ATAC' +seurat.rna$tech = 'RNA' + +## transfer label +genes4anchors = VariableFeatures(object = seurat.rna) +#genes4anchors = NULL +transfer.anchors <- FindTransferAnchors(reference = seurat.rna, + query = seurat.atac, + features = genes4anchors, + reference.assay = "RNA", + query.assay = "ACTIVITY", + reduction = "cca", + k.anchor = 5) + + +celltype.predictions <- TransferData(anchorset = transfer.anchors, + refdata = seurat.rna$Cell_Type, + weight.reduction = seurat.atac[["pca"]], + dims = 1:ncol(seurat.atac[["pca"]]), + k.weight = 50) +celltype.predictions = subset(celltype.predictions, select = c('predicted.id', 'prediction.score.max')) +names(celltype.predictions)[1] = 'Predicted_Cell_Type' +seurat.atac <- AddMetaData(seurat.atac, metadata = celltype.predictions) +rm(transfer.anchors) + +p1 <- DimPlot(seurat.atac, group.by = "Predicted_Cell_Type", + label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") + + +seurat.atac[["ACTIVITY"]] <- NULL ## don't save activity assay + +outputPath = dirname(inputSeurat_atac) +outputName = basename(inputSeurat_atac) +saveRDS(seurat.atac, file = paste0(outputPath, '/updated_', outputName)) + +ggsave(p1, filename = paste0(outputPath, '/umap_with_predicted_cell_type.eps'), + device = 'eps', height = 6, width = 7) + + From 89ebe82727201317417be8c6b0c29de1398fd6c5 Mon Sep 17 00:00:00 2001 From: wbaopaul Date: Fri, 25 Jun 2021 15:06:22 -0400 Subject: [PATCH 6/8] update readme for v1.4.0 --- README.md | 24 ++++++++++++++---------- complete_update_history.md | 4 ++-- scATAC-pro | 15 ++++++++------- scripts/src/dsAnalysis_utilities.R | 2 +- scripts/src/labelTransfer.R | 10 +++++++--- 5 files changed, 32 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 21de7d2..3c653dc 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,7 @@ Updates - Now provide [scATAC-pro tutorial in R](https://scatacpro-in-r.netlify.app/index.html) for access QC metrics and perform downstream analysis - Current version: 1.4.0 - Recent updates + * *labelTransfer*: new module added, to do label trasfer (for cell annotation) from scRNA-seq annotation. First construct a gene by cell activity matrix, then use *FindTransferAnchors* and *TransferData* function from Seurat R package to predicted cell type annotation from the cell annotaiton in scRNA-seq data. * *rmDoublets*: new module added, to remove potential doublets using [DoubletFinder](https://github.com/chris-mcginnis-ucsf/DoubletFinder) algorithm. * *qc_per_barcode*: add tss enrichment score per cell into the QC metrics * *call_cell*: enable filtering barcodes with minimal tss enrichment score cutoff (parameter **min_tss_escore** in the updated [configure_user.txt](configure_user.txt) file) @@ -146,7 +147,7 @@ Step by step guide to running scATAC-pro - **IMPORTANT**: you can run scATAC-pro sequentially. The input of a later analysis module is the output of the previous analysis modules. The following tutorial uses fastq files downloaded from [PBMC10k 10X Genomics](https://support.10xgenomics.com/single-cell-atac/datasets/1.1.0/atac_v1_pbmc_10k?) -- *Combine data from different sequencing lanes* +- *Combine data from different sequencing lanes* (not needed for 10x genomics data since v1.1.4) $ cat pbmc_fastqs/atac_pbmc_10k_v1_S1_L001_R1_001.fastq.gz pbmc_fastqs/atac_pbmc_10k_v1_S1_L002_R1_001.fastq.gz > pe1_fastq.gz @@ -158,11 +159,11 @@ Step by step guide to running scATAC-pro ``` $ scATAC-pro -s demplx_fastq - -i pe1_fastq.gz,pe2_fastq.gz,index_fastq.gz + -i pe1_fastq.gz,pe2_fastq.gz,index_fastq.gz(,other_index_fastq.gz, ...) -c configure_user.txt - # or + # or for 10x data $ scATAC-pro -s demplx_fastq - -i pbmc_fastqs/ + -i pbmc_10x_fastqs/ -c configure_user.txt $ scATAC-pro -s trimming @@ -219,8 +220,8 @@ Step by step guide to running scATAC-pro -i output/downstream_analysis/PEAK_CALLER/CELL_CALLER/cell_cluster_table.tsv -c configure_user.txt - $ scATAC-pro -s footprint ## supporting comparison two clusters, and one-vs-rest - -i 0,1 ## or '0,rest' (means cluster1 vs rest) or 'one,rest' (all one-vs-rest) + $ scATAC-pro -s footprint ## supporting comparison two groups of cell clusters, and one-vs-rest + -i 0,1 ## or '0:3,1:2' (group1 consist of cluster0,3, and group2 for cluster1,2)) or 'one,rest' (all one-vs-rest comparison) -c configure_user.txt $ scATAC-pro -s runCicero @@ -265,8 +266,10 @@ Step by step guide to running scATAC-pro -i reconstructed_mtx_file1,reconstructed_mtx_file2,(reconstructed_mtx_file3...) -c configure_user.txt + ## label transfer (cell annotation) from scRNA-seq ## cell annotated with metadata 'Cell_Type' in seurat obj of scRNA-seq data + ## the gtf_file is optional $ scATAC-pro -s labelTransfer -i seurat_obj_atac.rds,seurat_obj_rna.rds(,gtf_file) -c configure_user.txt @@ -437,10 +440,11 @@ Detailed Usage the input bam file) labelTransfer: label transfer (cell annotation) from scRNA-seq data - input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data, and an optional gtf file for gene annotation, separated by a comma. - output: a updated seurat object added with the Predicted_Cell_Type as a metadat, - saved in the same directory as the input atac seurat object. - Note: the cell annotation should be given as a metadata of the seurat object of + input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format, and an optional .gtf file for gene annotation, separated by a comma. + output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and + an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac + seurat object. + *Note*: the cell annotation should be given as a metadata of the seurat object of scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction done. diff --git a/complete_update_history.md b/complete_update_history.md index 2ed1945..2119089 100644 --- a/complete_update_history.md +++ b/complete_update_history.md @@ -1,6 +1,6 @@ ## Complete Update History -- Version 1.4.0 - * new module planned: labelTransfer from scRNA-seq +- Version 1.4.0 released + * new module added: labelTransfer (for cell annotation) from scRNA-seq - Version 1.3.1 released * *rmDoublets*: a new module added, to remove doublets * *clustering*: accepts seurat obj (in .rds format) as input as well diff --git a/scATAC-pro b/scATAC-pro index 31e777a..0175a62 100755 --- a/scATAC-pro +++ b/scATAC-pro @@ -149,13 +149,14 @@ function help { output: the bam file with column 'CB:Z:cellbarcode' added (saved in the same directory as the input bam file)" - echo " labelTransfer: label transfer (cell annotation) from scRNA-seq data - input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data, and an optional gtf file for gene annotation, separated by a comma. - output: a updated seurat object added with the Predicted_Cell_Type as a metadat, - saved in the same directory as the input atac seurat object. - Note: the cell annotation should be given as a metadata of the seurat object of - scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction - done." + echo " label transfer (cell annotation) from scRNA-seq data + input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format, and an optional .gtf file for gene annotation, separated by a comma. + output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and + an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac + seurat object. + *Note*: the cell annotation should be given as a metadata of the seurat object of + scRNA-seq. Both seurat objects should have pca and umap dimemsion reduction + done." echo " -i|--input INPUT : input data, different types of input data are required for different analysis;" echo " -c|--conf CONFIG : configuration file for parameters (if exist) for each analysis module" diff --git a/scripts/src/dsAnalysis_utilities.R b/scripts/src/dsAnalysis_utilities.R index 0d097f5..f3fa66b 100644 --- a/scripts/src/dsAnalysis_utilities.R +++ b/scripts/src/dsAnalysis_utilities.R @@ -1331,4 +1331,4 @@ generate_gene_cisActivity <- function(gene_gtf, mtx, include_body = T, } -generate_gene_cisActivity = cmpfun(generate_gene_cisActivity) \ No newline at end of file +generate_gene_cisActivity = cmpfun(generate_gene_cisActivity) diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R index 375dccd..c1f0050 100644 --- a/scripts/src/labelTransfer.R +++ b/scripts/src/labelTransfer.R @@ -83,8 +83,10 @@ seurat.atac <- AddMetaData(seurat.atac, metadata = celltype.predictions) rm(transfer.anchors) p1 <- DimPlot(seurat.atac, group.by = "Predicted_Cell_Type", - label = TRUE, repel = TRUE) + ggtitle("scATAC-seq cells") + label = TRUE, repel = TRUE) + ggtitle("scATAC-seq") +p2 <- DimPlot(seurat.rna, group.by = "Cell_Type", + label = TRUE, repel = TRUE) + ggtitle("scRNA-seq") seurat.atac[["ACTIVITY"]] <- NULL ## don't save activity assay @@ -92,7 +94,9 @@ outputPath = dirname(inputSeurat_atac) outputName = basename(inputSeurat_atac) saveRDS(seurat.atac, file = paste0(outputPath, '/updated_', outputName)) -ggsave(p1, filename = paste0(outputPath, '/umap_with_predicted_cell_type.eps'), - device = 'eps', height = 6, width = 7) +pcomb = gridExtra::grid.arrange(p1, p2, nrow = 1) + +ggsave(pcomb, filename = paste0(outputPath, '/umap_with_predicted_cell_type.eps'), + device = 'eps', height = 6, width = 13) From 217d296ab03b596ceaf08e405c8f1ac9a26a2261 Mon Sep 17 00:00:00 2001 From: wbaopaul Date: Tue, 29 Jun 2021 11:31:57 -0400 Subject: [PATCH 7/8] test v1.4.0 --- README.md | 3 +- scripts/src/dsAnalysis_utilities.R | 28 ++---- scripts/src/labelTransfer.R | 116 ++++++++++++++++++---- scripts/src/scATAC-pro_report.Rmd | 152 ++++++++++++++--------------- 4 files changed, 181 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index 3c653dc..ef28ddf 100644 --- a/README.md +++ b/README.md @@ -440,7 +440,8 @@ Detailed Usage the input bam file) labelTransfer: label transfer (cell annotation) from scRNA-seq data - input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format, and an optional .gtf file for gene annotation, separated by a comma. + input: paths for a seurat object for scATAC-seq, a seurat object for scRNA-seq data in .rds format, + and an optional .gtf file for gene annotation, separated by a comma. output: a updated seurat object for atac with the Predicted_Cell_Type as a metadata variable and an umap plot colored by Predicted_Cell_Type, saved in the same directory as the input atac seurat object. diff --git a/scripts/src/dsAnalysis_utilities.R b/scripts/src/dsAnalysis_utilities.R index f3fa66b..ea68aaf 100644 --- a/scripts/src/dsAnalysis_utilities.R +++ b/scripts/src/dsAnalysis_utilities.R @@ -1263,38 +1263,24 @@ FindDoublets_Atac <- function(seurat.atac, PCs = 1:50, FindDoublets_Atac = cmpfun(FindDoublets_Atac) # generate gene activity matrix for labelTransfer -generate_gene_cisActivity <- function(gene_gtf, mtx, include_body = T, +generate_gene_cisActivity <- function(gene_ann, mtx, include_body = T, dist_to_tss = 2000){ ## generating gene cis activity score - ## input: gene gtf file, and atac matrix file + ## input: gene_ann (data table with chr, gene_start, gene_end, strand, gene_name), + ## and atac matrix file ## 1. select gene up/down-stream regions (promoter with/without gene_body) ## - gene_ann = fread(gene_gtf, sep = '\t') - gene_ann = gene_ann[V3 == 'gene'] - gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9] - gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name] - gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name] - names(gene_ann)[1] = 'chr' - gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name)) - chrs = 1:22 - chrs = c(chrs, 'X', 'Y') - gene_ann = gene_ann[chr %in% chrs] - gene_ann = gene_ann[!duplicated(gene_name)] - - names(gene_ann)[2:4] = c('ss', 'ee', 'strand') if(!include_body){ - gene_ann[, 'start' := ifelse(strand == '+', ss - dist_to_tss, ee - dist_to_tss)] - gene_ann[, 'end' := ifelse(strand == '+', ss + dist_to_tss, ee + dist_to_tss)] + gene_ann[, 'start' := ifelse(strand == '+', gene_start - dist_to_tss, gene_end - dist_to_tss)] + gene_ann[, 'end' := ifelse(strand == '+', gene_start + dist_to_tss, gene_end + dist_to_tss)] }else{ - gene_ann[, 'start' := ifelse(strand == '+', ss - dist_to_tss, ss)] - gene_ann[, 'end' := ifelse(strand == '+', ee, ee + dist_to_tss)] + gene_ann[, 'start' := ifelse(strand == '+', gene_start - dist_to_tss, gene_start)] + gene_ann[, 'end' := ifelse(strand == '+', gene_end, gene_end + dist_to_tss)] } - gene_ann[, 'chr' := paste0('chr', chr)] - gene_ann = subset(gene_ann, select = c(chr, start, end, gene_name)) diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R index c1f0050..7ce81bf 100644 --- a/scripts/src/labelTransfer.R +++ b/scripts/src/labelTransfer.R @@ -12,31 +12,113 @@ inputSeurat_rna = args[2] ## a seurat.rds file, with pca conducted GENOME_NAME = args[3] gene_gtf_file = args[4] -## download gtf file if not provided +## if gtf file not provided, using R bioconductor packages for gene annotation if(!file.exists(gene_gtf_file)){ - print('gene annotation gtf file not provided, I will try to download one:') - err = 0 + if(!grepl(GENOME_NAME, pattern = 'hg19|hg38|mm10|mm9', ignore.case = T)){ + stop('Genome is not belong to hg19,hg38,mm9 and mm10, + please provide .gtf file for gene annotation!') + } + if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) { + library(EnsDb.Mmusculus.v79) + ens.ann <- EnsDb.Mmusculus.v79 + } if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!") - return(1)}) + library(EnsDb.Mmusculus.v75) + ens.ann <- EnsDb.Mmusculus.v75 } if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!") - return(1)}) + library(EnsDb.Hsapiens.v86) + ens.ann <- EnsDb.Hsapiens.v86 } if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!") - return(1)}) + library(EnsDb.Hsapiens.v75) + ens.ann <- EnsDb.Hsapiens.v75 } - if(err == 1) stop('Download failed! Please provide a gtf file to run this module!') + + gene_ann <- ensembldb::genes(ens.ann) + gene_ann <- keepStandardChromosomes(gene_ann, pruning.mode = 'coarse') + gene_ann = data.frame(gene_ann) + gene_ann = subset(gene_ann, gene_biotype %in% c('protein_coding', 'miRNA', 'lincRNA'), + select = c('seqnames', 'start', 'end', + 'strand', 'gene_biotype', 'gene_name')) + + names(gene_ann)[1:3] = c('chr', 'gene_start', 'gene_end') + gene_ann = data.table(gene_ann) + gene_ann = gene_ann[!duplicated(gene_name)] + gene_ann$chr = paste0('chr', gene_ann$chr) + gene_ann = subset(gene_ann, select = c('chr', 'gene_start', 'gene_end', + 'strand', 'gene_name')) + + gene_ann[chr == 'chrMT']$chr = 'chrM' + +}else{ + gene_ann = fread(gene_gtf_file, sep = '\t') + gene_ann = gene_ann[V3 == 'gene'] + gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9] + gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name] + gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name] + names(gene_ann)[1] = 'chr' + gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name)) + chrs = 1:22 + chrs = c(chrs, 'X', 'Y', 'M') + gene_ann = gene_ann[chr %in% chrs] + gene_ann = gene_ann[!duplicated(gene_name)] + names(gene_ann)[2:4] = c('gene_start', 'gene_end', 'strand') + gene_ann[, 'chr' := paste0('chr', chr)] + +} + + +if(F){ + ## download gtf file if not provided + if(!file.exists(gene_gtf_file)){ + print('gene annotation gtf file not provided, I will try to download one:') + err = 0 + if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) { + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!") + return(1)}) + } + if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) { + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz!") + return(1)}) + } + if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) { + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!") + return(1)}) + } + + if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) { + err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp), + error = function(e) { + print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!") + return(1)}) + } + + if(err == 1) stop('Download failed! Please provide a gtf file to run this module!') + } + + gene_ann = fread(gene_gtf_file, sep = '\t') + gene_ann = gene_ann[V3 == 'gene'] + gene_ann[, 'gene_name' := unlist(strsplit(V9, ';'))[3], by = V9] + gene_ann[, 'gene_name' := gsub("\"", "", gene_name), by = gene_name] + gene_ann[, 'gene_name' := unlist(strsplit(gene_name, ' '))[3], by = gene_name] + names(gene_ann)[1] = 'chr' + gene_ann = subset(gene_ann, select = c(chr, V4, V5, V7, gene_name)) + chrs = 1:22 + chrs = c(chrs, 'X', 'Y') + gene_ann = gene_ann[chr %in% chrs] + gene_ann = gene_ann[!duplicated(gene_name)] + names(gene_ann)[2:4] = c('gene_start', 'gene_end', 'strand') + gene_ann[, 'chr' := paste0('chr', chr)] + } seurat.rna = readRDS(inputSeurat_rna) @@ -45,7 +127,7 @@ seurat.atac = readRDS(inputSeurat_atac) atac.mtx = seurat.atac@assays$ATAC@counts rn = rownames(atac.mtx) rownames(atac.mtx) <- sapply(rn, function(x) unlist(strsplit(x, ','))[1]) -activity.matrix = generate_gene_cisActivity(gene_gtf = gene_gtf_file, +activity.matrix = generate_gene_cisActivity(gene_ann = gene_ann, atac.mtx, include_body = T) diff --git a/scripts/src/scATAC-pro_report.Rmd b/scripts/src/scATAC-pro_report.Rmd index fbc2f4c..13cec8a 100644 --- a/scripts/src/scATAC-pro_report.Rmd +++ b/scripts/src/scATAC-pro_report.Rmd @@ -710,94 +710,88 @@ if(file.exists(footprint_stats.file)){ ### Predicted Interactions at a locus of interest ```{r, out.width='\\textwidth'} cicero_conn.file = paste0(down.dir, '/cicero_interactions.txt') -err = 0 if(Cicero_Plot_Region != 'none' & file.exists(cicero_conn.file)){ conns = fread(cicero_conn.file) temp <- tempfile() -if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz!") - return(1)}) -} - -if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz!") - return(1)}) -} - -if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz!") - return(1)}) -} - -if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) { - err <- tryCatch(download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp), - error = function(e) { - print("Cannot download ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz!") - return(1)}) -} + if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) { + library(EnsDb.Mmusculus.v79) + ens.ann <- EnsDb.Mmusculus.v79 + } + if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) { + library(EnsDb.Mmusculus.v75) + ens.ann <- EnsDb.Mmusculus.v75 + } + + if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) { + library(EnsDb.Hsapiens.v86) + ens.ann <- EnsDb.Hsapiens.v86 + } + + if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) { + library(EnsDb.Hsapiens.v75) + ens.ann <- EnsDb.Hsapiens.v75 + } + + gene_anno <- transcripts(ens.ann, + columns = c("gene_id", "gene_name", + "tx_biotype", "tx_id")) -if(err == 0){ - gene_anno <- rtracklayer::readGFF(temp) - unlink(temp) - - # rename some columns to match requirements - gene_anno$chromosome <- paste0("chr", gene_anno$seqid) - gene_anno$gene <- gene_anno$gene_id - gene_anno$transcript <- gene_anno$transcript_id - gene_anno$symbol <- gene_anno$gene_name - gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, - transcript, gene, symbol)) - gene_anno = gene_anno[complete.cases(gene_anno), ] - if(Cicero_Plot_Region %in% gene_anno$symbol){ - tmp_anno = gene_anno[gene_anno$symbol == Cicero_Plot_Region, ] - chr0 = tmp_anno$chr[1] - start0 = min(tmp_anno$start) - end0 = max(tmp_anno$end) - }else{ - chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000 - region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2] - start0 = as.integer(unlist(strsplit(region0, '-'))[1]) - end0 = as.integer(unlist(strsplit(region0, '-'))[2]) - } - - if(end0 < start0 + 100000){ - start0 = floor((start0+end0)/2) - 50000 - end0 = start0 + 100000 - } - conns[, 'chr1' := unlist(strsplit(Peak1, '_'))[1], by = Peak1] - conns[, 'start1' := as.integer(unlist(strsplit(Peak1, '_'))[2]), by = Peak1] - conns[, 'end2' := as.integer(unlist(strsplit(Peak2, '_'))[3]), by = Peak2] - conns = conns[start1 <= end2] - conns0 = conns[chr1==chr0 & start1 >= start0 & end2 <= end0] - - conns[, c('chr1', 'start1', 'end2') := NULL] - conns = data.frame(conns) - if(nrow(conns0)==0) cat('No interaction found in the griven region!') - - if(plotEPS & nrow(conns0) >0 ){ - pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans') - plot_connections(conns, chr0, start0, end0, - gene_model = gene_anno, - coaccess_cutoff = .3, - connection_width = 1, - collapseTranscripts = "longest", - viewpoint_alpha = 0) - dev.off() - - } - if(nrow(conns0) > 0) plot_connections(conns, chr0, start0, end0, + gene_anno = keepStandardChromosomes(gene_anno,pruning.mode = 'coarse') + + gene_anno = data.table(data.frame(gene_anno)) + gene_anno = subset(gene_anno, tx_biotype %in% c('protein_coding', 'miRNA', 'lincRNA'), select = c('seqnames', 'start', 'end', 'tx_id', + 'strand', 'gene_id', 'gene_name')) + #rename some columns to match requirements + names(gene_anno)[c(1, 4, 6:7)] = c('chromosome', 'transcript', 'gene', 'symbol') + gene_anno$chromosome = paste0('chr', gene_anno$chromosome) + + gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand, + transcript, gene, symbol)) + gene_anno = gene_anno[complete.cases(gene_anno), ] + if(Cicero_Plot_Region %in% gene_anno$symbol){ + tmp_anno = gene_anno[gene_anno$symbol == Cicero_Plot_Region, ] + chr0 = tmp_anno$chr[1] + start0 = min(tmp_anno$start) + end0 = max(tmp_anno$end) + }else{ + chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000 + region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2] + start0 = as.integer(unlist(strsplit(region0, '-'))[1]) + end0 = as.integer(unlist(strsplit(region0, '-'))[2]) + } + + if(end0 < start0 + 100000){ + start0 = floor((start0+end0)/2) - 50000 + end0 = start0 + 100000 + } + conns[, 'chr1' := unlist(strsplit(Peak1, '_'))[1], by = Peak1] + conns[, 'start1' := as.integer(unlist(strsplit(Peak1, '_'))[2]), by = Peak1] + conns[, 'end2' := as.integer(unlist(strsplit(Peak2, '_'))[3]), by = Peak2] + conns = conns[start1 <= end2] + conns0 = conns[chr1==chr0 & start1 >= start0 & end2 <= end0] + + conns[, c('chr1', 'start1', 'end2') := NULL] + conns = data.frame(conns) + if(nrow(conns0)==0) cat('No interaction found in the griven region!') + + if(plotEPS & nrow(conns0) >0 ){ + pdf(paste0(params$output_dir,'/summary/Figures/interactions_example_region.pdf'), width = 8, height = 5, fonts = 'sans') + plot_connections(conns, chr0, start0, end0, gene_model = gene_anno, coaccess_cutoff = .3, connection_width = 1, collapseTranscripts = "longest", viewpoint_alpha = 0) - } + dev.off() + + } + if(nrow(conns0) > 0) plot_connections(conns, chr0, start0, end0, + gene_model = gene_anno, + coaccess_cutoff = .3, + connection_width = 1, + collapseTranscripts = "longest", + viewpoint_alpha = 0) } + ``` From 33fcc1e62609406f9ba51ae1645ae943b01661e6 Mon Sep 17 00:00:00 2001 From: wbaopaul Date: Tue, 29 Jun 2021 11:52:00 -0400 Subject: [PATCH 8/8] tested v1.4.0 --- README.md | 2 +- scripts/install/install_Rpackages.R | 2 +- scripts/src/labelTransfer.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ef28ddf..7654fc1 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ Dependencies - trim\_galore (>=0.6.3), Trimmomatic (>=0.6.3) - Regulratory Genomics Toolbox (RGT, for footprinting analysis) - g++ compiler, bzip2, ncurses-devel -- R packaages: devtools, flexdashboard, png, data.table, Matirx, Rcpp, ggplot2, flexmix, optparse, magrittr, readr, Seurat, bedr, gridExtra, ggrepel, kableExtra, viridis, xlsx, RColorBrewer,pheatmap,motifmatchr, chromVAR, chromVARmotifs, SummarizedExperiment, BiocParallel, DESeq2, clusterProfiler, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Mmusculus.UCSC.mm10, VisCello.atac +- R packaages: devtools, flexdashboard, png, data.table, Matirx, Rcpp, ggplot2, flexmix, optparse, magrittr, readr, Seurat, bedr, gridExtra, ggrepel, kableExtra, viridis, xlsx, RColorBrewer,pheatmap,motifmatchr, chromVAR, chromVARmotifs, SummarizedExperiment, BiocParallel, DESeq2, clusterProfiler, BSgenome.Hsapiens.UCSC.hg38, BSgenome.Mmusculus.UCSC.mm10, EnsDb.Hsapiens.v86, EnsDb.Mmusculus.v79, VisCello.atac Quick start guide ----------- diff --git a/scripts/install/install_Rpackages.R b/scripts/install/install_Rpackages.R index 9b91bff..7955ec4 100644 --- a/scripts/install/install_Rpackages.R +++ b/scripts/install/install_Rpackages.R @@ -18,7 +18,7 @@ for(pk in pks){ } bioc.pks = c('RColorBrewer','pheatmap','motifmatchr', 'chromVAR', 'SummarizedExperiment', 'BiocParallel', 'DESeq2', 'edgeR', 'matrixStats', 'cicero', 'farver', 'BSgenome.Hsapiens.UCSC.hg38', 'BSgenome.Mmusculus.UCSC.mm10', 'clusterProfiler', - 'DropletUtils') + 'DropletUtils', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79') for(pk in bioc.pks){ if(!require(pk, character.only = T)) { diff --git a/scripts/src/labelTransfer.R b/scripts/src/labelTransfer.R index 7ce81bf..69ec8fd 100644 --- a/scripts/src/labelTransfer.R +++ b/scripts/src/labelTransfer.R @@ -15,7 +15,7 @@ gene_gtf_file = args[4] ## if gtf file not provided, using R bioconductor packages for gene annotation if(!file.exists(gene_gtf_file)){ if(!grepl(GENOME_NAME, pattern = 'hg19|hg38|mm10|mm9', ignore.case = T)){ - stop('Genome is not belong to hg19,hg38,mm9 and mm10, + stop('Genome is not belong to any of hg19,hg38,mm9 or mm10, please provide .gtf file for gene annotation!') } if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {