From 7f9eca1155099448640933c8bc001be9a38cca06 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 10 Apr 2024 15:55:39 -0400 Subject: [PATCH 1/8] Removed check for DESeq2 single-sample group - does not work properly --- workflow/rules/diff.smk | 154 ++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 86 deletions(-) diff --git a/workflow/rules/diff.smk b/workflow/rules/diff.smk index 9d5193b..1e5ab24 100644 --- a/workflow/rules/diff.smk +++ b/workflow/rules/diff.smk @@ -194,96 +194,78 @@ rule DESeq: mkdir -p ${{TMPDIR_AUC}}/knit_root_dir cd $TMPDIR_AUC - # if there is only one sample per group, DESEQ2 is not the tool to use - # check the sampleinfo file to determine how many samples are in each group. If any samplegroupp has a count of 1 - # create empty files with message that the file cannot be created due to conditions provided - search_list=`awk '{{print $2}}' {input.si} | grep -v "group" | sort | uniq -c | sort -nr | grep -o 1` - check="1" - _contains () {{ # Check if space-separated list $1 contains line $2 - echo "$1" | tr ' ' '\\n' | grep -F -x -q "$2" - }} - if _contains "${{check}}" "${{search_list}}"; then - echo "There is only one sample per group, which is not allowed in DESEQ2" - touch {output.results_auc} - touch {output.html_auc} - touch {output.elbowlimits_auc} - touch {output.results_frag} - touch {output.html_frag} - touch {output.elbowlimits_frag} - touch {output.pdf} - else - Rscript {params.rscript_diff} \\ - --rmd {params.rmd} \\ - --carlisle_functions {params.carlisle_functions} \\ - --Rlib_dir {params.Rlib_dir} \\ - --Rpkg_config {params.Rpkg_config} \\ - --countsmatrix {input.cm_auc} \\ - --sampleinfo {input.si} \\ - --dupstatus {params.dupstatus} \\ - --condition1 ${{condition1}} \\ - --condition2 ${{condition2}} \\ - --fdr_cutoff {params.fdr_cutoff} \\ - --log2fc_cutoff {params.log2fc_cutoff} \\ - --results {output.results_auc} \\ - --report {output.html_auc} \\ - --elbowlimits {output.elbowlimits_auc} \\ - --spiked {params.spiked} \\ - --rawcountsprescaled \\ - --scalesfbymean \\ - --contrast_data {input.contrast_data} \\ - --tmpdir $TMPDIR_AUC \\ - --species {params.species} \\ - --gtf {params.gtf} + Rscript {params.rscript_diff} \\ + --rmd {params.rmd} \\ + --carlisle_functions {params.carlisle_functions} \\ + --Rlib_dir {params.Rlib_dir} \\ + --Rpkg_config {params.Rpkg_config} \\ + --countsmatrix {input.cm_auc} \\ + --sampleinfo {input.si} \\ + --dupstatus {params.dupstatus} \\ + --condition1 ${{condition1}} \\ + --condition2 ${{condition2}} \\ + --fdr_cutoff {params.fdr_cutoff} \\ + --log2fc_cutoff {params.log2fc_cutoff} \\ + --results {output.results_auc} \\ + --report {output.html_auc} \\ + --elbowlimits {output.elbowlimits_auc} \\ + --spiked {params.spiked} \\ + --rawcountsprescaled \\ + --scalesfbymean \\ + --contrast_data {input.contrast_data} \\ + --tmpdir $TMPDIR_AUC \\ + --species {params.species} \\ + --gtf {params.gtf} - # change elbow limits to provided log2fc if limit is set to .na.real - sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc} - sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc} + # change elbow limits to provided log2fc if limit is set to .na.real + sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc} + sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc} - ## Run FRAG method - mkdir -p ${{TMPDIR_FRAG}} - mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir - mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir - cd $TMPDIR_FRAG + ## Run FRAG method + mkdir -p ${{TMPDIR_FRAG}} + mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir + mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir + cd $TMPDIR_FRAG - # Do not use --rawcountsprescaled as these counts are not prescaled! - Rscript {params.rscript_diff} \\ - --rmd {params.rmd} \\ - --carlisle_functions {params.carlisle_functions} \\ - --Rlib_dir {params.Rlib_dir} \\ - --Rpkg_config {params.Rpkg_config} \\ - --countsmatrix {input.cm_frag} \\ - --sampleinfo {input.si} \\ - --dupstatus {params.dupstatus} \\ - --condition1 ${{condition1}} \\ - --condition2 ${{condition2}} \\ - --fdr_cutoff {params.fdr_cutoff} \\ - --log2fc_cutoff {params.log2fc_cutoff} \\ - --results {output.results_frag} \\ - --report {output.html_frag} \\ - --elbowlimits {output.elbowlimits_frag} \\ - --spiked {params.spiked} \\ - --scalesfbymean \\ - --contrast_data {input.contrast_data} \\ - --tmpdir $TMPDIR_FRAG \\ - --species {params.species} \\ - --gtf {params.gtf} + # Do not use --rawcountsprescaled as these counts are not prescaled! + Rscript {params.rscript_diff} \\ + --rmd {params.rmd} \\ + --carlisle_functions {params.carlisle_functions} \\ + --Rlib_dir {params.Rlib_dir} \\ + --Rpkg_config {params.Rpkg_config} \\ + --countsmatrix {input.cm_frag} \\ + --sampleinfo {input.si} \\ + --dupstatus {params.dupstatus} \\ + --condition1 ${{condition1}} \\ + --condition2 ${{condition2}} \\ + --fdr_cutoff {params.fdr_cutoff} \\ + --log2fc_cutoff {params.log2fc_cutoff} \\ + --results {output.results_frag} \\ + --report {output.html_frag} \\ + --elbowlimits {output.elbowlimits_frag} \\ + --spiked {params.spiked} \\ + --scalesfbymean \\ + --contrast_data {input.contrast_data} \\ + --tmpdir $TMPDIR_FRAG \\ + --species {params.species} \\ + --gtf {params.gtf} - # change elbow limits to provided log2fc if limit is set to .na.real - sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag} - sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag} + # change elbow limits to provided log2fc if limit is set to .na.real + sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag} + sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag} - ## Run venns - mkdir -p ${{TMPDIR_VENN}} - mkdir -p ${{TMPDIR_VENN}}/intermediates_dir - mkdir -p ${{TMPDIR_VENN}}/knit_root_dir - cd $TMPDIR_VENN - Rscript {params.rscript_venn} \\ - --aucresults {output.results_auc} \\ - --fragmentsresults {output.results_frag} \\ - --pdf {output.pdf} \\ - --title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}" - fi - """ + ## Run venns + mkdir -p ${{TMPDIR_VENN}} + mkdir -p ${{TMPDIR_VENN}}/intermediates_dir + mkdir -p ${{TMPDIR_VENN}}/knit_root_dir + cd $TMPDIR_VENN + Rscript {params.rscript_venn} \\ + --aucresults {output.results_auc} \\ + --fragmentsresults {output.results_frag} \\ + --pdf {output.pdf} \\ + --title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}" + fi + """ rule diffbb: input: @@ -344,4 +326,4 @@ rule diffbb: bedToBigBed -type=bed9 {output.fbed} {input.genome_len} {output.fbigbed} fi - """ \ No newline at end of file + """ From eb7b4cda92836a3155f26c872a5fca67813febfb Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 10 Apr 2024 16:00:53 -0400 Subject: [PATCH 2/8] Fix bug in library normalization with no_dedup option --- workflow/scripts/_get_nreads_stats.py | 6 +++--- workflow/scripts/_make_alignment_stats_table.R | 4 ++-- workflow/scripts/_make_library_norm_table.R | 7 ++++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/workflow/scripts/_get_nreads_stats.py b/workflow/scripts/_get_nreads_stats.py index 5e54b53..938506e 100644 --- a/workflow/scripts/_get_nreads_stats.py +++ b/workflow/scripts/_get_nreads_stats.py @@ -26,8 +26,8 @@ stats["raw_nreads_genome"] = sum(list(raw_idxstats[raw_idxstats[0].isin(list(genomelen[0]))][2])) stats["raw_nreads_spikein"] = sum(list(raw_idxstats[raw_idxstats[0].isin(list(spikeinlen[0]))][2])) -stats["nodedup_nreads_genome"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(genomelen[0]))][2])) -stats["nodedup_nreads_spikein"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(spikeinlen[0]))][2])) +stats["no_dedup_nreads_genome"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(genomelen[0]))][2])) +stats["no_dedup_nreads_spikein"] = sum(list(nodedup_idxstats[nodedup_idxstats[0].isin(list(spikeinlen[0]))][2])) stats["dedup_nreads_genome"] = sum(list(dedup_idxstats[dedup_idxstats[0].isin(list(genomelen[0]))][2])) stats["dedup_nreads_spikein"] = sum(list(dedup_idxstats[dedup_idxstats[0].isin(list(spikeinlen[0]))][2])) @@ -41,4 +41,4 @@ stats["nreads"] = int(npairs*2) with open(sys.argv[7], 'w') as file: - dumped = yaml.dump(stats, file) \ No newline at end of file + dumped = yaml.dump(stats, file) diff --git a/workflow/scripts/_make_alignment_stats_table.R b/workflow/scripts/_make_alignment_stats_table.R index 7c24b99..b17f9c8 100644 --- a/workflow/scripts/_make_alignment_stats_table.R +++ b/workflow/scripts/_make_alignment_stats_table.R @@ -51,8 +51,8 @@ column_order = c("sample_name", "scaling_factor", "duplication_rate_genome", "duplication_rate_spikein", - "nodedup_nreads_genome", - "nodedup_nreads_spikein", + "no_dedup_nreads_genome", + "no_dedup_nreads_spikein", "dedup_nreads_genome", "dedup_nreads_spikein") diff --git a/workflow/scripts/_make_library_norm_table.R b/workflow/scripts/_make_library_norm_table.R index bb185de..7279e87 100644 --- a/workflow/scripts/_make_library_norm_table.R +++ b/workflow/scripts/_make_library_norm_table.R @@ -46,7 +46,7 @@ for ( f in files ){ # run once for dedup, once for nondedup final_df=data.frame() -for (dedup_type in c("dedup_nreads_genome","nodedup_nreads_genome")){ +for (dedup_type in c("dedup_nreads_genome","no_dedup_nreads_genome")){ # determine scaling factor dependent on library size col_median=median(df[,dedup_type]) if (col_median>1000000000){ @@ -70,10 +70,11 @@ for (dedup_type in c("dedup_nreads_genome","nodedup_nreads_genome")){ df$library_size=df[,dedup_type]/lib_factor df$sampleid=rownames(df) df$dedup_type=strsplit(dedup_type,"_")[[1]][1] - + df$dedup_type=gsub("no","no_dedup",df$dedup_type) + # create final df select_cols=c("sampleid","library_size","dedup_type") final_df=rbind(final_df,df[,select_cols]) } -write.table(final_df,out_table,row.names = FALSE) \ No newline at end of file +write.table(final_df,out_table,row.names = FALSE) From 897bef7b36e1ce54a9e86cd26a016ad26eda3b4c Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 10 Apr 2024 16:04:18 -0400 Subject: [PATCH 3/8] Fix bug in sample table replicate processing --- workflow/rules/init.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk index 6dbde8b..8aa464b 100644 --- a/workflow/rules/init.smk +++ b/workflow/rules/init.smk @@ -117,10 +117,10 @@ TREATMENT_WITHOUTCONTROL_LIST=[] TREAT_to_CONTRL_DICT=dict() for i,t in enumerate(list(df[df['isControl']=="N"]['replicateName'].unique())): crow=df[df['replicateName']==t].iloc[0] - c=crow.controlName+"_"+str(crow.controlReplicateNumber) + c=crow.controlName+"_"+str(int(crow.controlReplicateNumber)) if not c in REPLICATES: print("# Control NOT found for sampleName_replicateNumber:"+t) - print("# "+config["samplemanifest"]+" has no entry for sample:"+crow.controlName+" replicateNumber:"+crow.controlReplicateNumber) + print("# "+config["samplemanifest"]+" has no entry for sample:"+crow.controlName+" replicateNumber:"+str(crow.controlReplicateNumber)) exit() print("## "+str(i+1)+") "+t+" "+c) process_replicates.extend([t,c]) @@ -453,4 +453,4 @@ rule create_reference: mv $(dirname {output.bt2})/tmp/ref.yaml {output.refjson} fi - """ \ No newline at end of file + """ From a2a60eef26f890268acc325fb88b35f0aaec30a7 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 10 Apr 2024 16:06:06 -0400 Subject: [PATCH 4/8] Allow local library installation --- workflow/scripts/_diff_markdown.Rmd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/workflow/scripts/_diff_markdown.Rmd b/workflow/scripts/_diff_markdown.Rmd index ecdcfc5..7422f3b 100644 --- a/workflow/scripts/_diff_markdown.Rmd +++ b/workflow/scripts/_diff_markdown.Rmd @@ -39,7 +39,7 @@ source(params$carlisle_functions) Rlib_dir=params$Rlib_dir Rpkg_config=params$Rpkg_config print(paste0("Using the lib.loc location: ",Rlib_dir)) -assign(".lib.loc", Rlib_dir, envir = environment(.libPaths)) +#assign(".lib.loc", Rlib_dir, envir = environment(.libPaths)) .libPaths() # read in package info @@ -51,11 +51,11 @@ pkg_df CARLISLE_HANDLE_PACKAGES(pkg_df) ##handle ELBOW separately - it requires an older version of bioconductor / R -if("ELBOW" %in% new.packages){ - install.packages(paste0(Rlib_dir,"ELBOW_1.10.0.tar.gz"), - repos = NULL, type="source", INSTALL_opts = '--no-lock') - new.packages=new.packages[names(new.packages) != "ELBOW"] -} +#if("ELBOW" %in% new.packages){ +# install.packages(paste0(Rlib_dir,"ELBOW_1.10.0.tar.gz"), +# repos = NULL, type="source", INSTALL_opts = '--no-lock') +# new.packages=new.packages[names(new.packages) != "ELBOW"] +#} ``` ## Loading SampleInfo and Counts @@ -394,4 +394,4 @@ EnhancedVolcano(results_df, ```{r resultstable,echo=FALSE,include=TRUE} DT::datatable(results_df) -``` \ No newline at end of file +``` From d4c7a280d065a23d7690a8e8afc26b08677de8bb Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 10 Apr 2024 16:07:05 -0400 Subject: [PATCH 5/8] Fix bug reading FDR threshold instead of log2FC threshold --- workflow/scripts/_diff_markdown_wrapper.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/_diff_markdown_wrapper.R b/workflow/scripts/_diff_markdown_wrapper.R index c6dee47..6559f5f 100644 --- a/workflow/scripts/_diff_markdown_wrapper.R +++ b/workflow/scripts/_diff_markdown_wrapper.R @@ -87,7 +87,7 @@ if (debug){ condition1=args$condition1 condition2=args$condition2 fdr_cutoff=args$fdr_cutoff - log2fc_cutoff=args$fdr_cutoff + log2fc_cutoff=args$log2fc_cutoff indexcols="peakID" results=args$results report=args$report @@ -128,4 +128,4 @@ rmarkdown::render(args$rmd, params=parameters, output_file = report, intermediates_dir = paste(tmpdir,"intermediates_dir",sep="/"), - knit_root_dir = paste(tmpdir,"knit_root_dir",sep="/")) \ No newline at end of file + knit_root_dir = paste(tmpdir,"knit_root_dir",sep="/")) From d2af0b2a4bdf68b31fff9f15e2736103edb3f0dc Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 10 Apr 2024 16:08:52 -0400 Subject: [PATCH 6/8] Added DESeq2 resources --- resources/cluster_biowulf.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/resources/cluster_biowulf.yaml b/resources/cluster_biowulf.yaml index 9711767..746e83a 100644 --- a/resources/cluster_biowulf.yaml +++ b/resources/cluster_biowulf.yaml @@ -73,3 +73,8 @@ deeptools_heatmap: mem: 30g time: 01-00:00:00 ################################################################### +DESeq: + mem: 80g + time: 00-02:00:00 + threads: 2 +################################################################### From 50ae79cb87cef939897f5b9c1c2a5bd8763f1874 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 10 Apr 2024 16:18:26 -0400 Subject: [PATCH 7/8] Fixed typo in rule DESeq --- workflow/rules/diff.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/rules/diff.smk b/workflow/rules/diff.smk index 1e5ab24..d0152e8 100644 --- a/workflow/rules/diff.smk +++ b/workflow/rules/diff.smk @@ -264,8 +264,7 @@ rule DESeq: --fragmentsresults {output.results_frag} \\ --pdf {output.pdf} \\ --title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}" - fi - """ + """ rule diffbb: input: From 464bbed76ebca1e2477828ed05e5c249bd7c690c Mon Sep 17 00:00:00 2001 From: epehrsson Date: Mon, 15 Apr 2024 12:02:13 -0400 Subject: [PATCH 8/8] Updated changelog and docs; reverted R package installation --- CHANGELOG.md | 6 ++++++ docs/user-guide/preparing-files.md | 2 ++ workflow/scripts/_diff_markdown.Rmd | 12 ++++++------ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 633b69a..55aee68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,10 @@ ## CARLISLE development version +- Bug fixes (#127, @epehrsson) +- Removes single-sample group check for DESeq. +- Increases memory for DESeq. +- Ensures control replicate number is an integer. +- Fixes FDR cutoff misassigned to log2FC cutoff. +- Fixes `no_dedup` variable names in library normalization scripts. ## CARLISLE v2.5.0 - Refactors R packages to a common source location (#118, @slsevilla) diff --git a/docs/user-guide/preparing-files.md b/docs/user-guide/preparing-files.md index d31f624..18f3d7c 100644 --- a/docs/user-guide/preparing-files.md +++ b/docs/user-guide/preparing-files.md @@ -158,3 +158,5 @@ An example contrast file: | condition1 | condition2 | | --- | --- | | MOC1_siSmyd3_2m_25_HCHO | MOC1_siNC_2m_25_HCHO | + +**Note**: you must have more than one sample per condition in order to perform differential analysis with DESeq2 diff --git a/workflow/scripts/_diff_markdown.Rmd b/workflow/scripts/_diff_markdown.Rmd index 7422f3b..e4685ef 100644 --- a/workflow/scripts/_diff_markdown.Rmd +++ b/workflow/scripts/_diff_markdown.Rmd @@ -39,7 +39,7 @@ source(params$carlisle_functions) Rlib_dir=params$Rlib_dir Rpkg_config=params$Rpkg_config print(paste0("Using the lib.loc location: ",Rlib_dir)) -#assign(".lib.loc", Rlib_dir, envir = environment(.libPaths)) +assign(".lib.loc", Rlib_dir, envir = environment(.libPaths)) .libPaths() # read in package info @@ -51,11 +51,11 @@ pkg_df CARLISLE_HANDLE_PACKAGES(pkg_df) ##handle ELBOW separately - it requires an older version of bioconductor / R -#if("ELBOW" %in% new.packages){ -# install.packages(paste0(Rlib_dir,"ELBOW_1.10.0.tar.gz"), -# repos = NULL, type="source", INSTALL_opts = '--no-lock') -# new.packages=new.packages[names(new.packages) != "ELBOW"] -#} +if("ELBOW" %in% new.packages){ + install.packages(paste0(Rlib_dir,"ELBOW_1.10.0.tar.gz"), + repos = NULL, type="source", INSTALL_opts = '--no-lock') + new.packages=new.packages[names(new.packages) != "ELBOW"] +} ``` ## Loading SampleInfo and Counts