
Commit

Merge branch 'setup'
danilotat committed Sep 12, 2024
2 parents 3f33407 + 24e62e1 commit 5a48b46
Showing 13 changed files with 578 additions and 134 deletions.
217 changes: 95 additions & 122 deletions config/config_main.yaml
@@ -1,143 +1,116 @@
# Adjust parameters for custom analysis

# -- Temp directory for gatk -- #
TEMP_DIR: "temp_gatk"
slurm_log_dir: slurm-logs

# -- Base paths -- #

OUTPUT_FOLDER : "ENEO_output/"

# -- Relative paths to concatenate -- #
# -- OUTPUT STRUCTURE -- #
OUTPUT_FOLDER: ENEO_output/
TEMP_DIR: temp_gatk
datadirs:
index_folder: "genome_index"
salmon_idx: "salmon_index"
trimmed_reads: "trimmed_reads"
trimming_report: "fastp_report"
mapped_reads: "mapped_reads"
salmon_quant: "quantification"
expression: "expression_data"
bams: "bams"
utils: "utils"
HLA_typing: "HLA_typing"
BQSR: "BQSR"
VCF: "VCF"
VCF_germ: "VCF_germ"
VCF_out: "VCF_out"
peptides: "peptides"
BQSR: BQSR
HLA_typing: HLA_typing
VCF: VCF
VCF_germ: VCF_germ
VCF_out: VCF_out
bams: bams
expression: expression_data
index_folder: genome_index
logs:
align: "log/align"
annotate_variants: "log/annotate_variants"
bam_cleaning: "log/bam_cleaning"
bam_readcount: "log/bam_readcount"
base_recalibration: "log/base_recalibration"
decompose: "log/decompose"
export_quant: "log/export_quant"
intervals: "log/intervals"
pMHC: "log/pMHC"
salmon_quant: "log/salmon_quant"
snv_calling: "log/snv_calling"
sort_bam: "log/sort_bam"
star_idx: "log/star_idx"
t1k: "log/t1k"
trimming: "log/trimming"




# -- RESOURCES -- #
# Although all of these could be placed in a single folder, listing them explicitly is intentional:
# a bioinformatics research group may already have many of these files on their server in different
# locations, so it's pointless to move them into a single folder and break existing practices
# and/or workflows.

resources:
# -- references -- #
genome: "test_data/genome_chr6.fa.gz"
transcriptome: "test_data/chr6_cdna.fa.gz"
gtf: "test_data/chr6_105.gtf"
# -- vcfs -- #
gnomad: "test_data/gnomad_chr6.vcf.gz"
gsnps: "test_data/1000G_snsp_chr6.vcf.gz"
dbsnps: "test_data/dbsnpALFA_chr6.vcf.gz"
REDI: "test_data/REDI_chr6.BED.gz"
small_exac: "test_data/exac_chr6.vcf.gz"
indel: "test_data/indels_chr6.vcf.gz"
cosmic: "test_data/cosmic_chr6.vcf.gz"
# -- other -- #
# NOTE: for the sake of GitHub testing, we're not using the VEP cache and VEP plugins, relying
# only on online annotation. This doesn't scale to a whole VCF, which requires a local copy of
# the VEP cache. Refer to the documentation for the full setup.
hla_script: "workflow/scripts/HLA_typing.py"
germline_prob_script: "workflow/scripts/germProb.py"
toml_script: "workflow/scripts/createTOML.py"
intervals_coding: "workflow/supplementary_res/intervals_coding.BED.gz"
vcfanno_binary: "workflow/utils/vcfanno_linux64"
vcfanno_toml: "workflow/utils/vcfanno.toml"
vcfanno_lua: "workflow/utils/custom.lua"
t1k_file: "workflow/supplementary_res/hlaidx_rna_seq.fa"
giab_intervals: "test_data/GIAB_chr6.bed.gz"
# -- TOOLS PARAMS -- #
# moved from a tool-centered layout to a resource-type one
align: log/align
annotate_variants: log/annotate_variants
bam_cleaning: log/bam_cleaning
bam_readcount: log/bam_readcount
base_recalibration: log/base_recalibration
decompose: log/decompose
export_quant: log/export_quant
intervals: log/intervals
pMHC: log/pMHC
salmon_quant: log/salmon_quant
snv_calling: log/snv_calling
sort_bam: log/sort_bam
star_idx: log/star_idx
t1k: log/t1k
trimming: log/trimming
mapped_reads: mapped_reads
peptides: peptides
salmon_idx: salmon_index
salmon_quant: quantification
trimmed_reads: trimmed_reads
trimming_report: fastp_report
utils: utils
params:
STAR:
threads: 12
RAM:
extra: "--twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat "
t1k:
threads: 8
RAM:
extra:
salmon:
threads: 8
RAM:
extra:
index: "--keep-duplicates"
libtype: "A"
zip_ext: "gz"
extra: "--gcBias --seqBias --reduceGCMemory"
samtools:
BQSR:
RAM: 30000
extra: ''
threads: 4
RAM:
extra: ""
MarkDuplicates:
threads: 4
RAM: 30000
extra: ""
SplitNCigarReads:
extra: ''
threads: 4
STAR:
RAM: null
extra: '--twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat '
threads: 12
SplitNCigarReads:
RAM: 30000
extra: ""
BQSR:
extra: ''
threads: 4
RAM: 30000
extra: ""
gatk:
RAM: 20
RAM: 20
extra:
RGPU: unit1
RGSM: 20
pMHC:
threads: 4
pvacseq:
RAM: null
extra: null
threads: 2
salmon:
RAM: null
extra:
extra: --gcBias --seqBias --reduceGCMemory
index: --keep-duplicates
libtype: A
zip_ext: gz
threads: 8
samtools:
RAM: null
extra: ''
threads: 4
strelka2:
RAM: null
extra: null
threads: 8
t1k:
RAM: null
extra: null
threads: 8
RAM:
extra:
vcfanno:
RAM: null
extra: null
threads: 8
RAM:
extra:
pvacseq:
threads: 2
RAM:
extra:
vep:
threads:
RAM:
RAM: null
extra:
assembly: "GRCh38"
filtering: "--gencode_basic --coding_only --no_intergenic"
assembly: GRCh38
filtering: --gencode_basic --coding_only --no_intergenic
plugins:
Wildtype: workflow/utils/vep_plugins/Wildtype.pm
Frameshift: workflow/utils/vep_plugins/Frameshift.pm
pMHC:
threads: 4
Wildtype: workflow/utils/vep_plugins/Wildtype.pm
threads: null
resources:
cosmic: test_data/cosmic_chr6.vcf.gz
dbsnps: null
genome: null
germline_prob_script: workflow/scripts/germProb.py
giab_intervals: workflow/supplementary_res/GRCh38_giab_merged.bed.gz
gnomad: null
gsnps: null
gtf: null
hla_script: workflow/scripts/HLA_typing.py
indel: null
intervals_coding: workflow/supplementary_res/intervals_coding.BED.gz
REDI: null
t1k_file: workflow/supplementary_res/hlaidx_rna_seq.fa
toml_script: workflow/scripts/createTOML.py
transcriptome: null
vep_cache: null
vcfanno_binary: workflow/utils/vcfanno_linux64
vcfanno_lua: workflow/utils/custom.lua
vcfanno_toml: workflow/utils/vcfanno.toml
slurm_log_dir: slurm-logs
26 changes: 26 additions & 0 deletions docs/hpc.md
@@ -0,0 +1,26 @@
# Run on HPC

ENEO was developed and tested on High Performance Computing (HPC) clusters using the SLURM workload manager. Although Snakemake introduced executor plugins in versions `>8.0`, the preferred way to launch the workflow on SLURM is still through a defined profile.

Insert the account and partition in `workflow/profile/slurm_profile/config.yaml`, together with any other flags required for submitting jobs on the HPC platform in use.

``` yaml
cluster:
mkdir -p slurm-logs/{rule} &&
sbatch
--cpus-per-task={resources.ncpus}
--mem={resources.mem}
--time={resources.time}
--job-name=smk-{rule}-{wildcards}
--output=slurm-logs/{rule}/{rule}-{wildcards}-%j.out
--partition=<partitionhere>
--account=<accounthere>
```

This will create a folder called `slurm-logs` with a subfolder for each rule, where each patient will have a different log file.

Then execute the pipeline with:

```
snakemake --profile workflow/profile/slurm_profile
```
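
On some clusters it is preferable to submit the head Snakemake process itself as a SLURM job rather than keeping it on a login node. A minimal wrapper sketch is shown below; the job name, memory, time, partition, and account values are placeholders to adapt to your cluster.

```bash
#!/bin/bash
#SBATCH --job-name=ENEO_driver
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G
#SBATCH --time=48:00:00
#SBATCH --partition=<partitionhere>
#SBATCH --account=<accounthere>
#SBATCH --output=slurm-logs/ENEO_driver-%j.out

# The driver process submits one SLURM job per rule, following
# workflow/profile/slurm_profile/config.yaml.
snakemake --profile workflow/profile/slurm_profile
```
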
20 changes: 20 additions & 0 deletions docs/index.md
@@ -0,0 +1,20 @@
# Introduction

ENEO is a Snakemake workflow developed for the identification of cancer neoantigens using tumor RNA-seq alone, without requiring matched controls or additional sequencing experiments. You can read more in the preprint [here](https://www.biorxiv.org/content/10.1101/2024.08.08.607127v1).


## Quick Start

To start, clone the repo using

```
git clone https://github.com/ctglab/ENEO.git
```

Before running the pipeline, make sure [snakemake](https://snakemake.readthedocs.io/en/stable/) and [singularity](https://docs.sylabs.io/guides/3.1/user-guide/index.html) are installed. Then execute the pipeline using

```
snakemake --use-singularity --use-conda --cores 4
```
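
Before a full run, a dry run can help confirm that the configuration and input files are resolved correctly; Snakemake's standard `-n` flag only lists the jobs that would be executed, without running them.

```
snakemake --use-singularity --use-conda --cores 4 -n
```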

If you spot any issue, please report it in the GitHub issue tracker: https://github.com/ctglab/ENEO/issues
74 changes: 74 additions & 0 deletions docs/resources.md
@@ -0,0 +1,74 @@
# Setup resources

ENEO heavily depends on public genetic databases for germline probability estimation. Data must be downloaded from multiple resources and converted so that the annotations are concordant (i.e. Ensembl-based).

## Automatic setup
Work in progress.

![](https://imgs.xkcd.com/comics/automation.png)

## Manual setup

To prepare the input files, you need an environment with the following tools installed:

- bedtools
- bcftools
- tabix
- gatk4

The easiest way is to create a single conda environment:

```
conda create --name eneo_setup -c bioconda -c conda-forge bedtools bcftools tabix gatk4
```
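
Then activate the environment and verify that the tools are on the `PATH` before proceeding:

```
conda activate eneo_setup
bedtools --version
bcftools --version
tabix --version
gatk --version
```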


### Genome and Transcriptome files

Since ENEO uses Ensembl annotation, the files can be downloaded from the Ensembl FTP site.

**Genome**:
```
wget https://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
gatk CreateSequenceDictionary -R Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz -O Homo_sapiens.GRCh38.dna.primary_assembly.fa.dict
```
**Transcriptome**:
```
wget https://ftp.ensembl.org/pub/current/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
```

### Genetic population resources

ENEO uses multiple databases to infer the germline likelihood of candidate variants. Most of them use different chromosome naming conventions, so you need to convert them. A conversion table is available at
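
Whichever conversion table you use, the renaming itself can be done with `bcftools annotate --rename-chrs`, which takes a two-column mapping file (old name, new name). The sketch below uses placeholder file names:

```bash
# Rename chr-prefixed contigs (e.g. "chr6") to Ensembl naming (e.g. "6") and re-index.
# chr_to_ensembl.txt contains one "old_name new_name" pair per line.
bcftools annotate --rename-chrs chr_to_ensembl.txt input.vcf.gz -Oz -o renamed.vcf.gz
tabix -p vcf renamed.vcf.gz
```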

#### dbSNP

```bash
wget -c https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/All_20180418.vcf.gz
wget -c https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/All_20180418.vcf.gz.tbi
```

#### 1000G

```bash
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi
```
#### Known indels

```bash
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi
```

#### gnomAD

```bash
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/somatic-hg38/af-only-gnomad.hg38.vcf.gz
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/somatic-hg38/af-only-gnomad.hg38.vcf.gz.tbi
```
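
The Broad hg38 resources above use `chr`-prefixed contig names, so the renaming step shown earlier applies to them before use. For a quick local test, a single-chromosome subset can also be extracted, similar to the chr6-only files shipped in `test_data/`; the file names below are only illustrative.

```bash
# Subset a renamed (and tabix-indexed) gnomAD VCF to chromosome 6, then index it.
bcftools view -r 6 af-only-gnomad.hg38.renamed.vcf.gz -Oz -o gnomad_chr6.vcf.gz
tabix -p vcf gnomad_chr6.vcf.gz
```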




