
Commit

Merge branch 'setup'
danilotat committed Sep 12, 2024
2 parents 3f33407 + 24e62e1 commit 5a48b46
Showing 13 changed files with 578 additions and 134 deletions.
217 changes: 95 additions & 122 deletions config/config_main.yaml
@@ -1,143 +1,116 @@
# Adjust parameters for custom analysis

# -- Temp directory for gatk -- #
TEMP_DIR: "temp_gatk"
slurm_log_dir: slurm-logs

# -- Base paths -- #

OUTPUT_FOLDER : "ENEO_output/"

# -- Relative paths to concatenate -- #
# -- OUTPUT STRUCTURE -- #
OUTPUT_FOLDER: ENEO_output/
TEMP_DIR: temp_gatk
datadirs:
index_folder: "genome_index"
salmon_idx: "salmon_index"
trimmed_reads: "trimmed_reads"
trimming_report: "fastp_report"
mapped_reads: "mapped_reads"
salmon_quant: "quantification"
expression: "expression_data"
bams: "bams"
utils: "utils"
HLA_typing: "HLA_typing"
BQSR: "BQSR"
VCF: "VCF"
VCF_germ: "VCF_germ"
VCF_out: "VCF_out"
peptides: "peptides"
BQSR: BQSR
HLA_typing: HLA_typing
VCF: VCF
VCF_germ: VCF_germ
VCF_out: VCF_out
bams: bams
expression: expression_data
index_folder: genome_index
logs:
align: "log/align"
annotate_variants: "log/annotate_variants"
bam_cleaning: "log/bam_cleaning"
bam_readcount: "log/bam_readcount"
base_recalibration: "log/base_recalibration"
decompose: "log/decompose"
export_quant: "log/export_quant"
intervals: "log/intervals"
pMHC: "log/pMHC"
salmon_quant: "log/salmon_quant"
snv_calling: "log/snv_calling"
sort_bam: "log/sort_bam"
star_idx: "log/star_idx"
t1k: "log/t1k"
trimming: "log/trimming"




# -- RESOURCES -- #
# Although all of these could be placed in a single folder, listing them explicitly is intentional:
# a bioinformatics research group may already have many of these files on their server in different
# locations, so it's pointless to move them into a single folder and break existing practices
# and/or workflows.

resources:
# -- references -- #
genome: "test_data/genome_chr6.fa.gz"
transcriptome: "test_data/chr6_cdna.fa.gz"
gtf: "test_data/chr6_105.gtf"
# -- vcfs -- #
gnomad: "test_data/gnomad_chr6.vcf.gz"
gsnps: "test_data/1000G_snsp_chr6.vcf.gz"
dbsnps: "test_data/dbsnpALFA_chr6.vcf.gz"
REDI: "test_data/REDI_chr6.BED.gz"
small_exac: "test_data/exac_chr6.vcf.gz"
indel: "test_data/indels_chr6.vcf.gz"
cosmic: "test_data/cosmic_chr6.vcf.gz"
# -- other -- #
# NOTE: for the sake of GitHub testing, we're not using the VEP cache and VEP plugins, relying
# only on online annotation. This doesn't scale to a whole VCF, which requires a local copy of
# the VEP cache. Refer to the documentation for the full setup.
hla_script: "workflow/scripts/HLA_typing.py"
germline_prob_script: "workflow/scripts/germProb.py"
toml_script: "workflow/scripts/createTOML.py"
intervals_coding: "workflow/supplementary_res/intervals_coding.BED.gz"
vcfanno_binary: "workflow/utils/vcfanno_linux64"
vcfanno_toml: "workflow/utils/vcfanno.toml"
vcfanno_lua: "workflow/utils/custom.lua"
t1k_file: "workflow/supplementary_res/hlaidx_rna_seq.fa"
giab_intervals: "test_data/GIAB_chr6.bed.gz"
# -- TOOLS PARAMS -- #
# moved from a tool-centered layout to a resource-type one
align: log/align
annotate_variants: log/annotate_variants
bam_cleaning: log/bam_cleaning
bam_readcount: log/bam_readcount
base_recalibration: log/base_recalibration
decompose: log/decompose
export_quant: log/export_quant
intervals: log/intervals
pMHC: log/pMHC
salmon_quant: log/salmon_quant
snv_calling: log/snv_calling
sort_bam: log/sort_bam
star_idx: log/star_idx
t1k: log/t1k
trimming: log/trimming
mapped_reads: mapped_reads
peptides: peptides
salmon_idx: salmon_index
salmon_quant: quantification
trimmed_reads: trimmed_reads
trimming_report: fastp_report
utils: utils
params:
STAR:
threads: 12
RAM:
extra: "--twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat "
t1k:
threads: 8
RAM:
extra:
salmon:
threads: 8
RAM:
extra:
index: "--keep-duplicates"
libtype: "A"
zip_ext: "gz"
extra: "--gcBias --seqBias --reduceGCMemory"
samtools:
BQSR:
RAM: 30000
extra: ''
threads: 4
RAM:
extra: ""
MarkDuplicates:
threads: 4
RAM: 30000
extra: ""
SplitNCigarReads:
extra: ''
threads: 4
STAR:
RAM: null
extra: '--twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat '
threads: 12
SplitNCigarReads:
RAM: 30000
extra: ""
BQSR:
extra: ''
threads: 4
RAM: 30000
extra: ""
gatk:
RAM: 20
RAM: 20
extra:
RGPU: unit1
RGSM: 20
pMHC:
threads: 4
pvacseq:
RAM: null
extra: null
threads: 2
salmon:
RAM: null
extra:
extra: --gcBias --seqBias --reduceGCMemory
index: --keep-duplicates
libtype: A
zip_ext: gz
threads: 8
samtools:
RAM: null
extra: ''
threads: 4
strelka2:
RAM: null
extra: null
threads: 8
t1k:
RAM: null
extra: null
threads: 8
RAM:
extra:
vcfanno:
RAM: null
extra: null
threads: 8
RAM:
extra:
pvacseq:
threads: 2
RAM:
extra:
vep:
threads:
RAM:
RAM: null
extra:
assembly: "GRCh38"
filtering: "--gencode_basic --coding_only --no_intergenic"
assembly: GRCh38
filtering: --gencode_basic --coding_only --no_intergenic
plugins:
Wildtype: workflow/utils/vep_plugins/Wildtype.pm
Frameshift: workflow/utils/vep_plugins/Frameshift.pm
pMHC:
threads: 4
Wildtype: workflow/utils/vep_plugins/Wildtype.pm
threads: null
resources:
cosmic: test_data/cosmic_chr6.vcf.gz
dbsnps: null
genome: null
germline_prob_script: workflow/scripts/germProb.py
giab_intervals: workflow/supplementary_res/GRCh38_giab_merged.bed.gz
gnomad: null
gsnps: null
gtf: null
hla_script: workflow/scripts/HLA_typing.py
indel: null
intervals_coding: workflow/supplementary_res/intervals_coding.BED.gz
REDI: null
t1k_file: workflow/supplementary_res/hlaidx_rna_seq.fa
toml_script: workflow/scripts/createTOML.py
transcriptome: null
vep_cache: null
vcfanno_binary: workflow/utils/vcfanno_linux64
vcfanno_lua: workflow/utils/custom.lua
vcfanno_toml: workflow/utils/vcfanno.toml
slurm_log_dir: slurm-logs
26 changes: 26 additions & 0 deletions docs/hpc.md
@@ -0,0 +1,26 @@
# Run on HPC

ENEO was developed and tested on High Performance Computing (HPC) clusters using the SLURM workload manager. Although Snakemake introduced executor plugins in versions `>8.0`, the preferred way to launch the workflow on SLURM is still through a defined profile.

Insert the account and partition in `workflow/profile/slurm_profile/config.yaml`, together with any other flags required for submitting jobs on the HPC platform in use.

``` yaml
cluster:
mkdir -p slurm-logs/{rule} &&
sbatch
--cpus-per-task={resources.ncpus}
--mem={resources.mem}
--time={resources.time}
--job-name=smk-{rule}-{wildcards}
--output=slurm-logs/{rule}/{rule}-{wildcards}-%j.out
--partition=<partitionhere>
--account=<accounthere>
```

This will create a folder called `slurm-logs` with a subfolder for each rule, where each patient will have a different log file.

Then execute the pipeline with:

```
snakemake --profile workflow/profile/slurm_profile
```
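
On some clusters it is preferable to submit the head Snakemake process itself as a SLURM job rather than keeping it on a login node. A minimal wrapper sketch is shown below; the job name, memory, time, partition, and account values are placeholders to adapt to your cluster.

```bash
#!/bin/bash
#SBATCH --job-name=ENEO_driver
#SBATCH --cpus-per-task=1
#SBATCH --mem=4G
#SBATCH --time=48:00:00
#SBATCH --partition=<partitionhere>
#SBATCH --account=<accounthere>
#SBATCH --output=slurm-logs/ENEO_driver-%j.out

# The driver process submits one SLURM job per rule, following
# workflow/profile/slurm_profile/config.yaml.
snakemake --profile workflow/profile/slurm_profile
```
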
20 changes: 20 additions & 0 deletions docs/index.md
@@ -0,0 +1,20 @@
# Introduction

ENEO is a Snakemake workflow developed for the identification of cancer neoantigens using tumor RNA-seq alone, without requiring matched controls or additional sequencing experiments. You can read more in the preprint [here](https://www.biorxiv.org/content/10.1101/2024.08.08.607127v1).


## Quick Start

To start, clone the repo using

```
git clone https://github.com/ctglab/ENEO.git
```

Before running the pipeline, make sure [snakemake](https://snakemake.readthedocs.io/en/stable/) and [singularity](https://docs.sylabs.io/guides/3.1/user-guide/index.html) are installed. Then execute the pipeline using

```
snakemake --use-singularity --use-conda --cores 4
```
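
Before a full run, a dry run can help confirm that the configuration and input files are resolved correctly; Snakemake's standard `-n` flag only lists the jobs that would be executed, without running them.

```
snakemake --use-singularity --use-conda --cores 4 -n
```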

If you spot any issue, please report it in the GitHub issue tracker: https://github.com/ctglab/ENEO/issues
74 changes: 74 additions & 0 deletions docs/resources.md
@@ -0,0 +1,74 @@
# Setup resources

ENEO heavily depends on public genetic databases for germline probability estimation. Data must be downloaded from multiple resources and converted so that the annotations are concordant (i.e. Ensembl-based).

## Automatic setup
Work in progress.

![](https://imgs.xkcd.com/comics/automation.png)

## Manual setup

To prepare the input files, you need an environment with the following tools installed:

- bedtools
- bcftools
- tabix
- gatk4

The easiest way is to create a single conda environment:

```
conda create --name eneo_setup -c bioconda -c conda-forge bedtools bcftools tabix gatk4
```
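
Then activate the environment and verify that the tools are on the `PATH` before proceeding:

```
conda activate eneo_setup
bedtools --version
bcftools --version
tabix --version
gatk --version
```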


### Genome and Transcriptome files

Since ENEO uses Ensembl annotation, the files can be downloaded from the Ensembl FTP site.

**Genome**:
```
wget https://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
gatk CreateSequenceDictionary -R Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz -O Homo_sapiens.GRCh38.dna.primary_assembly.fa.dict
```
**Transcriptome**:
```
wget https://ftp.ensembl.org/pub/current/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
```

### Genetic population resources

ENEO uses multiple databases to infer the germline likelihood of candidate variants. Most of them use different chromosome naming conventions, so you need to convert them. A conversion table is available at
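
Whichever conversion table you use, the renaming itself can be done with `bcftools annotate --rename-chrs`, which takes a two-column mapping file (old name, new name). The sketch below uses placeholder file names:

```bash
# Rename chr-prefixed contigs (e.g. "chr6") to Ensembl naming (e.g. "6") and re-index.
# chr_to_ensembl.txt contains one "old_name new_name" pair per line.
bcftools annotate --rename-chrs chr_to_ensembl.txt input.vcf.gz -Oz -o renamed.vcf.gz
tabix -p vcf renamed.vcf.gz
```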

#### dbSNP

```bash
wget -c https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/All_20180418.vcf.gz
wget -c https://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/All_20180418.vcf.gz.tbi
```

#### 1000G

```bash
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi
```
#### Known indels

```bash
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi
```

#### gnomAD

```bash
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/somatic-hg38/af-only-gnomad.hg38.vcf.gz
wget https://storage.googleapis.com/gcp-public-data--broad-references/hg38/v0/somatic-hg38/af-only-gnomad.hg38.vcf.gz.tbi
```
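
The Broad hg38 resources above use `chr`-prefixed contig names, so the renaming step shown earlier applies to them before use. For a quick local test, a single-chromosome subset can also be extracted, similar to the chr6-only files shipped in `test_data/`; the file names below are only illustrative.

```bash
# Subset a renamed (and tabix-indexed) gnomAD VCF to chromosome 6, then index it.
bcftools view -r 6 af-only-gnomad.hg38.renamed.vcf.gz -Oz -o gnomad_chr6.vcf.gz
tabix -p vcf gnomad_chr6.vcf.gz
```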




