# config.yaml
# Run configuration for the pipeline: input tables, reference data,
# Mutect2 / filtering options, annotation sources, and container images.

# Input tables
# patients: CSV with one column, "patient", listing every patient ID
patients: 'patients.csv'
# units: CSV describing each sample / read group
units: 'units.csv'

# Sequencing type
#   WES - whole exome sequencing
#   WGS - whole genome sequencing
# Do not mix WES and WGS samples in one run: the two data types need different
# computational resources and interrogate different genomic regions, so process
# them separately with two copies of this pipeline.
sequencing_type: 'WES'

# SLURM log directory (only used with cluster execution)
# May be an absolute or a relative path; the pipeline creates the directory
# when it does not already exist.
slurm_log_dir: 'slurm-logs'


# Genomic regions (required)
# Uncompressed BED (.bed) file listing the regions used for variant calling
#   WES: coverage is evaluated from the reads falling in these regions
#   WGS: coverage is evaluated across the genome in 500 bp windows
genomic_regions: '/ihome/alee/fangyuan/projects/refgenome/hg38/IDT_Exome_V1_hg38.bed'

# Reference genome
# ref_dir: directory holding the reference FASTA and its supporting files
ref_dir: '/ihome/alee/fangyuan/projects/refgenome/hg38/'
# ref_fasta: reference FASTA (must be unzipped)
# Source:
#   https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/
ref_fasta: '/ihome/alee/fangyuan/projects/refgenome/hg38/Homo_sapiens_assembly38.fasta'
# known_sites: comma-separated VCF files with known mutations
# All of them should be in the reference directory
# Source:
#   https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/
# Recommended by:
#   https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle
#   from https://github.com/tjbencomo/ngs-pipeline
# The folded scalar (>-) joins the lines below with single spaces, producing
# one "a, b, c" string with no trailing newline.
known_sites: >-
  /ihome/alee/fangyuan/projects/refgenome/hg38/Homo_sapiens_assembly38.dbsnp138.vcf,
  /ihome/alee/fangyuan/projects/refgenome/hg38/Homo_sapiens_assembly38.known_indels.vcf.gz,
  /ihome/alee/fangyuan/projects/refgenome/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz
# germline_resource: AF-only VCF with common allele frequencies (gnomAD file)
# Fetch with:
#   gsutil cp gs://gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz ./
germline_resource: '/ihome/alee/fangyuan/projects/refgenome/hg38/gnomAD/af-only-gnomad.hg38.vcf.gz'
# contamination_resource: VCF of biallelic variants only, with AF data (ExAC file)
# Fetch with:
#   module load gsutil/5.16
#   gsutil cp gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz ./
#   gsutil cp gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz.tbi ./
contamination_resource: '/ihome/alee/fangyuan/projects/refgenome/hg38/gnomAD/small_exac_common_3.hg38.vcf.gz'


# GATK temporary directory
# Large files can overwhelm the default tmp directory (/tmp). If GATK runs out
# of storage during a run, point this at a location with more space
# (e.g. $SCRATCH) - ideally on fast storage.
tmp_dir: '/ihome/alee/fangyuan/projects/pdo/tmp/'


# Mutect2 settings
# num_workers: number of Mutect2 workers per sample
num_workers: 40
# tumor_only: call variants without a matched normal
# Advised only as a last resort - try to obtain matched normal samples
tumor_only: false
# mutect2_flags: additional flags for Mutect2
# The value is the literal string "None", not YAML null
mutect2_flags: 'None'
# filter_flags: additional flags for FilterMutectCalls
filter_flags: '--min-reads-per-strand 1'

# Panel of Normals (PON)
# use_pon: run Mutect2 variant calling with a PON
use_pon: true
# pon_vcf: file path of the PON
# Specify None to create a new PON from all the normal samples
# (the value is the literal string "None", not YAML null)
pon_vcf: 'None'


# Extra variant-filtering thresholds
#   true  - every SNV must have at least 15 reads covering the position and
#           at least 5 reads carrying the mutant allele
#   false - only the FilterMutectCalls criteria are used
stringent_filtering: false


# Annotation data sources
# vep_dir: directory containing the VEP annotation data
vep_dir: '/bgfs/alee/LO_LAB/Personal/chelsea/projects/ensembl-vep/cache104'
# assembly_version: name of the assembly version
assembly_version: 'GRCh38'
# center_name: value written to the Center field of the combined MAF
center_name: 'Lee Lab'
# alternate_isoforms: alternate isoforms for vcf2maf selection
# Set to None to use the Ensembl defaults
# (the value is the literal string "None", not YAML null)
alternate_isoforms: 'None'

# Singularity containers
# Container used by each group of pipeline steps. Any string that
# `singularity pull` can resolve is accepted - a local path or a URL.
# NOTE(review): only the multiqc and fastqc images carry an explicit tag;
# consider pinning the others if reproducibility matters.
gatk_container: 'docker://tbencomo/gatk-bwa-samtools'
multiqc_container: 'docker://ewels/multiqc:v1.11'
mosdepth_container: 'docker://tbencomo/mosdepth'
fastqc_container: 'docker://biocontainers/fastqc:v0.11.9_cv8'
vep_container: 'docker://tbencomo/vep-vcf2maf'
eda_container: 'docker://tbencomo/eda'

# Commands for pulling each container image manually:
# singularity pull docker://tbencomo/gatk-bwa-samtools
# singularity pull docker://ewels/multiqc:v1.11
# singularity pull docker://tbencomo/mosdepth
# singularity pull docker://biocontainers/fastqc:v0.11.9_cv8
# singularity pull docker://tbencomo/vep-vcf2maf
# singularity pull docker://tbencomo/eda