---
# config.yaml
# Input sheets
# patients: CSV file with a single column, "patient", listing every patient ID
patients: 'patients.csv'
# units: CSV file with info on each sample / read group
units: 'units.csv'
# Sequencing type
#   WES = whole exome
#   WGS = whole genome
# If you have both WES and WGS samples, process them separately with
# 2 versions of this pipeline: the two data types require different
# computational resources and different genomic regions to interrogate.
sequencing_type: 'WES'
# SLURM log files
# Directory in which SLURM log files are saved (when using cluster execution).
# May be an absolute or a relative path; if the directory doesn't exist,
# the pipeline will create it.
slurm_log_dir: 'slurm-logs'
# Genomic regions
# Uncompressed BED (.bed) file with the regions for variant calling.
# Must be specified!
#   WES data: coverage is evaluated based on reads in these regions
#   WGS data: coverage is evaluated across the genome in 500 bp windows
genomic_regions: '/ihome/alee/fangyuan/projects/refgenome/hg38/IDT_Exome_V1_hg38.bed'
# Reference genome sources
# Directory where the reference FASTA and its supporting files are stored
ref_dir: '/ihome/alee/fangyuan/projects/refgenome/hg38/'
# Reference FASTA (should be unzipped); given here as a full path.
# Source:
#   https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/
ref_fasta: '/ihome/alee/fangyuan/projects/refgenome/hg38/Homo_sapiens_assembly38.fasta'
# Comma-separated VCF files with known mutations.
# They should all be in the reference directory.
# Source:
#   https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/
# Recommended by
#   https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle
# from https://github.com/tjbencomo/ngs-pipeline
# The folded scalar (>-) joins the lines below with single spaces, so the
# value is one comma-separated string with no trailing newline.
known_sites: >-
  /ihome/alee/fangyuan/projects/refgenome/hg38/Homo_sapiens_assembly38.dbsnp138.vcf,
  /ihome/alee/fangyuan/projects/refgenome/hg38/Homo_sapiens_assembly38.known_indels.vcf.gz,
  /ihome/alee/fangyuan/projects/refgenome/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz
# AF-only VCF with common allele frequencies - use the gnomAD file.
# Downloaded with:
#   gsutil cp gs://gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz ./
germline_resource: '/ihome/alee/fangyuan/projects/refgenome/hg38/gnomAD/af-only-gnomad.hg38.vcf.gz'
# Biallelic-variants-only VCF with AF data - use the EXAC file.
# Downloaded with:
#   module load gsutil/5.16
#   gsutil cp gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz ./
#   gsutil cp gs://gatk-best-practices/somatic-hg38/small_exac_common_3.hg38.vcf.gz.tbi ./
contamination_resource: '/ihome/alee/fangyuan/projects/refgenome/hg38/gnomAD/small_exac_common_3.hg38.vcf.gz'
# GATK TMP directory
# Large files may overwhelm the default tmp directory (/tmp).
# If GATK is running out of storage during a run, set this to an alternative
# filepath that has larger storage (e.g. $SCRATCH). Ideally this is fast storage.
tmp_dir: '/ihome/alee/fangyuan/projects/pdo/tmp/'
# Mutect2 settings
# Number of Mutect2 workers per sample
num_workers: 40
# Variant calling without a matched normal.
# Only advised as a last resort - try to get matched normal samples.
tumor_only: false
# Additional flags for Mutect2.
# Quoted deliberately: a bare None in YAML is the string "None", not null.
# NOTE(review): presumably the pipeline reads the string None as
# "no additional flags" - confirm against the workflow code.
mutect2_flags: 'None'
# Additional flags for FilterMutectCalls
filter_flags: '--min-reads-per-strand 1'
# Panel of Normals (PON)
# Whether to use a PON with Mutect2 for variant calling
use_pon: true
# Filepath of the PON. To create a new PON from all the normal samples,
# specify None (the string "None", not a YAML null).
pon_vcf: 'None'
# Apply additional thresholds to filter variants
#   true:  each SNV must have a minimum of 15 reads covering the position
#          and at least 5 reads with the mutant allele
#   false: only the FilterMutectCalls criteria are used
stringent_filtering: false
# Annotation data sources
# Directory with the VEP annotation data
vep_dir: '/bgfs/alee/LO_LAB/Personal/chelsea/projects/ensembl-vep/cache104'
# Assembly version name
assembly_version: 'GRCh38'
# Value for the Center field in the combined MAF
center_name: 'Lee Lab'
# Alternate isoforms for vcf2maf selection.
# Set to None (the string "None") to use the Ensembl defaults.
alternate_isoforms: 'None'
## Singularity containers
## These keys specify the container to use for the various pipeline steps.
## Each value can be any string (either a local path or a URL) that can be
## queried using `singularity pull`.
gatk_container: 'docker://tbencomo/gatk-bwa-samtools'
multiqc_container: 'docker://ewels/multiqc:v1.11'
mosdepth_container: 'docker://tbencomo/mosdepth'
fastqc_container: 'docker://biocontainers/fastqc:v0.11.9_cv8'
vep_container: 'docker://tbencomo/vep-vcf2maf'
eda_container: 'docker://tbencomo/eda'
# Matching pull commands, one per image above:
#   singularity pull docker://tbencomo/gatk-bwa-samtools
#   singularity pull docker://ewels/multiqc:v1.11
#   singularity pull docker://tbencomo/mosdepth
#   singularity pull docker://biocontainers/fastqc:v0.11.9_cv8
#   singularity pull docker://tbencomo/vep-vcf2maf
#   singularity pull docker://tbencomo/eda