From f02ea34bb851967cae985f043baa53b17c8e0d7c Mon Sep 17 00:00:00 2001
From: Panagiotis Moulos
Date: Mon, 14 Feb 2022 16:57:31 +0200
Subject: [PATCH] Renaming according to order

---
 03-data.sh          |  71 ++++++++++++++++++
 05-trimgalore.sh    |  68 +++++++++++++++++
 06-refindex.sh      |   7 ++
 07-alignment.sh     |  39 ++++++++++
 09-bamstats.sh      |  86 ++++++++++++++++++++++
 10-signal.sh        |  35 +++++++++
 11-bqsr.sh          | 119 ++++++++++++++++++++++++++++++
 12-gatk.sh          | 176 ++++++++++++++++++++++++++++++++++++++++++++
 13-freebayes.sh     | 129 ++++++++++++++++++++++++++++++++
 14-deepvariant.sh   |  60 +++++++++++++++
 15-annotation.sh    | 130 ++++++++++++++++++++++++++++++++
 16-consolidation.sh |  73 ++++++++++++++++++
 12 files changed, 993 insertions(+)
 create mode 100755 03-data.sh
 create mode 100755 05-trimgalore.sh
 create mode 100755 06-refindex.sh
 create mode 100755 07-alignment.sh
 create mode 100755 09-bamstats.sh
 create mode 100755 10-signal.sh
 create mode 100755 11-bqsr.sh
 create mode 100755 12-gatk.sh
 create mode 100755 13-freebayes.sh
 create mode 100755 14-deepvariant.sh
 create mode 100755 15-annotation.sh
 create mode 100755 16-consolidation.sh

diff --git a/03-data.sh b/03-data.sh
new file mode 100755
index 0000000..bc7565d
--- /dev/null
+++ b/03-data.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# Setup paths
+HOME_PATH=/home/user/analysis
+FASTQ_PATH=$HOME_PATH/fastq
+mkdir -p $FASTQ_PATH
+
+# Download raw data from the 1000 Genomes Project
+cd $FASTQ_PATH
+
+# HG00119
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099967/SRR099967_1.fastq.gz
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099967/SRR099967_2.fastq.gz
+mv SRR099967_1.fastq.gz HG00119_1.fastq.gz
+mv SRR099967_2.fastq.gz HG00119_2.fastq.gz
+
+# HG00133
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099969/SRR099969_1.fastq.gz
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099969/SRR099969_2.fastq.gz
+mv SRR099969_1.fastq.gz HG00133_1.fastq.gz
+mv SRR099969_2.fastq.gz HG00133_2.fastq.gz
+
+# HG00145
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099957/SRR099957_1.fastq.gz
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099957/SRR099957_2.fastq.gz
+mv SRR099957_1.fastq.gz HG00145_1.fastq.gz
+mv SRR099957_2.fastq.gz HG00145_2.fastq.gz
+
+# HG00239
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099958/SRR099958_1.fastq.gz
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099958/SRR099958_2.fastq.gz
+mv SRR099958_1.fastq.gz HG00239_1.fastq.gz
+mv SRR099958_2.fastq.gz HG00239_2.fastq.gz
+
+# HG00258
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099954/SRR099954_1.fastq.gz
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099954/SRR099954_2.fastq.gz
+mv SRR099954_1.fastq.gz HG00258_1.fastq.gz
+mv SRR099954_2.fastq.gz HG00258_2.fastq.gz
+
+# HG00265
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099968/SRR099968_1.fastq.gz
+wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/SRR099968/SRR099968_2.fastq.gz
+mv SRR099968_1.fastq.gz HG00265_1.fastq.gz
+mv SRR099968_2.fastq.gz HG00265_2.fastq.gz
+
+cd $CWD
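The per-sample wget/mv blocks in 03-data.sh are easy to get out of sync when samples are added or renamed. A minimal loop-based sketch of the same downloads, assuming the same ENA directory layout and run-accession-to-sample mapping as above (requires bash 4 associative arrays):

    declare -A SAMPLES=(
        [SRR099967]=HG00119 [SRR099969]=HG00133 [SRR099957]=HG00145
        [SRR099958]=HG00239 [SRR099954]=HG00258 [SRR099968]=HG00265
    )
    for RUN in "${!SAMPLES[@]}"
    do
        for MATE in 1 2
        do
            # Fetch each mate and rename it to the sample-based convention
            wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR099/$RUN/${RUN}_${MATE}.fastq.gz
            mv ${RUN}_${MATE}.fastq.gz ${SAMPLES[$RUN]}_${MATE}.fastq.gz
        done
    done
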
diff --git a/05-trimgalore.sh b/05-trimgalore.sh
new file mode 100755
index 0000000..0fa9e1e
--- /dev/null
+++ b/05-trimgalore.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+HOME_PATH=/home/user/analysis
+FASTQ_PATH=$HOME_PATH/fastq
+TRIMGALORE_COMMAND=$TRIMGALORE_PATH/trim_galore
+CUTADAPT_COMMAND=$CUTADAPT_PATH/cutadapt
+TRIMGALORE_OUTPUT=$HOME_PATH/fastq_qual
+CORES=4
+
+if [ ! -d $TRIMGALORE_OUTPUT ]
+then
+    mkdir -p $TRIMGALORE_OUTPUT
+fi
+
+for FILE in $FASTQ_PATH/*_1.fastq.gz
+do
+    BASE=`basename $FILE | sed s/_1\.fastq\.gz//`
+    echo "Processing $BASE"
+    mkdir -p $TRIMGALORE_OUTPUT/$BASE
+    F1=$FASTQ_PATH/$BASE"_1.fastq.gz"
+    F2=$FASTQ_PATH/$BASE"_2.fastq.gz"
+    $TRIMGALORE_COMMAND \
+        --quality 30 \
+        --length 50 \
+        --output_dir $TRIMGALORE_OUTPUT/$BASE \
+        --path_to_cutadapt $CUTADAPT_COMMAND \
+        --cores $CORES \
+        --paired \
+        --fastqc \
+        --trim-n $F1 $F2
+
+    mv $TRIMGALORE_OUTPUT/$BASE/$BASE"_1_val_1.fq.gz" \
+        $TRIMGALORE_OUTPUT/$BASE"_1.fastq.gz"
+    mv $TRIMGALORE_OUTPUT/$BASE/$BASE"_2_val_2.fq.gz" \
+        $TRIMGALORE_OUTPUT/$BASE"_2.fastq.gz"
+    mv $TRIMGALORE_OUTPUT/$BASE/$BASE"_1_val_1_fastqc.html" \
+        $TRIMGALORE_OUTPUT/$BASE"_1_fastqc.html"
+    mv $TRIMGALORE_OUTPUT/$BASE/$BASE"_1_val_1_fastqc.zip" \
+        $TRIMGALORE_OUTPUT/$BASE"_1_fastqc.zip"
+    mv $TRIMGALORE_OUTPUT/$BASE/$BASE"_2_val_2_fastqc.html" \
+        $TRIMGALORE_OUTPUT/$BASE"_2_fastqc.html"
+    mv $TRIMGALORE_OUTPUT/$BASE/$BASE"_2_val_2_fastqc.zip" \
+        $TRIMGALORE_OUTPUT/$BASE"_2_fastqc.zip"
+done
+
+## For single-end reads
+#for FILE in $FASTQ_PATH/*.fastq.gz
+#do
+#    BASE=`basename $FILE | sed s/\.fastq\.gz//`
+#    echo "Processing $BASE"
+#    mkdir -p $TRIMGALORE_OUTPUT
+#    F=$FASTQ_PATH/$BASE".fastq.gz"
+#    $TRIMGALORE_COMMAND \
+#        --quality 30 \
+#        --length 50 \
+#        --output_dir $TRIMGALORE_OUTPUT/$BASE \
+#        --path_to_cutadapt $CUTADAPT_COMMAND \
+#        --cores 4 \
+#        --fastqc \
+#        --trim-n $F
+#
+#    mv $TRIMGALORE_OUTPUT/$BASE"_val.fq.gz" \
+#        $TRIMGALORE_OUTPUT/$BASE".fastq.gz"
+#    mv $TRIMGALORE_OUTPUT/$BASE"_val_fastqc.html" \
+#        $TRIMGALORE_OUTPUT/$BASE"_fastqc.html"
+#    mv $TRIMGALORE_OUTPUT/$BASE"_val_fastqc.zip" \
+#        $TRIMGALORE_OUTPUT/$BASE"_fastqc.zip"
+#done
diff --git a/06-refindex.sh b/06-refindex.sh
new file mode 100755
index 0000000..06e3da4
--- /dev/null
+++ b/06-refindex.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+cd $RESOURCES_PATH/hs37d5
+$BWA_PATH/bwa index hs37d5.fa
+$SAMTOOLS_PATH/samtools faidx hs37d5.fa
+$SAMTOOLS_PATH/samtools dict hs37d5.fa > hs37d5.dict
+cd $CWD
diff --git a/07-alignment.sh b/07-alignment.sh
new file mode 100755
index 0000000..f497ec4
--- /dev/null
+++ b/07-alignment.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+HOME_PATH=/home/user/analysis
+# Change the path below to the quality-controlled data directory
+# if trimming was performed (see the commented line below)
+FASTQ_PATH=$HOME_PATH/fastq
+#FASTQ_PATH=$HOME_PATH/fastq_qual
+BAM_PATH=$HOME_PATH/bam
+THREADS=24
+BWA_INDEX=$RESOURCES_PATH/hs37d5/hs37d5.fa
+
+if [ ! -d $BAM_PATH ]
+then
+    mkdir -p $BAM_PATH
+fi
+
+for FILE in `ls $FASTQ_PATH/*_1.fastq.gz`
+do
+    BASE=`basename $FILE | sed s/_1\.fastq\.gz//`
+    F1=$FASTQ_PATH/$BASE"_1.fastq.gz"
+    F2=$FASTQ_PATH/$BASE"_2.fastq.gz"
+
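+    # For sample HG00119, for example, the read-group string below expands to
+    # @RG\tID:HG00119\tSM:HG00119\tLB:WES\tPL:ILLUMINA; bwa mem writes it into
+    # the BAM header so that downstream tools (BQSR, HaplotypeCaller) can
+    # attribute reads to a sample, library and sequencing platform.
+    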
RG="@RG\tID:"$BASE"\tSM:"$BASE"\tLB:WES\tPL:ILLUMINA" + + $BWA_PATH/bwa mem -t $THREADS -R $RG $BWA_INDEX $F1 $F2 | \ + $SAMTOOLS_PATH/samtools view -bS -o $BAM_PATH/$BASE".uns" - +done + +## For single-end reads +#for FILE in `ls $FASTQ_PATH/*.fastq.gz` +#do +# BASE=`basename $FILE | sed s/\.fastq\.gz//` +# F=$FASTQ_PATH/$BASE".fastq.gz" +# +# RG="@RG\tID:"$BASE"\tSM:"$BASE"\tLB:WES\tPL:ILLUMINA" +# +# $BWA_PATH/bwa mem -t $THREADS -R $RG $BWA_INDEX $F | \ +# $SAMTOOLS_PATH/samtools view -bS -o $BAM_PATH/$BASE".uns" - +#done diff --git a/09-bamstats.sh b/09-bamstats.sh new file mode 100755 index 0000000..7306ac0 --- /dev/null +++ b/09-bamstats.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +CAPTURE_KIT=$HOME_PATH/resources/panel/Agilent_SureSelect_All_Exon_V2.bed +BAM_PATH=$HOME_PATH/bam +REPORT=$HOME_PATH/reports/finalbamstats.txt +mkdir $HOME_PATH/reports + +printf "%s\t%s\t%s\t%s\t%s\t%s%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "name" \ + "total reads" "total reads pairs" "aligned reads" \ + "properly paired aligned pairs" "uniquely aligned reads (q>20)" \ + "properly paired uniquely aligned reads" "chimeric reads" \ + "reads overlapping targets" "total bases" "aligned bases" \ + "uniquely aligned bases" "bases overlapping targets" > $REPORT + +for FILE in `ls $BAM_PATH/*_fixmate.bam` +do + SAMPLE=`basename $FILE | sed s/_fixmate\.bam//` + echo "Processing $SAMPLE" + + BAM=$BAM_PATH/$SAMPLE".bam" + + printf "%s\t" $SAMPLE >> $REPORT + + echo " total reads" + printf "%d\t" `$SAMTOOLS_PATH/samtools view -c -F2048 $BAM` >> $REPORT + + echo " total read pairs" + printf "%d\t" `$SAMTOOLS_PATH/samtools view -c -F2048 $BAM | awk '{print $1/2}'` \ + >> $REPORT + + echo " aligned reads" + printf "%d\t" `$SAMTOOLS_PATH/samtools view -c -F2052 $BAM` >> $REPORT + + echo " properly paired aligned pairs" + printf "%d\t" `$SAMTOOLS_PATH/samtools view -c -f66 -F2048 $BAM` \ + >> $REPORT + + echo " uniquely aligned reads (q>20)" + printf "%d\t" `$SAMTOOLS_PATH/samtools view -c -F2052 -q20 $BAM` >> \ + $REPORT + + echo " properly paired uniquely aligned reads" + printf "%d\t" `$SAMTOOLS_PATH/samtools view -c -f66 -F2048 -q20 $BAM` \ + >> $REPORT + + echo " chimeric reads" + printf "%d\t" ` + $SAMTOOLS_PATH/samtools flagstat $BAM | \ + perl -e 'my @in;' \ + -e 'while(<>) { chomp $_; push(@in,$_); }' \ + -e 'my @tmp = split("\\\+",pop(@in));' \ + -e '$tmp[0] =~ s/\s+$//;' \ + -e 'print STDOUT $tmp[0];' + ` >> $REPORT + + echo " reads overlapping targets" + printf "%d\t" ` + $BEDTOOLS_PATH/bedtools intersect -a $CAPTURE_KIT -b $BAM -c | \ + awk 'BEGIN {tot=0}{tot+=$4} END {print tot}' + ` >> $REPORT + + echo " total bases" + printf "%d\t" ` + $SAMTOOLS_PATH/samtools view $BAM | cut -f10 | \ + awk 'BEGIN {tr=0}{tr+=length($0)} END {print tr}' + ` >> $REPORT + + echo " aligned bases" + printf "%d\t" ` + $SAMTOOLS_PATH/samtools view -F2052 $BAM | cut -f10 | \ + awk 'BEGIN {tr=0}{tr+=length($0)} END {print tr}' + ` >> $REPORT + + echo " uniquely aligned bases" + printf "%d\t" ` + $SAMTOOLS_PATH/samtools view -F2052 -q20 $BAM | cut -f10 | \ + awk 'BEGIN {tr=0}{tr+=length($0)} END {print tr}' + ` >> $REPORT + + echo " bases overlapping targets" + printf "%d\n" ` + $BEDTOOLS_PATH/bedtools coverage -a $CAPTURE_KIT -b $BAM -d | \ + awk 'BEGIN {tr=0} {tr+=$5} END {print tr}' + ` >> $REPORT + +done diff --git a/10-signal.sh b/10-signal.sh new file mode 100755 index 0000000..5f741d2 --- /dev/null +++ b/10-signal.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +BAM_PATH=$HOME_PATH/bam +TRACKS_PATH=$HOME_PATH/tracks 
+GENOME_SIZE=$BEDTOOLS_PATH/../genomes/human.hg19.genome
+
+if [ ! -d $TRACKS_PATH ]
+then
+    mkdir -p $TRACKS_PATH
+fi
+
+for FILE in `ls $BAM_PATH/*_fixmate.bam`
+do
+    SAMPLE=`basename $FILE | sed s/_fixmate\.bam//`
+    echo "Processing $SAMPLE"
+    $BEDTOOLS_PATH/bedtools genomecov -bg \
+        -ibam $BAM_PATH/$SAMPLE/$SAMPLE".bam" | \
+        grep -vP 'chrU|rand|hap|loc|cox|GL|NC|hs37d5' | \
+        awk '{print "chr"$1"\t"$2"\t"$3"\t"$4}' | \
+        sed s/chrMT/chrM/g | \
+        sort -k1,1 -k2g,2 > $TRACKS_PATH/$SAMPLE".bedGraph" &
+done
+
+wait
+
+for FILE in `ls $TRACKS_PATH/*.bedGraph`
+do
+    echo "Processing $FILE"
+    SAMPLE=`basename $FILE | sed s/\.bedGraph//`
+    $UCSCTOOLS_PATH/bedGraphToBigWig $FILE $GENOME_SIZE $TRACKS_PATH/$SAMPLE".bigWig" &
+done
+
+wait
+
+rm $TRACKS_PATH/*.bedGraph
diff --git a/11-bqsr.sh b/11-bqsr.sh
new file mode 100755
index 0000000..0f2be11
--- /dev/null
+++ b/11-bqsr.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+BAM_PATH=$HOME_PATH/bam
+CAPTURE_KIT=$RESOURCES_PATH/panel/Agilent_SureSelect_All_Exon_V2.bed
+INTERVAL_LIST_PATH=$RESOURCES_PATH/panel/interval_scatter
+BWA_INDEX=$RESOURCES_PATH/hs37d5/hs37d5.fa
+DBSNP=$RESOURCES_PATH/dbSNP151.vcf
+GNOMAD=$RESOURCES_PATH/gnomad.exomes.r2.1.1.sites.vcf.bgz
+CORES=16
+PADDING=50
+
+# Process dbSNP
+$HTSLIB_PATH/bgzip $DBSNP
+$HTSLIB_PATH/tabix $DBSNP".gz"
+DBSNP=$RESOURCES_PATH/dbSNP151.vcf.gz
+
+mkdir -p $HOME_PATH/reports
+META_REPORT=$HOME_PATH/reports/bqsr_current.log
+
+echo "=== Splitting intervals" > $META_REPORT
+if [ -d $INTERVAL_LIST_PATH ]
+then
+    echo "  Cleaning previous intervals" >> $META_REPORT
+    rm -r $INTERVAL_LIST_PATH
+fi
+mkdir -p $INTERVAL_LIST_PATH
+
+# First, split the exome intervals for parallel BQSR
+
+$GATK_PATH/gatk SplitIntervals \
+    --reference $BWA_INDEX \
+    --intervals $CAPTURE_KIT \
+    --interval-padding $PADDING \
+    --scatter-count $CORES \
+    --output $INTERVAL_LIST_PATH \
+    --QUIET
+
+echo "=== Calculating BQSR tables" >> $META_REPORT
+for FILE in `ls $BAM_PATH/*_fixmate.bam`
+do
+    SAMPLE=`basename $FILE | sed s/_fixmate\.bam//`
+    echo "Processing $SAMPLE" >> $META_REPORT
+
+    BAM=$BAM_PATH/$SAMPLE/$SAMPLE".bam"
+    mkdir -p $BAM_PATH/$SAMPLE
+    BQSR_PART_OUT=$BAM_PATH/$SAMPLE/bqsr_parts
+
+    if [ -d $BQSR_PART_OUT ]
+    then
+        echo "  Cleaning previous tables" >> $META_REPORT
+        rm -r $BQSR_PART_OUT
+    fi
+    mkdir -p $BQSR_PART_OUT
+
+    # Calculate BQSR over intervals
+    for INTERVAL in `readlink -f $INTERVAL_LIST_PATH/*`
+    do
+        BQSR_NAME=`basename $INTERVAL | sed s/\-scattered\.interval_list//`
+        echo "  Processing $BQSR_NAME" >> $META_REPORT
+        $GATK_PATH/gatk BaseRecalibrator \
+            --input $BAM \
+            --reference $BWA_INDEX \
+            --output $BQSR_PART_OUT/$BQSR_NAME".tab" \
+            --known-sites $DBSNP \
+            --known-sites $GNOMAD \
+            --intervals $INTERVAL \
+            --interval-padding $PADDING \
+            --QUIET &
+    done
+
+    # Wait for all intervals to complete before moving to the next sample
+    wait
+done
+
+echo "=== Gathering BQSR reports" >> $META_REPORT
+for FILE in `ls $BAM_PATH/*_fixmate.bam`
+do
+    SAMPLE=`basename $FILE | sed s/_fixmate\.bam//`
+    echo "Processing reports for $SAMPLE" >> $META_REPORT
+
+    BQSR_PART_OUT=$BAM_PATH/$SAMPLE/bqsr_parts
+
+    for TAB in `readlink -f $BQSR_PART_OUT/*`
+    do
+        echo "--input $TAB" >> $BAM_PATH/$SAMPLE/gather_bqsr.arg
+    done
+
+    # Gather reports
+    $GATK_PATH/gatk GatherBQSRReports \
+        --arguments_file $BAM_PATH/$SAMPLE/gather_bqsr.arg \
+        --output $BAM_PATH/$SAMPLE/bqsr.tab \
+        --QUIET &
+done
+
+# Wait for BQSR tables to be merged for each sample
+wait
+
+echo "=== Applying BQSR to BAM files" >> $META_REPORT
+for FILE in `ls 
$BAM_PATH` +do + SAMPLE=`basename $FILE | sed s/_fixmate\.bam//` + echo "Processing BAM file $SAMPLE" >> $META_REPORT + + BAM=$BAM_PATH/$SAMPLE/$SAMPLE".bam" + BQSR_TABLE=$BAM_PATH/$SAMPLE/bqsr.tab + + # Apply BQSR to BAM files + $GATK_PATH/gatk ApplyBQSR \ + --input $BAM \ + --reference $BWA_INDEX \ + --bqsr-recal-file $BQSR_TABLE \ + --output $BAM_PATH/$SAMPLE/$SAMPLE"_bqsr.bam" \ + --QUIET & +done + +# Wait for new BAM files to be created before reporting finished +wait + +echo "=== Finished!" >> $META_REPORT diff --git a/12-gatk.sh b/12-gatk.sh new file mode 100755 index 0000000..dfa8dd3 --- /dev/null +++ b/12-gatk.sh @@ -0,0 +1,176 @@ +#!/bin/bash + +export VCF_PATH=$HOME_PATH/vcf +BAM_PATH=$HOME_PATH/bam +INTERVAL_LIST_PATH=$RESOURCES_PATH/panel/interval_scatter +BWA_INDEX=$RESOURCES_PATH/hs37d5/hs37d5.fa +CORES=16 +PADDING=50 + +META_REPORT=$HOME_PATH/reports/haca_current.log + +echo "=== Calling variants" > $META_REPORT +for SAMPLE in `ls $BAM_PATH` +do + echo "Processing $SAMPLE" >> $META_REPORT + + BAM=$BAM_PATH/$SAMPLE/$SAMPLE"_bqsr.bam" + GVCF_PART_OUT=$BAM_PATH/$SAMPLE/gvcf_parts + + if [ -d $GVCF_PART_OUT ] + then + echo " Cleaning previous gVCFs" >> $META_REPORT + rm -r $GVCF_PART_OUT + fi + mkdir -p $GVCF_PART_OUT + + # Call variants over intervals + for INTERVAL in `readlink -f $INTERVAL_LIST_PATH/*` + do + GVCF_NAME=`basename $INTERVAL | sed s/\-scattered\.interval_list//` + echo " Processing $GVCF_NAME" >> $META_REPORT + $GATK_PATH/gatk HaplotypeCaller \ + --input $BAM \ + --reference $BWA_INDEX \ + --intervals $INTERVAL \ + --interval-padding $PADDING \ + --output $GVCF_PART_OUT/$GVCF_NAME".g.vcf" \ + --emit-ref-confidence GVCF \ + --create-output-variant-index false \ + --QUIET & + done + + # Wait for individuals to complete before moving to the next thread + wait +done + +# Then GVCFs must be consolidated +echo "=== Merging gVCFs" >> $META_REPORT +for SAMPLE in `ls $BAM_PATH` +do + echo "Processing interval gVCFs for $SAMPLE" + + GVCF_PART_OUT=$BAM_PATH/$SAMPLE/gvcf_parts + + if [ -f $BAM_PATH/$SAMPLE/interval_gvcfs.txt ] + then + echo " Cleaning previous gVCFs input file" >> $META_REPORT + rm $BAM_PATH/$SAMPLE/interval_gvcfs.txt + fi + for GVCF in `readlink -f $GVCF_PART_OUT/*.g.vcf` + do + echo "$GVCF" >> $BAM_PATH/$SAMPLE/interval_gvcfs.txt + done + + # Get the gVCF header and strip the GATK command + GVFH=`readlink -f $GVCF_PART_OUT/*.g.vcf | head -1` + grep "^#" $GVFH | grep -v "^##GATKCommand" > $BAM_PATH/$SAMPLE/gvcf.header + + # Cat the gVCFs + for GVCF in `readlink -f $GVCF_PART_OUT/*.g.vcf` + do + echo " Concatenating $GVCF" + #echo " Concatenating $GVCF" >> $META_REPORT + grep -v "^#" $GVCF >> $BAM_PATH/$SAMPLE/gvcf.tmp + done + + # Place the header + echo " Creating final gVCF" + #echo " Creating final gVCF" >> $META_REPORT + cat $BAM_PATH/$SAMPLE/gvcf.header $BAM_PATH/$SAMPLE/gvcf.tmp > \ + $BAM_PATH/$SAMPLE/$SAMPLE".u.g.vcf" + + rm $BAM_PATH/$SAMPLE/gvcf.tmp $BAM_PATH/$SAMPLE/gvcf.header +done + +# Sort gVCFs +echo "=== Sorting gVCFs" >> $META_REPORT +for SAMPLE in `ls $BAM_PATH` +do + echo "Sorting gVCF for $SAMPLE" >> $META_REPORT + + $GATK_PATH/gatk SortVcf \ + --INPUT $BAM_PATH/$SAMPLE/$SAMPLE".u.g.vcf" \ + --OUTPUT $BAM_PATH/$SAMPLE/$SAMPLE".g.vcf.gz" \ + --QUIET & +done + +# Wait for sorting to finish before cleaning unsorted +wait + +# Some cleanup +echo "=== Deleting unsorted gVCFs" >> $META_REPORT +for FILE in `ls $BAM_PATH/*_fixmate.bam` +do + SAMPLE=`basename $FILE | sed s/_fixmate\.bam//` + echo "Deleting unsorted gVCF for $SAMPLE" >> 
$META_REPORT + rm $BAM_PATH/$SAMPLE/$SAMPLE".u.g.vcf" + echo "Compression gVCF parts for $SAMPLE" >> $META_REPORT + pigz $BAM_PATH/$SAMPLE/gvcf_parts/* + echo "Compression BQSR reports for $SAMPLE" >> $META_REPORT + pigz $BAM_PATH/$SAMPLE/bqsr_parts/* +done + +# Gather VCFs +echo "=== Combining sorted population gVCFs" >> $META_REPORT +if [ ! -d $VCF_PATH ] +then + mkdir $VCF_PATH +fi +# Delete the .arg file as it will get multiple entries +if [ -f $VCF_PATH/combine_gvcf.arg ] +then + rm $VCF_PATH/combine_gvcf.arg +fi + +for FILE in `ls $BAM_PATH/*_fixmate.bam` +do + SAMPLE=`basename $FILE | sed s/_fixmate\.bam//` + GVCF=`readlink -f $BAM_PATH/$SAMPLE/$SAMPLE".g.vcf.gz"` + echo "--variant $GVCF" >> $VCF_PATH/combine_gvcf.arg +done + +# Combine gVCFs +$GATK_PATH/gatk CombineGVCFs \ + --reference $BWA_INDEX \ + --arguments_file $VCF_PATH/combine_gvcf.arg \ + --output $VCF_PATH/haplotypecaller_full.g.vcf.gz + +# Genotype VCFs +echo "=== Genotyping gVCFs" >> $META_REPORT +$GATK_PATH/gatk GenotypeGVCFs \ + --reference $BWA_INDEX \ + --variant $VCF_PATH/haplotypecaller_full.g.vcf.gz \ + --output $VCF_PATH/haplotypecaller_full.vcf.gz + +# Apply basic GATK hard filters +echo "=== Applying GATK hard filters" >> $META_REPORT +$BCFTOOLS_PATH/bcftools view \ + --include 'QUAL>20 & INFO/QD>2 & INFO/MQ>40 & INFO/FS<60 & INFO/SOR<3 & INFO/MQRankSum>-12.5 & INFO/ReadPosRankSum>-8 & TYPE="snp"' \ + --output-type z \ + --output-file $VCF_PATH/haplotypecaller_filtered_snp.vcf.gz \ + $VCF_PATH/haplotypecaller_full.vcf.gz & + +# The normalization step is potentially not required but it is harmless +$BCFTOOLS_PATH/bcftools view \ + --include 'QUAL>20 & INFO/QD>2 & INFO/ReadPosRankSum>-20 & INFO/InbreedingCoeff>-0.8 & INFO/FS<200 & INFO/SOR<10 & TYPE~"indel"' \ + $VCF_PATH/haplotypecaller_full.vcf.gz | \ + $BCFTOOLS_PATH/bcftools norm \ + --fasta-ref $BWA_INDEX \ + --output-type z \ + --output $VCF_PATH/haplotypecaller_filtered_norm_indel.vcf.gz & +wait + +echo "=== Merging GATK filtered SNPs and INDELs" >> $META_REPORT +$GATK_PATH/gatk MergeVcfs \ + --INPUT $VCF_PATH/haplotypecaller_filtered_snp.vcf.gz \ + --INPUT $VCF_PATH/haplotypecaller_filtered_norm_indel.vcf.gz \ + --OUTPUT $VCF_PATH/haplotypecaller_filtered_norm.vcf.gz \ + --QUIET + +rm $VCF_PATH/haplotypecaller_filtered_snp.vcf.gz \ + $VCF_PATH/haplotypecaller_filtered_norm_indel.vcf.gz + +#echo "=== Finished!" +echo "=== Finished!" 
>> $META_REPORT
+
diff --git a/13-freebayes.sh b/13-freebayes.sh
new file mode 100755
index 0000000..7bba6af
--- /dev/null
+++ b/13-freebayes.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+export VCF_PATH=$HOME_PATH/vcf
+BAM_PATH=$HOME_PATH/bam
+CAPTURE_KIT=$RESOURCES_PATH/panel/Agilent_SureSelect_All_Exon_V2.bed
+INTERVAL_LIST_PATH=$RESOURCES_PATH/panel/interval_scatter_bed
+BWA_INDEX=$RESOURCES_PATH/hs37d5/hs37d5.fa
+CORES=32
+PADDING=50
+
+META_REPORT=$HOME_PATH/reports/freebayes_current.log
+
+echo "=== Splitting intervals" > $META_REPORT
+if [ -d $INTERVAL_LIST_PATH ]
+then
+    echo "  Cleaning previous intervals" >> $META_REPORT
+    rm -r $INTERVAL_LIST_PATH
+fi
+mkdir -p $INTERVAL_LIST_PATH
+
+# First, split the exome intervals for parallel freebayes
+
+$GATK_PATH/gatk SplitIntervals \
+    --reference $BWA_INDEX \
+    --intervals $CAPTURE_KIT \
+    --interval-padding $PADDING \
+    --scatter-count $CORES \
+    --extension .pre \
+    --output $INTERVAL_LIST_PATH \
+    --QUIET
+
+echo "=== Converting intervals" >> $META_REPORT
+for INTERVAL in `ls $INTERVAL_LIST_PATH`
+do
+    BED=`basename $INTERVAL | sed s/\.pre//`
+    INTERVAL_FILE=$INTERVAL_LIST_PATH/$INTERVAL
+    grep -vP "^@" $INTERVAL_FILE | awk '{print $1"\t"$2"\t"$3}' > \
+        $INTERVAL_LIST_PATH/$BED".bed" &
+done
+
+# Wait and clear intermediate intervals
+wait
+rm $INTERVAL_LIST_PATH/*.pre
+
+# Prepare BAM file list for freebayes
+echo "=== Preparing BAM file list" >> $META_REPORT
+BAMLIST=$HOME_PATH/bamlist.txt
+if [ -f $BAMLIST ]
+then
+    rm $BAMLIST
+fi
+for FILE in `ls $BAM_PATH/*_fixmate.bam`
+do
+    SAMPLE=`basename $FILE | sed s/_fixmate\.bam//`
+    BAM=$BAM_PATH/$SAMPLE/$SAMPLE".bam"
+    echo "$BAM" >> $BAMLIST
+done
+
+echo "=== Calling variants with FreeBayes" >> $META_REPORT
+if [ -d $VCF_PATH/fb_parts ]
+then
+    rm -r $VCF_PATH/fb_parts
+fi
+mkdir -p $VCF_PATH/fb_parts
+
+for TARGET in `ls $INTERVAL_LIST_PATH`
+do
+    NAME=`basename $TARGET | sed s/\.bed//`
+    echo "Processing interval list $NAME" >> $META_REPORT
+    INTERVAL=$INTERVAL_LIST_PATH/$TARGET
+    $FREEBAYES_PATH/freebayes \
+        --fasta-reference $BWA_INDEX \
+        --bam-list $BAMLIST \
+        --targets $INTERVAL \
+        --vcf $VCF_PATH/fb_parts/$NAME".vcf" &
+done
+
+# Wait before gathering the results
+wait
+echo "=== Merging VCFs" >> $META_REPORT
+cat $VCF_PATH/fb_parts/*.vcf | \
+    $VCFLIB_PATH/scripts/vcffirstheader | \
+    $VCFLIB_PATH/bin/vcfstreamsort -w 1000 | \
+    $VCFLIB_PATH/bin/vcfuniq > \
+    $VCF_PATH/freebayes_full.vcf
+
+echo "=== Compressing and indexing final VCF" >> $META_REPORT
+$HTSLIB_PATH/bgzip $VCF_PATH/freebayes_full.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/freebayes_full.vcf.gz
+
+### Basic filtering before decomposing and normalization
+
+# Determine a quality and depth cutoff pre-filter based on the 99th percentile
+# of the respective distributions
+echo "=== Determining QUAL and DP hard pre-filters" >> $META_REPORT
+$BCFTOOLS_PATH/bcftools query \
+    --include 'QUAL>20' \
+    --format '%QUAL\n' $VCF_PATH/freebayes_full.vcf.gz > $VCF_PATH/quals.tmp &
+$BCFTOOLS_PATH/bcftools query \
+    --include 'QUAL>20' \
+    --format '%INFO/DP\n' $VCF_PATH/freebayes_full.vcf.gz | \
+    awk -F "," '{print $1}' > $VCF_PATH/dps.tmp &
+wait
+
+Rscript -e '
+    vp <- Sys.getenv("VCF_PATH")
+    dps <- as.numeric(readLines(file.path(vp,"dps.tmp")));
+    quals <- as.numeric(readLines(file.path(vp,"quals.tmp")));
+    qudp <- unname(round(quantile(dps,0.99)));
+    ququ <- unname(quantile(quals,0.99));
+    write(qudp,file.path(vp,"dpt.tmp"));
+    write(ququ,file.path(vp,"qut.tmp"));
+'
+QUALUP=`cat $VCF_PATH/qut.tmp`
+DPUP=`cat $VCF_PATH/dpt.tmp`
+rm $VCF_PATH/qut.tmp $VCF_PATH/dpt.tmp $VCF_PATH/dps.tmp $VCF_PATH/quals.tmp
+
+# Apply the filters, decompose complex variants and normalize
+echo "=== Applying filters and normalizing" >> $META_REPORT
+$BCFTOOLS_PATH/bcftools view \
+    --include 'QUAL>20 & INFO/DP>10 & QUAL<'$QUALUP' & INFO/DP<'$DPUP' & (QUAL/(INFO/DP))>2' $VCF_PATH/freebayes_full.vcf.gz | \
+    $VCFLIB_PATH/bin/vcfallelicprimitives -kg | \
+    $BCFTOOLS_PATH/bcftools norm \
+        --fasta-ref $BWA_INDEX \
+        --output-type z \
+        --output $VCF_PATH/freebayes_filtered_norm.vcf.gz
+$HTSLIB_PATH/tabix $VCF_PATH/freebayes_filtered_norm.vcf.gz
+
+echo "=== Finished!" >> $META_REPORT
diff --git a/14-deepvariant.sh b/14-deepvariant.sh
new file mode 100755
index 0000000..e0c44de
--- /dev/null
+++ b/14-deepvariant.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+export VCF_PATH=$HOME_PATH/vcf
+BAM_PATH=$HOME_PATH/bam
+CAPTURE_KIT_DIR=$RESOURCES_PATH/panel
+CAPTURE_KIT=$RESOURCES_PATH/panel/Agilent_SureSelect_All_Exon_V2.bed
+DV_VERSION=0.9.0
+BWA_INDEX_DIR=$RESOURCES_PATH/hs37d5
+BWA_INDEX=$RESOURCES_PATH/hs37d5/hs37d5.fa
+CORES=32
+
+META_REPORT=$HOME_PATH/reports/deepvariant_current.log
+
+echo "=== Calling variants" > $META_REPORT
+for FILE in `ls $BAM_PATH/*_fixmate.bam`
+do
+    SAMPLE=`basename $FILE | sed s/_fixmate\.bam//`
+    echo "Processing $SAMPLE" >> $META_REPORT
+
+    BAM=$BAM_PATH/$SAMPLE".bam"
+
+    docker run \
+        -v "$BAM_PATH":"/data" \
+        -v "$BWA_INDEX_DIR":"/reference" \
+        -v "$CAPTURE_KIT_DIR":"/capture_kit" \
+        google/deepvariant:$DV_VERSION \
+        /opt/deepvariant/bin/run_deepvariant \
+        --model_type=WES \
+        --ref="/reference/hs37d5.fa" \
+        --reads="/data/$SAMPLE.bam" \
+        --regions="/capture_kit/Agilent_SureSelect_All_Exon_V2.bed" \
+        --output_vcf="/data/$SAMPLE/${SAMPLE}_DV.vcf" \
+        --output_gvcf="/data/$SAMPLE/${SAMPLE}_DV.g.vcf" \
+        --num_shards=$CORES
+
+done
+
+echo "=== Creating list of gVCF files" >> $META_REPORT
+for FILE in `ls $BAM_PATH/*_fixmate.bam`
+do
+    SAMPLE=`basename $FILE | sed s/_fixmate\.bam//`
+    GVCF=`readlink -f $BAM_PATH/$SAMPLE/$SAMPLE"_DV.g.vcf"`
+    echo "$GVCF" >> $VCF_PATH/deepvariant_gvcf_list.txt
+done
+
+echo "=== Gathering gVCFs" >> $META_REPORT
+rm -rf GLnexus.DB
+$GLNEXUS_PATH/glnexus_cli \
+    --config DeepVariantWES \
+    --bed $CAPTURE_KIT \
+    --list $VCF_PATH/deepvariant_gvcf_list.txt \
+    --threads $CORES | \
+    $BCFTOOLS_PATH/bcftools view --include 'QUAL>=20' - | \
+    $BCFTOOLS_PATH/bcftools norm \
+        --fasta-ref $BWA_INDEX \
+        --output-type z \
+        --output $VCF_PATH/deepvariant_filtered_norm.vcf.gz
+$HTSLIB_PATH/tabix $VCF_PATH/deepvariant_filtered_norm.vcf.gz
+
+echo "=== Finished!" >> $META_REPORT
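A small pre-flight sketch for the GLnexus step of 14-deepvariant.sh: confirm that every per-sample gVCF in the list exists and is non-empty before merging (assumes the list file written by the script above):

    while read GVCF
    do
        if [ ! -s "$GVCF" ]
        then
            echo "Missing or empty gVCF: $GVCF"
        fi
    done < $VCF_PATH/deepvariant_gvcf_list.txt
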
diff --git a/15-annotation.sh b/15-annotation.sh
new file mode 100755
index 0000000..9218435
--- /dev/null
+++ b/15-annotation.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+export VCF_PATH=$HOME_PATH/vcf
+DBSNP_FILE=$RESOURCES_PATH/dbSNP/dbSNP151.vcf.gz
+DBNSFP_FILE=$RESOURCES_PATH/dbNSFP/dbNSFP2.9.3.txt.gz
+GNOMAD_FILE=$RESOURCES_PATH/gnomAD/gnomad.exomes.r2.1.1.sites.vcf.bgz
+
+if [ ! -d $SNPEFF_PATH/data ]
+then
+    java -jar $SNPEFF_PATH/snpEff.jar download GRCh37.75
+fi
+
+## Haplotype Caller
+# Variant effect annotation
+java -Xmx4096m -jar $SNPEFF_PATH/snpEff.jar ann \
+    -v -noLog -noStats -noLof GRCh37.75 \
+    $VCF_PATH/haplotypecaller_filtered_norm.vcf.gz > $VCF_PATH/haplotypecaller_filtered_norm_eff.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/haplotypecaller_filtered_norm_eff.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/haplotypecaller_filtered_norm_eff.vcf.gz
+
+# Annotation with dbSNP
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar annotate \
+    -v -id $DBSNP_FILE \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff.vcf.gz > $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp.vcf.gz
+
+# Annotation with dbNSFP
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar dbnsfp \
+    -v -m -db $DBNSFP_FILE \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp.vcf.gz > $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz
+
+# Annotation with gnomAD
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar annotate \
+    -v $GNOMAD_FILE \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz > $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf.gz
+
+## FreeBayes
+# Variant effect annotation
+java -Xmx4096m -jar $SNPEFF_PATH/snpEff.jar ann \
+    -v -noLog -noStats -noLof GRCh37.75 \
+    $VCF_PATH/freebayes_filtered_norm.vcf.gz > $VCF_PATH/freebayes_filtered_norm_eff.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/freebayes_filtered_norm_eff.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/freebayes_filtered_norm_eff.vcf.gz
+
+# Annotation with dbSNP
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar annotate \
+    -v -id $DBSNP_FILE \
+    $VCF_PATH/freebayes_filtered_norm_eff.vcf.gz > $VCF_PATH/freebayes_filtered_norm_eff_dbsnp.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/freebayes_filtered_norm_eff_dbsnp.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/freebayes_filtered_norm_eff_dbsnp.vcf.gz
+
+# Annotation with dbNSFP
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar dbnsfp \
+    -v -m -db $DBNSFP_FILE \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp.vcf.gz > $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz
+
+# Annotation with gnomAD
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar annotate \
+    -v $GNOMAD_FILE \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz > $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf.gz
+
+## Deep Variant
+# Variant effect annotation
+java -Xmx4096m -jar $SNPEFF_PATH/snpEff.jar ann \
+    -v -noLog -noStats -noLof GRCh37.75 \
+    $VCF_PATH/deepvariant_filtered_norm.vcf.gz > $VCF_PATH/deepvariant_filtered_norm_eff.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/deepvariant_filtered_norm_eff.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/deepvariant_filtered_norm_eff.vcf.gz
+
+# Annotation with dbSNP
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar annotate \
+    -v -id $DBSNP_FILE \
+    $VCF_PATH/deepvariant_filtered_norm_eff.vcf.gz > $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp.vcf.gz
+
+# Annotation with dbNSFP
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar dbnsfp \
+    -v -m -db $DBNSFP_FILE \
+    $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp.vcf.gz > $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz
+
+# Annotation with gnomAD
+java -Xmx4096m -jar $SNPEFF_PATH/SnpSift.jar annotate \
+    -v $GNOMAD_FILE \
+    $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz > $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf
+$HTSLIB_PATH/bgzip $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf
+$HTSLIB_PATH/tabix $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf.gz
+
+# Remove intermediate files
+rm $VCF_PATH/haplotypecaller_filtered_norm_eff.vcf.gz \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff.vcf.gz.tbi \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp.vcf.gz \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp.vcf.gz.tbi \
+    $VCF_PATH/freebayes_filtered_norm_eff.vcf.gz \
+    $VCF_PATH/freebayes_filtered_norm_eff.vcf.gz.tbi \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp.vcf.gz \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp.vcf.gz.tbi \
+    $VCF_PATH/deepvariant_filtered_norm_eff.vcf.gz \
+    $VCF_PATH/deepvariant_filtered_norm_eff.vcf.gz.tbi \
+    $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp.vcf.gz \
+    $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp.vcf.gz.tbi
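To eyeball what 15-annotation.sh added, a hedged sketch using SnpSift extractFields on the final gnomAD-annotated VCFs; the chosen ANN sub-fields and the summary file names are illustrative assumptions, not part of the pipeline:

    for CALLER in haplotypecaller freebayes deepvariant
    do
        # Dump a tab-delimited per-variant summary of the snpEff annotations
        java -jar $SNPEFF_PATH/SnpSift.jar extractFields \
            $VCF_PATH/${CALLER}_filtered_norm_eff_dbsnp_dbnsfp_gnomad.vcf.gz \
            CHROM POS ID REF ALT "ANN[0].GENE" "ANN[0].EFFECT" "ANN[0].IMPACT" > \
            $VCF_PATH/${CALLER}_annotated_summary.txt
    done
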
diff --git a/16-consolidation.sh b/16-consolidation.sh
new file mode 100755
index 0000000..8c7cd4b
--- /dev/null
+++ b/16-consolidation.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+export VCF_PATH=$HOME_PATH/vcf
+
+# 1
+$BCFTOOLS_PATH/bcftools isec \
+    --prefix 1 \
+    --output-type z \
+    --nfiles ~100 \
+    --collapse none \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz
+
+# 2
+$BCFTOOLS_PATH/bcftools isec \
+    --prefix 2 \
+    --output-type z \
+    --nfiles ~010 \
+    --collapse none \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz
+
+# 3
+$BCFTOOLS_PATH/bcftools isec \
+    --prefix 3 \
+    --output-type z \
+    --nfiles ~001 \
+    --collapse none \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz
+
+# 4
+$BCFTOOLS_PATH/bcftools isec \
+    --prefix 4 \
+    --output-type z \
+    --nfiles ~110 \
+    --collapse none \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz
+
+# 5
+$BCFTOOLS_PATH/bcftools isec \
+    --prefix 5 \
+    --output-type z \
+    --nfiles ~011 \
+    --collapse none \
+    $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \
+    
$VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz + +# 6 +$BCFTOOLS_PATH/bcftools isec \ + --prefix 6 \ + --output-type z \ + --nfiles ~101 \ + --collapse none \ + $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \ + $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \ + $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz + +# 7 +$BCFTOOLS_PATH/bcftools isec \ + --prefix 7 \ + --output-type z \ + --nfiles ~111 \ + --collapse none \ + $VCF_PATH/haplotypecaller_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \ + $VCF_PATH/freebayes_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz \ + $VCF_PATH/deepvariant_filtered_norm_eff_dbsnp_dbnsfp.vcf.gz
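
bcftools isec with --prefix writes each consensus set into its own directory (per-input VCFs plus a sites.txt listing the selected positions). A quick sketch to count how many sites fall into each of the seven sets created above, assuming the numbered prefix directories land in the working directory:

    for SET in 1 2 3 4 5 6 7
    do
        # Each line of sites.txt is one position passing the --nfiles criterion
        echo -e "set $SET\t`wc -l < $SET/sites.txt` sites"
    done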