-
Notifications
You must be signed in to change notification settings - Fork 3
/
SARS-CoV-2-merge-variants-nolimit.sh
112 lines (97 loc) · 3.83 KB
/
SARS-CoV-2-merge-variants-nolimit.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
set -e
{
usage="$(basename "$0") [-h] [-g <reference_genome.fasta>]
This script will merge variants using jacquard and will calculate viral frequencies.
Arguments:
-h show this help text
-g PATH where SARS-CoV-2 reference genome (in fasta format) is located. If the genome is located in the working folder, just specify the name."
options=':hg:'
while getopts $options option; do
case "$option" in
h) echo "$usage"; exit;;
g) g=$OPTARG;;
:) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;;
\?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;;
esac
done
# mandatory arguments
if [ ! "$g" ]; then
echo "argument -g must be provided"
echo "$usage" >&2; exit 1
fi
begin=`date +%s`
### Merge VCFs using jacquard
echo "Merge VCFs using jacquard"
echo ""
jacquard merge --include_all ./ merged.GISAID.vcf
echo ""
# Left only genotypes in merged VCF
echo "Fixing genotypes in merged VCF"
echo ""
vcfkeepgeno merged.GISAID.vcf GT > merged.GISAID.GT.vcf
# Split variants and header from merged.GT.vcf
echo "Splitting variants and header from merged.GT.vcf"
echo ""
grep "#" merged.GISAID.GT.vcf > header
grep -v "#" merged.GISAID.GT.vcf > variants.vcf
sed -i 's|1/1|1|'g variants.vcf
sed -i 's|0/1|1|'g variants.vcf
sed -i 's|1/0|1|'g variants.vcf
sed -i 's/[.]/0/'g variants.vcf # convert point to zeros
# Reconstitute vcf file
echo "Reconstituting vcf file"
echo ""
cat header variants.vcf > merged.GISAID.fixed.vcf
rm header variants.vcf
sed -i 's/NC_04551202/NC_045512.2/'g merged.GISAID.fixed.vcf
# left-align vcf file and fixing names
echo "left-aligning vcf file and fixing names"
echo ""
vcfleftalign -r ${g} merged.GISAID.fixed.vcf > merged.GISAID.left.vcf
sed -i 's/|unknown//'g merged.GISAID.left.vcf
# Merge VCFs using vcfcombine, see combined_sites.raw.vcf file
echo "Merge VCFs using vcfcombine, see combined_sites.raw.vcf file"
vcfcombine EPI*.vcf > combined_sites.raw.vcf
echo ""
# calculate AF
echo "calculating viral frequency with vcflib"
echo ""
vcffixup merged.GISAID.left.vcf > merged.GISAID.AF.raw.vcf
wget https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/master/problematic_sites_sarsCov2.vcf
sed -i 's/MN908947.3/NC_045512.2/'g problematic_sites_sarsCov2.vcf
vcfintersect -i problematic_sites_sarsCov2.vcf merged.GISAID.AF.raw.vcf -r ${g} --invert > merged.GISAID.AF.vcf
vcfintersect -i problematic_sites_sarsCov2.vcf combined_sites.raw.vcf -r ${g} --invert > combined_sites.vcf
rm merged.GISAID.fixed.vcf merged.GISAID.left.vcf
gzip merged.GISAID.vcf merged.GISAID.AF.raw.vcf combined_sites.raw.vcf
# Filter variants by Viral Frequency: 0.0099 (1%)
echo "Filtering variants by Viral Frequency: 0.0099 (1%)"
echo ""
vcffilter -f "AF > 0.0099" merged.GISAID.AF.vcf > merged.GISAID.AF_1%.vcf
grep -v "##" merged.GISAID.AF_1%.vcf > merged.GISAID.AF_1%.table
echo "#######"
echo "Summary:"
echo "#######"
echo ""
echo "merged.GISAID.AF.raw.vcf.gz contain all merged variants as genotype array, including all samples"
echo ""
echo "combined_sites.raw.vcf.gz contain all merged variants as list"
echo ""
echo "merged.GISAID.AF.vcf and combined_sites.vcf contain merged variants, problematic sites excluded. See https://virological.org/t/issues-with-sars-cov-2-sequencing-data/473."
echo ""
echo "merged.GISAID.AF_1%.vcf contain merged variants with viral Frequency >=1%."
echo ""
echo "merged.GISAID.AF_1%.table contain merged variants (Viral Frequency >=1%), without VCF header, suitable for plotting"
echo ""
echo "All done."
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
mdate=`date +'%d/%m/%Y %H:%M:%S'`
mcpu=$[100-$(vmstat 1 2|tail -1|awk '{print $15}')]%
mmem=`free | grep Mem | awk '{print $3/$2 * 100.0}'`
echo "$mdate | $mcpu | $mmem" >> ./stats-cpu
###############################################################
#
} | tee logfile
#