-
Notifications
You must be signed in to change notification settings - Fork 1
/
LowCovGene_identification.sh
276 lines (245 loc) · 12.4 KB
/
LowCovGene_identification.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
#$ -pe smp 5
#$ -l h_vmem=50G
source ~/.bashrc
echo =================================================================
echo "Script started $(date)"
DATE=`date +%d%m%Y`
echo =================================================================
logfile=$sample.lowcov.$DATE.log
exec > $logfile 2>&1
mkdir -p lowcov
cd lowcov
sample_names="<s1> <s2> <s3> ... <sn>" # Replace <s1> <s2> <s3> ... <sn> with your sample IDs separated by a space each. You need to provide sn.real.bam
ref="<ref>" # Replace <ref> with reference name for which you have ref.fasta, ref.gff (gff3 file containing annotions), ref.faa (amino acid seuqences for annotated proteins)
### info: this script uses a consensus reference for the samples. If you have one reference per sample, you need to replave all $ref in the code below with $sample
genome_files="/path/to/genome_files" # folder to your genome files ref.fasta, ref.gff, ref.faa
bamfolder="/path/to/bam_files" # folder to your sorted read alignment files sn.real.bam or downsampled 100x.sn.bam (in the latter case change input bam-file name in script)
tools="/path/to/tools" # folder to tools PhylaAmphora, ucsc tools
# step 1
# calculate coverage per base for each sample using bedtools
# required input files: bam file with reads aligned to reference, gff file with annotation
echo "1. coverage per base calculations for $sample_names"
for sample in $sample_names
do
echo "=> processing sample $sample"
bedtools coverage -abam $bamfolder/$sample.real.bam -b $genome_files/$ref.gff -d > CovPerBase_$sample.txt
awk -F ";Name=" '{print $1"\t"$NF}' CovPerBase_$sample.txt | awk -F "\t" '{print $9"\t"$NF}' > xx && mv xx CovPerBase_$sample.txt
echo -e "gene\tcoverage" | cat - CovPerBase_$sample.txt > xx && mv xx CovPerBase_$sample.txt
done
# step 2
# calculate coverage per gene for each sample using R
# required input files: CovPerBase file from step 1
echo "2. coverage per gene calculations for $sample_names"
for sample in $sample_names
do
echo "=> processing sample $sample"
echo "data <-read.table("INFILE.txt", sep="\t", header=TRUE)
genes = unique(data$gene, incomparables=FALSE)
outfile <- rbind(c("genes", "coverage"))
for(i in 1:(length(genes))){
positions <- which(data$gene%in%genes[i])
mean <- mean(data$coverage[positions])
outfile <- rbind(outfile, rbind(c((as.character(genes[i])), mean)))
}
write.table(outfile, sep="\t", file="OUTFILE.txt")
proc.time()" > ./AverageCovPerGene_$sample\temp.R
sed -i "s/INFILE/CovPerBase_$sample/g" AverageCovPerGene_$sample\temp.R
sed -i "s/OUTFILE/CovPerGene_$sample/g" AverageCovPerGene_$sample\temp.R
R CMD BATCH AverageCovPerGene_$sample\temp.R
rm AverageCovPerGene_$sample\temp.R
done
# step 3
# extract marker genes with phyla_AMPHORA
# input files: $ref.faa $ref.fa
echo "3. extracting marker genes using Phyla_AMPHORA for $sample_names"
for sample in $sample_names
do
mkdir phylamphgenes_$sample
cd phylamphgenes_$sample
perl $tools/Phyla_AMPHORA-master/Scripts/MarkerScanner.pl -Phylum 3 -DNA $genome_files/$ref.fa # I used Phylum 3 (gammaproteobacteria) - needs to be adjusted according to the target group
perl $tools/Phyla_AMPHORA-master/Scripts/MarkerAlignTrim.pl -WithReference -OutputFormat phylip
perl $tools/Phyla_AMPHORA-master/Scripts/Phylotyping.pl -CPUs 3 > $sample.phylotype.result
cat *.pep > $sample.all_markers.fa
grep ">" $sample.all_markers.fa | sed 's/>//g' > all_marker_ids.txt
makeblastdb -dbtype prot -in $sample.all_markers.fa
blastp -query $genome_files/$ref.faa -db $sample.all_markers.fa -evalue 0.0001 -out blast_$sample\_all_markers -outfmt 6
awk '{if ($3~"100.00") print $0}' blast_$sample\_all_markers > $sample.markers.txt
sed 's/ID=//g' $genome_files/$ref.gff | awk -F ";Name=" '{print $1"\t"$2}' | awk -F "\t" '{print $9"\t"$NF"\t"$4"\t"$5"\t"$1}' > $sample\_genelist
sed -i -e "1d" $sample\_genelist
sort -k1,1 $sample.markers.txt > xx && mv xx $sample.markers.txt
sort -k1,1 $sample\_genelist > xx && mv xx $sample\_genelist
awk '{print $1}' $sample.markers.txt | join - $sample\_genelist -t ' ' | uniq > PhyAMPH_genes100ID_$sample # TAB separator!!
cd ..
done
# step 4
# produce files for plotting and extracting genes/singlecopy genes based on their coverage
# required input files: CovPerBase file from step 2, phylamphgenes/$sample.markers.txt from step 3
echo "4. file formatting for $sample_names"
for sample in $sample_names
do
echo "=> processing sample $sample"
sed -i "1d" CovPerGene_$sample.txt
sed 's/"//g' CovPerGene_$sample.txt > xx && mv xx CovPerGene_$sample.txt
awk '{print $2"\t"$3}' CovPerGene_$sample.txt > xx && mv xx CovPerGene_$sample.txt
sort -k1,1 CovPerGene_$sample.txt > CovPerGene_$sample\_sorted.txt
awk -F ";Name=" '{print $1"\t"$2}' $genome_files/$ref.gff | awk -F "\t" '{print $(NF-1)"\t"$1"\t"$4"\t"$5"\t"$NF}' - | sort -k1,1 - | join - CovPerGene_$sample\_sorted.txt -t ' ' > CovPerGene_$sample\_annot.txt # ' ' has to be a tab
$tools/ucsc_tools-master/executables/faCount $genome_files/$ref.fa | awk '{if ($1!~"#" && $1!~"total") print $1"\t"$2}' | sort -k1,1 > contig.length.$sample
sort -k2,2 CovPerGene_$sample\_annot.txt | join -1 2 -2 1 - contig.length.$sample -t ' ' | awk -F "\t" '{print $2"\t"$1"\t"$3"\t"$4"\t"$7"\t"$5"\t"$6}' | sort -k1,1 > xx && mv xx CovPerGene_$sample\_annot.txt # ' ' has to be a tab
awk '{print $NF"\t"$0}' CovPerGene_$sample\_annot.txt | sort -n | awk -F "\t" '{print NR"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8}' > CovPerGene_$sample.plot
echo -e "order\tID\tcontig\tstart\tend\tcontlength\tgene\tcoverage" | cat - CovPerGene_$sample.plot > xx && mv xx CovPerGene_$sample.plot # echo -e
sort -k2,2 CovPerGene_$sample.plot > CovPerGene_$sample.plot.sort
awk '{if ($4>=100) print $1}' phylamphgenes_$sample/$sample.markers.txt | sed 's/fig|/ID=fig|/g' | sort | uniq | sort -k1,1 | join -1 1 -2 2 - CovPerGene_$sample.plot.sort -t ' ' > CovPerGene_$sample.singlecopy.plot # ' ' has to be a tab
echo -e "ID\torder\tcontig\tstart\tend\tcontlength\tannotation\tcoverage" | cat - CovPerGene_$sample.singlecopy.plot > xx && mv xx CovPerGene_$sample.singlecopy.plot # echo -e
awk '{if ($0!~"hypothetical protein") print $0}' CovPerGene_$sample.singlecopy.plot > xx && mv xx CovPerGene_$sample.singlecopy.plot
awk '{if ($4>=100 || $5<=($6-100)) print $0}' CovPerGene_$sample.singlecopy.plot > xx && mv xx CovPerGene_$sample.singlecopy.plot
#### duplicate annotations removal:
awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$8"\t"$7}' CovPerGene_$sample.singlecopy.plot | sed 's/ /xxx/g' | sort -k8 | uniq -u -f7 | sed 's/xxx/ /g' | awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$8"\t"$7}' | sort -k1 -r > xx && mv xx CovPerGene_$sample.singlecopy.plot
done
# step 5
# define low coverage genes using R script
# required input files: CovPerGene_$sample.plot CovPerGene_$sample.singlecopy.plot
echo "5. define low coverage genes using R script for $sample_names"
echo "creating RplottingLowCov.R"
echo "library(ggplot2, lib.loc="'"~/R/"'")" > RplottingLowCov.R
echo "library(data.table, lib.loc="'"~/R/"'")" >> RplottingLowCov.R
echo "library(labeling, lib.loc="'"~/R/"'")" >> RplottingLowCov.R
echo "library(digest, lib.loc="'"~/R/"'")" >> RplottingLowCov.R
for sample in $sample_names
do
echo "" >> RplottingLowCov.R
echo "AllGenes"$sample" <- fread('./CovPerGene_"$sample".plot')" >> RplottingLowCov.R
echo "SingleCopy"$sample" <- fread('./CovPerGene_"$sample".singlecopy.plot')" >> RplottingLowCov.R
echo "setnames(AllGenes"$sample", 'gene', 'ID')" >> RplottingLowCov.R
echo "AllGenes"$sample"[ , Type := 'AllGenes']" >> RplottingLowCov.R
echo "SingleCopy"$sample"[ , Type := 'SingleCopy']" >> RplottingLowCov.R
echo "str(AllGenes"$sample")" >> RplottingLowCov.R
echo "str(SingleCopy"$sample")" >> RplottingLowCov.R
echo "bind."$sample" <- rbind(AllGenes"$sample", SingleCopy"$sample", fill = TRUE)" >> RplottingLowCov.R
done
for sample in $sample_names
do
echo "
library(gridExtra, lib.loc="'"~/R/"'")
get_legend<-function(myggplot){
tmp <- ggplot_gtable(ggplot_build(myggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "'"guide-box"'")
legend <- tmp$grobs[[leg]]
return(legend)
}" >> RplottingLowCov.R
done
for sample in $sample_names
do
echo "
box"$sample" <- ggplot(
data = "bind.$sample",
mapping = aes(
x = Type,
y = coverage,
fill = Type
)
) + geom_boxplot(
outlier.size = 1,
#outlier.colour = 'black'
) + theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
legend.position="'"none"'"
) + scale_y_log10()
scat"$sample" <- ggplot(
data = "bind.$sample",
mapping = aes(
x = order,
y = coverage,
colour = Type
)
) + geom_point(
size = 1
) + theme(
legend.position="'"right"'",
legend.title=element_blank(),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()
) + scale_y_log10()
">> RplottingLowCov.R
done
echo "" >> RplottingLowCov.R
insamp=$(for sample in $sample_names; do echo "scat"$sample", box"$sample","; done | tr '\n' ' ')
for sample in $sample_names
do
echo "scat"$sample" <- scat"$sample" + theme(legend.position="'"none"'")" >> RplottingLowCov.R
done
myvar=$(for sample in $sample_names ; do echo $sample; done | wc -l)
echo "myvar: "$myvar
myvar2=$(echo "scale=1; $myvar/2+0.0005" | bc)
echo "myvar2: "$myvar2
nrow=$(printf "%.0f\n" $(echo $myvar2 | bc -l))
echo "nrow: "$nrow
widths=$(for ((i=1; i<=2; i++)); do printf "%0.s62"; done | sed 's/62/6, 2/g' | sed 's/26/2, 6/g')
echo "widths: "$widths
heights=$(for ((i=1; i<=$nrow; i++)); do printf "%0.s10"; done | sed 's/01/0, 1/g')
echo "heights: "$heights
col=$(if [[ $myvar == 1 ]] ; then echo "2"; else echo "4"; fi)
echo "col: "$col
#echo "grid.arrange("$insamp" legend, ncol = "$col", nrow = "$nrow", widths=c("$widths"), heights=c("$heights"))" >> RplottingLowCov.R
echo "grid.arrange("$insamp" ncol = "$col", nrow = "$nrow", widths=c("$widths"), heights=c("$heights"))" >> RplottingLowCov.R
echo -e "\n"
for sample in $sample_names
do
echo "stats"$sample" <- boxplot.stats(SingleCopy"$sample""'$'"coverage
)"'$'"stats
write.table(stats"$sample",sep="'"\t"'",file="'"./stats'$sample'"'")
out"$sample" <- boxplot.stats(SingleCopy"$sample""'$'"coverage
)"'$'"out
write.table(out"$sample",sep="'"\t"'",file="'"./out'$sample'"'")
"
done >> RplottingLowCov.R
######
echo "run RplottingLowCov.R"
R CMD BATCH RplottingLowCov.R
mv Rplots.pdf $ref.plots.pdf
#####
# step 6
# extract low coverage genes with annotation, count them, separate from hypotheticals
# required input files: stats$sample from step 5, CovPerGene_$sample\_annot.txt from step 4
echo "6. extract low coverage genes with annotation, count them, separate from hypotheticals for $sample_names"
for sample in $sample_names
do
echo "=> extracting low coverage genes from sample $sample"
lowcov=`awk '{if ($1=="\"1\"") print $2}' stats$sample`
awk -v var=$lowcov -F "\t" '{if($7<=var) print $0}' CovPerGene_$sample\_annot.txt | sed 's/ID=//g' > LowCovGenes_$sample
awk -v var=$lowcov -F "\t" '{if($7<=var) print $0}' CovPerGene_$sample\_annot.txt | sed 's/ID=//g' | grep -v "hypothetical protein" | sort | uniq > LowCovGenes_noHypo_$sample
done
## make amino acid fasta files for non-hypothetical low coverage outliers
for sample in $sample_names
do
echo "=> make amino acid fasta files for non-hypothetical low coverage outliers from sample $sample"
grep -v "hypothetical protein" LowCovGenes_$sample | awk '{print $1}' | grep ".peg." > LowCovGeneID_$sample
xargs samtools faidx $genome_files/$ref.faa < LowCovGeneID_$sample > LowCovGenes_$sample.fa
done
## write hypothetical and non-hypothetical low coverage outlier numbers to file
echo "Low coverage genes $sample" > LowCovCat.txt
for sample in $sample_names
do
echo "=> write hypothetical and non-hypothetical low coverage outlier numbers to file from sample $sample"
echo "$sample:" >> LowCovCat.txt
echo " " >> LowCovCat.txt
echo "hypotheticals" >> LowCovCat.txt
hyp=`grep "hypothetical protein" LowCovGenes_$sample | sort | uniq | wc -l`
echo $hyp >> LowCovCat.txt
echo "annotated" >> LowCovCat.txt
hyp=`grep -v "hypothetical protein" LowCovGenes_$sample | sort | uniq | wc -l`
echo $hyp >> LowCovCat.txt
echo "------------" >> LowCovCat.txt
done
echo =================================================================
echo "Script finished $(date)"
DATE=`date +%d%m%Y`
echo =================================================================