-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathidrWoControl
executable file
·208 lines (181 loc) · 9.8 KB
/
idrWoControl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/bin/bash
#PBS -l nodes=1:ppn=4
## initialize variables with default values
MACS_P_VALUE="1e-3"
IDR_THRESHOLD="0.01"
PROGDIR="/home/pundhir/software/idrCode"
GENOME="mm"
#### usage ####
usage() {
echo Program: "idrWoControl (run IDR analysis for ChIP-seq data, if control or replicates not available)"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: pundhir@binf.ku.dk
echo "Usage: idr -i <file(s)> -o <dir> [OPTIONS]"
echo "Options:"
echo " -i <file> [mapped ChIP file(s) in BAM format (separated by comma)]"
echo " [format: <identical file name>_Rep[1|2].bam]"
echo " [should be ABSOLUTE path]"
echo " -o <dir> [output directory (should be ABSOLUTE path)]"
echo "[OPTIONS]"
echo " -c <file> [mapped Control file(s) in BAM format (separated by comma)]"
echo " [format: <identical file name>_Rep[1|2].bam]"
echo " [should be ABSOLUTE path]"
echo " -p <dir> [path to dependent R scripts (default: /home/pundhir/software/idrCode)]"
echo " -t <float> [IDR threshold (default: 0.01)]"
echo " -g <string> [effective genome size, required by macs2 (default: mm)]"
echo " [availble: hs|mm|ce|dm]"
echo " -h [help]"
echo
exit 0
}
## function to compute the factorial of a number
factorial(){
fact=1
n=$1
while [ $n -ge 1 ]; do
fact=`expr $fact \* $n`
n=`expr $n - 1`
done
echo "$fact"
}
#### parse options ####
while getopts i:o:c:p:t:g:h ARG; do
case "$ARG" in
i) CHIP_INPUT=$OPTARG;;
o) OUTDIR=$OPTARG;;
c) CONTROL_INPUT=$OPTARG;;
p) PROGDIR=$OPTARG;;
t) IDR_THRESHOLD=$OPTARG;;
g) GENOME=$OPTARG;;
h) HELP=1;;
esac
done
## usage, if necessary file and directories and not given/exist
if [ ! "$CHIP_INPUT" -o ! "$OUTDIR" -o "$HELP" ]; then
usage
fi
## create appropriate directory structure
echo
echo -n "Create appropriate directory structure.. "
if [ ! -d "$OUTDIR/homer" ]; then
mkdir -p $OUTDIR/macs
mkdir -p $OUTDIR/quality
mkdir -p $OUTDIR/logs
mkdir -p $OUTDIR/consistency
mkdir -p $OUTDIR/consistency/reps
fi
echo "done"
###############################################
## determine number of input ChIP samples
IFS=","
TMP=($CHIP_INPUT)
CHIP_COUNT=${#TMP[@]}
IFS=" "
## determine common id of the input ChIP samples
CHIP_ID=`echo $TMP[0] | sed 's/^.*\///g' | sed 's/Rep.*//g'`
## determine directory of the input ChIP/control sample files
INDIR=`echo $TMP[0] | perl -ane 'if($_=~/\//) { $_=~s/\/[^\/]+$//g; print $_; } else { print "."; }'`
echo "Total ChIP samples: $CHIP_COUNT ($CHIP_ID)"
## determine number of input control samples
if [ ! -z "$CONTROL_INPUT" ]; then
IFS=","
TMP=($CONTROL_INPUT)
CONTROL_COUNT=${#TMP[@]}
IFS=" "
## determine common id of the input control samples
CONTROL_ID=`echo $TMP[0] | sed 's/^.*\///g' | sed 's/Rep.*//g'`
echo "Total control samples: $CONTROL_COUNT ($CONTROL_ID)"
fi
##########################################################################
############ CALL PEAKS ON INDIVIDUAL REPLICATES
##########################################################################
<<"COMMENT1"
COMMENT1
## determine fragment length for all ChIP samples using macs
echo -n "Determine fragment length of each ChIP sample.. "
FRAGMENT_LENGTH=(0)
for (( i=1; i<=$CHIP_COUNT; i++ )); do
if [ ! -f "$OUTDIR/logs/$CHIP_ID"Rep"$i.predictd" ]; then
macs2 predictd -i $INDIR/$CHIP_ID"Rep"$i.bam -g $GENOME --outdir $OUTDIR/logs 2> $OUTDIR/logs/$CHIP_ID"Rep"$i.predictd
fi
FRAGMENT_LENGTH[$i]=`grep "predicted fragment length" $OUTDIR/logs/$CHIP_ID"Rep"$i.predictd | perl -ane 'print $F[scalar(@F)-2];'`
done
echo "done"
<<"COMMENT"
COMMENT
## peak calling on all the ChIP samples using macs
echo -n "Call peaks on each ChIP sample.. "
for (( i=1; i<=$CHIP_COUNT; i++ )); do
#SHIFT_SIZE=$(echo "scale=0; ${FRAGMENT_LENGTH[$i]}/2" | bc)
SHIFT_SIZE=`echo $(( FRAGMENT_LENGTH[$i] ))`
if [ ! -f "$OUTDIR/macs/$CHIP_ID"Rep"$i""_peaks.narrowPeak" ]; then
if [ ! -z "$CONTROL_INPUT" ]; then
macs2 callpeak -t $INDIR/$CHIP_ID"Rep"$i.bam -c $INDIR/$CONTROL_ID"Rep"$i.bam -n $CHIP_ID"Rep"$i -g $GENOME -p $MACS_P_VALUE --nomodel --extsize $SHIFT_SIZE --outdir $OUTDIR/macs/ &>$OUTDIR/logs/$CHIP_ID"Rep"$i.log
else
macs2 callpeak -t $INDIR/$CHIP_ID"Rep"$i.bam -n $CHIP_ID"Rep"$i -g $GENOME -p $MACS_P_VALUE --nomodel --extsize $SHIFT_SIZE --outdir $OUTDIR/macs/ &>$OUTDIR/logs/$CHIP_ID"Rep"$i.log
fi
fi
done
echo "done"
##########################################################################
############ IDR ANALYSIS ON ORIGINAL CHIP SAMPLES
##########################################################################
## move to idrCode directory
cd $PROGDIR
if [ "$CHIP_COUNT" -gt 1 ]; then
## IDR analysis on the original ChIP samples
echo -n "IDR analysis on the original ChIP samples.. "
COMMAND=""
for (( i=1; i<=$CHIP_COUNT; i++ )); do
for (( j=i+1; j<=$CHIP_COUNT; j++ )); do
Rscript batch-consistency-analysis.r $OUTDIR/macs/$CHIP_ID"Rep"$i"_peaks.narrowPeak" $OUTDIR/macs/$CHIP_ID"Rep"$j"_peaks.narrowPeak" -1 $OUTDIR/consistency/reps/$CHIP_ID"Rep"$i"_Vs_"$CHIP_ID"Rep"$j 0 F p.value
COMMAND="$COMMAND "$OUTDIR/consistency/reps/$CHIP_ID"Rep"$i"_Vs_"$CHIP_ID"Rep"$j
done
done
## plot IDR plots
numerator=$( factorial $CHIP_COUNT );
denominator=$(( 2 * $( factorial `expr $CHIP_COUNT - 2` ) ));
plot_argument=`expr $numerator / $denominator`;
#echo -e "$numerator\t$denominator\t$plot_argument";
Rscript batch-consistency-plot.r $plot_argument $OUTDIR/consistency/reps/chipSampleAllReps $COMMAND 2>/dev/null
echo "done"
fi
## compute enrichment and quality measure for input ChIP-seq data
echo -n "Compute enrichment and quality measure for input ChIP-seq data.. "
if [ -e "$PROGDIR/phantompeakqualtools/run_spp.R" ]; then
for (( i=1; i<=$CHIP_COUNT; i++ )); do
#Rscript $PROGDIR/phantompeakqualtools/run_spp.R -c=$INDIR/$CHIP_ID"Rep"$i.bam -fdr=$IDR_THRESHOLD -savp -odir=$OUTDIR/quality -out=$OUTDIR/quality/quality.txt -tmpdir=$OUTDIR/quality &> $OUTDIR/logs/quality.log
samtools view -b -F 1548 -q 30 $INDIR/$CHIP_ID"Rep"$i.bam | bamToBed -i stdin | awk 'BEGIN{S="\t";OFS="\t"}{$4="N";print $0}' | gzip -c > $OUTDIR/quality/$CHIP_ID"Rep"$i.tagAlign.gz
Rscript $PROGDIR/phantompeakqualtools/run_spp.R -c=$OUTDIR/quality/$CHIP_ID"Rep"$i.tagAlign.gz -fdr=$IDR_THRESHOLD -savp -odir=$OUTDIR/quality -out=$OUTDIR/quality/quality.txt &> $OUTDIR/logs/quality.log
rm $OUTDIR/quality/$CHIP_ID"Rep"$i.tagAlign.gz
done
fi
echo "done"
## create final output file in BED format
echo -n "Create final output file in BED format.. "
if [ "$CHIP_COUNT" -gt 2 ]; then
COMMAND=""
for (( i=1; i<=$CHIP_COUNT; i++ )); do
for (( j=i+1; j<=$CHIP_COUNT; j++ )); do
## IDR output sometimes contains start coordinate higher than the end coordinate
#cat $OUTDIR/consistency/reps/$CHIP_ID"Rep"$i"_Vs_"$CHIP_ID"Rep"$j"-overlapped-peaks.txt" | grep -v start | perl -ane '$_=~s/\"//g; @t=split(/\s+/, $_); $start=$t[2]; if($t[2]>$t[6]) { $start=$t[6]; } $end=$t[3]; if($t[3]<$t[7]) { $end=$t[7]; } $mean_signal=sprintf("%0.5f", ($t[4]+$t[8])/2); print "$t[1]\t$start\t$end\t$mean_signal\t$t[9]\t$t[10]\n";' | perl -ane 'if($F[1]<$F[2]) { print $_; }' > $OUTDIR/consistency/reps/$CHIP_ID"Rep"$i"_Vs_"$CHIP_ID"Rep"$j".bed"
cat $OUTDIR/consistency/reps/$CHIP_ID"Rep"$i"_Vs_"$CHIP_ID"Rep"$j"-overlapped-peaks.txt" | grep -v start | perl -ane '$_=~s/\"//g; @t=split(/\s+/, $_); $start=$t[2]; if($t[2]>$t[6]) { $start=$t[6]; } $end=$t[3]; if($t[3]<$t[7]) { $end=$t[7]; } $mean_signal=sprintf("%0.5f", ($t[4]+$t[8])/2); print "$t[1]\t$start\t$end\t$mean_signal\t$t[9]\t$t[10]\n";' | perl -ane 'if($F[1]<$F[2] && $F[7]<'$IDR_THRESHOLD') { print $_; }' > $OUTDIR/consistency/reps/$CHIP_ID"Rep"$i"_Vs_"$CHIP_ID"Rep"$j".bed"
COMMAND="$COMMAND "$OUTDIR/consistency/reps/$CHIP_ID"Rep"$i"_Vs_"$CHIP_ID"Rep"$j".bed,"
done
done
multiIntersectBed.sh -i $COMMAND | perl -ane '$i++; print "$F[0]\t$F[1]\t$F[2]\tpeak_$i\t-1\t.\t-1\t-1\n";' | sort -k 1,1 -k 2n,2 -k 3n,3 > $OUTDIR/$CHIP_ID"_peaks.bed"
elif [ "$CHIP_COUNT" -gt 1 ]; then
## IDR output sometimes contains start coordinate higher than the end coordinate
#cat $OUTDIR/consistency/reps/$CHIP_ID"Rep1_Vs_"$CHIP_ID"Rep2-overlapped-peaks.txt" | grep -v start | perl -ane '$_=~s/\"//g; @t=split(/\s+/, $_); $start=$t[2]; if($t[2]>$t[6]) { $start=$t[6]; } $end=$t[3]; if($t[3]<$t[7]) { $end=$t[7]; } $mean_signal=sprintf("%0.5f", ($t[4]+$t[8])/2); $i++; print "$t[1]\t$start\t$end\tpeak_$i\t$mean_signal\t.\t$t[9]\t$t[10]\n";' | perl -ane 'if($F[1]<$F[2]) { print $_; }' | sort -k 1,1 -k 2n,2 -k 3n,3 > $OUTDIR/$CHIP_ID"peaks.bed"
cat $OUTDIR/consistency/reps/$CHIP_ID"Rep1_Vs_"$CHIP_ID"Rep2-overlapped-peaks.txt" | grep -v start | perl -ane '$_=~s/\"//g; @t=split(/\s+/, $_); $start=$t[2]; if($t[2]>$t[6]) { $start=$t[6]; } $end=$t[3]; if($t[3]<$t[7]) { $end=$t[7]; } $mean_signal=sprintf("%0.5f", ($t[4]+$t[8])/2); $i++; print "$t[1]\t$start\t$end\tpeak_$i\t$mean_signal\t.\t$t[9]\t$t[10]\n";' | perl -ane 'if($F[1]<$F[2] && $F[7]<'$IDR_THRESHOLD') { print $_; }' | sort -k 1,1 -k 2n,2 -k 3n,3 > $OUTDIR/$CHIP_ID"peaks.bed"
else
cat $OUTDIR/macs/$CHIP_ID"Rep1_peaks.narrowPeak" | perl -ane '$i++; print "$F[0]\t$F[1]\t$F[2]\tpeak_$i\t$F[4]\t$F[5]\t$F[7]\t$F[8]\n";' | sort -k 1,1 -k 2n,2 -k 3n,3 > $OUTDIR/$CHIP_ID"peaks.bed"
fi
## IDR output sometimes contains start coordinate higher than the end coordinate
#cat $OUTDIR/$CHIP_ID"peaks.bed" | perl -ane 'if($F[1]<$F[2]) { print $_; print '$CHIP_ID'."\n"; }'|
echo "done"
#for (( i=1; i<=$CHIP_COUNT; i++ )); do
# makeTagDirectory $OUTDIR/homer/$CHIP_ID"Rep"$i $INDIR/$CHIP_ID"Rep"$i.bam
#done
exit