Skip to content

Commit

Permalink
v2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
smonger committed Oct 4, 2019
1 parent 5d4c105 commit d70ac05
Show file tree
Hide file tree
Showing 12 changed files with 1,418 additions and 3,543 deletions.
195 changes: 123 additions & 72 deletions RUN.sh
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
#!/bin/bash
# Print the Spliceogen release banner (version and release date) to stdout.
# Only the current release is reported; the stale 1.0 banner that was also
# being printed has been dropped so the output names a single version.
function printVersion {
  printf '%s\n' "Spliceogen 2.0 1-October-2019"
}
function printHelp {
cat <<-END
Usage:
------
3 required args:
1) -inputVCF path/to/VCF/input(s).VCF
OR
-inputBED path/to/input(s).BED
Note: wildcard matching of multiple files is allowed
2) -gtf path/to/annotation.GTF
3) -fasta path/to/genome.fasta
1) -input path/to/VCF/input/file(s).VCF
Note: multiple input files are accepted "eg. -input *.vcf"
Note: deprecated v1.0 tags "inputVCF" and "inputBED" are still accepted
2) -gtf path/to/annotation.gtf
3) -fasta path/to/genome.fa
optional arg:
4) -branchpointer hgXX
OR
Expand All @@ -22,8 +21,6 @@ END
}
#set default parameters
#These are the initial values used when the corresponding command-line option is not supplied.
#POSITIONAL: starts as an empty array; its use is not visible in this portion of the script.
POSITIONAL=()
#INPUTVCF: set to "TRUE" by the -inputVCF option; compared against "TRUE" when choosing the interval-generation branch.
INPUTVCF="FALSE"
#INPUTBED: set to "TRUE" by the -inputBED option; compared against "TRUE" when choosing the interval-generation branch.
INPUTBED="FALSE"
#INPUTFILES: space-separated list of input paths collected by the -input/-inputVCF/-inputBED options and iterated by the main "for FILE in $INPUTFILES" loop.
INPUTFILES=""
#USEBP: tested for non-emptiness to decide whether bpInput.txt is written, and compared to "TRUE" before running Branchpointer.
#NOTE(review): presumably set by the -branchpointer option; that code is not visible here - confirm.
USEBP=""
#USEBPINDELS: compared to "FALSE" alongside USEBP to select the Branchpointer SNP run.
#NOTE(review): where this is assigned is not visible here - confirm.
USEBPINDELS=""
Expand All @@ -43,8 +40,16 @@ case $key in
printVersion
exit 0
;;
-input)
INPUTFILES="$2"
shift
shift
while [ "$1" ] && [[ ! $1 == *-* ]]; do
INPUTFILES="$INPUTFILES $1"
shift
done
;;
-inputVCF)
INPUTVCF="TRUE"
INPUTFILES="$2"
shift
shift
Expand All @@ -54,7 +59,6 @@ case $key in
done
;;
-inputBED)
INPUTBED="TRUE"
INPUTFILES="$2"
shift
shift
Expand Down Expand Up @@ -97,36 +101,26 @@ elif [ ! -f "$ANNOTATION" ]; then
echo "GTF annotation file not found: use -gtf path/to/gencodeXX.gtf\nExiting..."
exit 1
fi
checkGzip=$( file --mime-type "$FASTAPATH" "$ANNOTATION" "$INPUTFILES" | grep gzip)
if [ ! "$checkGzip" = "" ]; then
echo "Error: Input FASTA and GTF files must be unzipped. Exiting..."
exit 1
fi
#check input files for consistency in "chr" nomenclature
gtfChr=$(head "$ANNOTATION" | tail -1 | awk '{print $1}' | grep chr)
fastaChr=$(head -1 "$FASTAPATH" | awk '{print $1}' | grep chr)
firstInputFile=$(echo "$INPUTFILES" | awk '{print $1}')
inputChr=$(tail -1 "$firstInputFile" | awk '{print $1}' | grep chr)
warningChr="Warning: it appears the provided gtf, fasta, and input files use inconsistent Chromosome nomenclature. Eg. \"chr1\" vs \"1\". This will likely cause issues. Please edit them for consistency"
if [ "$gtfChr" == "" ]; then
#check input gtf/fasta "chr" nomenclature
gtfChr=$(zcat -f "$ANNOTATION" | grep -v '^GL000' | tail -1 | awk '{print $1}' | grep chr)
fastaChr=$(cat "$FASTAPATH" | head -1 | awk '{print $1}' | grep chr)
gtfChrAdd=""
gtfChrRemove="UnmatchedString"
if [ "$fastaChr" != "" ]; then
echo "$warningChr"
elif [ "$inputChr" != "" ]; then
echo "$warningChr"
if [ "$gtfChr" == "" ]; then
gtfChrAdd="chr"
fi
elif [ "$fastaChr" == "" ]; then
if [ "$gtfChr" != "" ]; then
gtfChrRemove="chr"
fi
fi
else
if [ "$fastaChr" == "" ]; then
echo "$warningChr"
elif [ "$inputChr" == "" ]; then
echo "$warningChr"
fi
fi
#prepare splice site intervals from annotation.gtf
gtfBasename=$(basename $ANNOTATION)
if [ ! -f data/"$gtfBasename"_SpliceSiteIntervals.txt ] || [[ "$ANNOTATION" -nt data/"$gtfBasename"_SpliceSiteIntervals.txt ]] ; then
echo "Preparing splice site annotation..."
grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' "$ANNOTATION" | grep -v '^GL000' |
java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt
zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' | grep -v '^GL000' |
sed "s/$gtfChrRemove//" | awk -v var="$gtfChrAdd" '{print var$0}' | java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt
fi
#for each input VCF/BED file
for FILE in $INPUTFILES; do
Expand All @@ -139,9 +133,39 @@ for FILE in $INPUTFILES; do
echo "Input file: $fileID"
#remove temp files from any previous run
rm temp/"$fileID"* 2> /dev/null
#check input file type
FILETYPE=""
nFields=$(zcat -f $FILE | tail -1 | wc -w)
vcfHeader=$(zcat -f $FILE | head -1 | grep VCF)
if [ "$nFields" -eq 4 ]; then
FILETYPE="TSV"
elif [ ! -z "$vcfHeader" ]; then
FILETYPE="VCF"
else
FILETYPE="BED"
fi
echo "File type: $FILETYPE"
#correct mismatches in "chr" nomenclature among input files
inputChr=$(zcat -f "$FILE" | tail -1 | awk '{print $1}' | grep chr)
inputChrAdd=""
inputChrRemove="UnmatchedString"
if [ "$fastaChr" != "" ]; then
if [ "$inputChr" == "" ]; then
inputChrAdd="chr"
fi
elif [ "$fastaChr" == "" ]; then
if [ "$inputChr" != "" ]; then
inputChrRemove="chr"
fi
fi
#sort body of input file
grep "^#" "$FILE" > temp/"$fileID"_sorted
grep -v "^#" "$FILE" | sort -k1,1 -k2,2n >> temp/"$fileID"_sorted
zcat -f "$FILE" | grep "^#" > temp/"$fileID"_sorted
if [ "$FILETYPE" == "TSV" ]; then
zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$1, $2, $2, "x", "1", ".", $3, $4}' >> temp/"$fileID"_sorted
else
#zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | sed "/^/$inputChrAdd/" >> temp/"$fileID"_sorted
zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$0}' >> temp/"$fileID"_sorted
fi
#check bedtools is installed
bedtoolsLocation=$(which bedtools);
if [ "$bedtoolsLocation" == "" ]; then
Expand All @@ -157,7 +181,7 @@ for FILE in $INPUTFILES; do
fi
#bedtools intersect to get strand info
echo "Retrieving strand info..."
grep '[[:blank:]]gene[[:blank:]]' "$ANNOTATION" | sort -k1,1 -k4,4n | grep -v '^GL000' | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted > temp/"$fileID"unstrandedInput.txt
zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]' | sort -k1,1 -k4,4n | grep -v '^GL000' | awk -v var="$gtfChrAdd" -v OFS="\\t" '{print var$0}' | sed "s/$gtfChrRemove//" | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted > temp/"$fileID"unstrandedInput.txt
if [ $? -ne 0 ]; then
echo "Warning. Bedtools intersect returned non-zero exit status. Intersection failed between provided variant VCF/BED file and provided GTF. See above error message for more details"
fi
Expand All @@ -166,10 +190,10 @@ for FILE in $INPUTFILES; do
exit 1
fi
#generate flanking intervals.bed for bedtools getfasta and branchpointer input
if [ "$INPUTVCF" = "TRUE" ]; then
if [ "$FILETYPE" = "VCF" ]; then
grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $4, $5}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed
grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $4, $5}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed
elif [ "$INPUTBED" = "TRUE" ]; then
elif [ "$FILETYPE" = "TSV" ] || [ "$FILETYPE" = "BED" ] ; then
grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $7, $8}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed
grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $7, $8}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed
fi
Expand All @@ -185,25 +209,42 @@ for FILE in $INPUTFILES; do
fi
#seqScan: generates input strings for maxentscan and genesplicer as well as ESRseq scores
echo "Scanning for motifs..."
rm output/mesOmmitted/"$fileID" 2> /dev/null
rm output/"$fileID"mesOmmitted.txt 2> /dev/null
rm output/"$fileID"refMismatch.txt 2> /dev/null
java -cp bin seqScan temp/"$fileID"seqToScan.FASTA -useESR $fileID 1>&2
if [ -s output/"$fileID"refMismatch.txt ]; then
refMismatchCount=$(wc -l output/"$fileID"refMismatch.txt | awk '{print $1}')
echo "Note: $refMismatchCount variants were excluded because the provided Reference allele does not match the nucleotide(s) in the provided FASTA. IDs of excluded variant(s) are outputted here: Spliceogen/output/""$fileID""refMismatch.txt"
fi
if [ -s output/"$fileID"mesOmmitted.txt ]; then
mesOmmittedCount=$(wc -l output/"$fileID"mesOmmitted.txt | awk '{print $1}')
echo "Note: $mesOmmittedCount variants were excluded from MaxEntScan because their flanking FASTA sequence contains invalid characters (most commonly \"n\"), which cannot be processed by MaxEntScan. IDs of ommitted variant(s) are listed in: Spliceogen/output/""$fileID""mesOmmitted.txt"
fi
#run MaxEntScan and check for a non-zero exit status, since invalid inputs cause it to exit early
echo "Running MaxEntScan..."
perl score5.pl temp/"$fileID"mesDonorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt
retVal=( ${PIPESTATUS[0]} )
if [ $retVal -ne 0 ]; then
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
exit $retVal
fi
perl score3.pl temp/"$fileID"mesAcceptorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt
retVal=( ${PIPESTATUS[0]} )
if [ $retVal -ne 0 ]; then
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
exit $retVal
if [ -s temp/"$fileID"mesDonorInput.txt ] || [ -s temp/"$fileID"mesAcceptorInput.txt ] ; then
echo "Running MaxEntScan..."
perl score5.pl temp/"$fileID"mesDonorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt
retVal=( ${PIPESTATUS[0]} )
if [ $retVal -ne 0 ]; then
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
exit $retVal
fi
perl score3.pl temp/"$fileID"mesAcceptorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt
retVal=( ${PIPESTATUS[0]} )
if [ $retVal -ne 0 ]; then
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
exit $retVal
fi
else
echo "No input for MaxEntScan"
fi
#run genesplicer
echo "Running GeneSplicer..."
bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt
if [ -s temp/"$fileID"gsInput.FASTA ] ; then
echo "Running GeneSplicer..."
bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt
else
echo "No input for GeneSplicer"
fi
#run branchpointer SNPs
if [ "$USEBP" = "TRUE" -a "$USEBPINDELS" = "FALSE" ]; then
echo "Running Branchpointer..."
Expand All @@ -226,25 +267,35 @@ for FILE in $INPUTFILES; do
#awk -v OFS=\\t '{print $2, $3, $4, $8, $9, $15, $16, $22, $23, $24, $25}' output/"$fileID"bpOutputSNPs.txt" > output/bpSNPsSummarised.txt
fi
#merge scores into one line
echo "Processing scores..."
cat temp/"$fileID"mesDonorScores.txt temp/"$fileID"mesAcceptorScores.txt temp/"$fileID"gsScores.txt temp/"$fileID"ESRoutput.txt data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt |
sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID"
#sort predictions
if [ -s temp/"$fileID"_donorCreating_unsorted.txt ]; then
sort -gr -k11,11 temp/"$fileID"_donorCreating_unsorted.txt >> output/"$fileID"_donorCreating.txt
else
rm output/"$fileID"_donorCreating.txt
fi
if [ -s temp/"$fileID"_acceptorCreating_unsorted.txt ]; then
sort -gr -k11,11 temp/"$fileID"_acceptorCreating_unsorted.txt >> output/"$fileID"_acceptorCreating.txt
scoresToMerge=""
if [ -s temp/"$fileID"gsScores.txt ] ; then
scoresToMerge="temp/"$fileID"gsScores.txt"
fi
if [ -s temp/"$fileID"mesDonorScores.txt ] ; then
scoresToMerge="$scoresToMerge temp/"$fileID"mesDonorScores.txt"
fi
if [ -s temp/"$fileID"mesAcceptorScores.txt ] ; then
scoresToMerge="$scoresToMerge temp/"$fileID"mesAcceptorScores.txt"
fi
if [ -s temp/"$fileID"ESRoutput.txt ] ; then
scoresToMerge="$scoresToMerge temp/"$fileID"ESRoutput.txt"
fi
checkScoresExist=$(echo "$scoresToMerge" | grep "temp")
if [ -z "$checkScoresExist" ]; then
echo "No MaxEntScan/GeneSplicer/ESRseq scores to process"
else
rm output/"$fileID"_acceptorCreating.txt
echo "Processing scores..."
cat $(echo "$scoresToMerge") data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt | sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID"
fi
#sort predictions
if [ -s temp/"$fileID"_gain_unsorted.txt ]; then
echo -e "#CHR\tSTART\tREF\tALT\tGENE\tdonGainP\taccGainP" > output/"$fileID"_ssGain.txt
sort -gr -k8,8 temp/"$fileID"_gain_unsorted.txt | cut -f1-7 >> output/"$fileID"_ssGain.txt
fi
if [ -s temp/"$fileID"_withinSS_unsorted.txt ]; then
sort -gr -k17,17 temp/"$fileID"_withinSS_unsorted.txt | cut -f1-15 >> output/"$fileID"_withinSS.txt
else
rm output/"$fileID"_withinSS.txt
if [ -s temp/"$fileID"_loss_unsorted.txt ]; then
echo -e "#CHR\tSTART\tREF\tALT\tGENE\twithinSS\tdonGainP\taccGainP" > output/"$fileID"_withinSS.txt
sort -gr -k9,9 temp/"$fileID"_loss_unsorted.txt | cut -f1-8 >> output/"$fileID"_withinSS.txt
fi
#clean up temp files
rm temp/"$fileID"* 2> /dev/null
#rm temp/"$fileID"* 2> /dev/null
done
Binary file modified bin/mergeOutput.class
Binary file not shown.
Loading

0 comments on commit d70ac05

Please sign in to comment.