diff --git a/RUN.sh b/RUN.sh index afc2eca..3975f5d 100755 --- a/RUN.sh +++ b/RUN.sh @@ -1,18 +1,17 @@ #!/bin/bash function printVersion { - printf "Spliceogen 1.0 1-March-2019\n" + printf "Spliceogen 2.0 1-October-2019\n" } function printHelp { cat <<-END Usage: ------ 3 required args: -1) -inputVCF path/to/VCF/input(s).VCF - OR - -inputBED path/to/input(s).BED - Note: wildcard matching of multiple files is allowed -2) -gtf path/to/annotation.GTF -3) -fasta path/to/genome.fasta +1) -input path/to/VCF/input/file(s).VCF + Note: multiple input files are accepted "eg. -input *.vcf" + Note: deprecated v1.0 tags "inputVCF" and "inputBED" are still accepted +2) -gtf path/to/annotation.gtf +3) -fasta path/to/genome.fa optional arg: 4) -branchpointer hgXX OR @@ -22,8 +21,6 @@ END } #set default parameters POSITIONAL=() -INPUTVCF="FALSE" -INPUTBED="FALSE" INPUTFILES="" USEBP="" USEBPINDELS="" @@ -43,8 +40,16 @@ case $key in printVersion exit 0 ;; + -input) + INPUTFILES="$2" + shift + shift + while [ "$1" ] && [[ ! $1 == *-* ]]; do + INPUTFILES="$INPUTFILES $1" + shift + done + ;; -inputVCF) - INPUTVCF="TRUE" INPUTFILES="$2" shift shift @@ -54,7 +59,6 @@ case $key in done ;; -inputBED) - INPUTBED="TRUE" INPUTFILES="$2" shift shift @@ -97,36 +101,26 @@ elif [ ! -f "$ANNOTATION" ]; then echo "GTF annotation file not found: use -gtf path/to/gencodeXX.gtf\nExiting..." exit 1 fi -checkGzip=$( file --mime-type "$FASTAPATH" "$ANNOTATION" "$INPUTFILES" | grep gzip) -if [ ! "$checkGzip" = "" ]; then - echo "Error: Input FASTA and GTF files must be unzipped. Exiting..." - exit 1 -fi -#check input files for consistenty in "chr" nomenlature -gtfChr=$(head "$ANNOTATION" | tail -1 | awk '{print $1}' | grep chr) -fastaChr=$(head -1 "$FASTAPATH" | awk '{print $1}' | grep chr) -firstInputFile=$(echo "$INPUTFILES" | awk '{print $1}') -inputChr=$(tail -1 "$firstInputFile" | awk '{print $1}' | grep chr) -warningChr="Warning: it appears the provided gtf, fasta, and input files use inconsistent Chromosome nomenclature. Eg. \"chr1\" vs \"1\". This will likely cause issues. Please edit them for consistency" -if [ "$gtfChr" == "" ]; then +#check input gtf/fasta "chr" nomenclature +gtfChr=$(zcat -f "$ANNOTATION" | grep -v '^GL000' | tail -1 | awk '{print $1}' | grep chr) +fastaChr=$(cat "$FASTAPATH" | head -1 | awk '{print $1}' | grep chr) + gtfChrAdd="" + gtfChrRemove="UnmatchedString" if [ "$fastaChr" != "" ]; then - echo "$warningChr" - elif [ "$inputChr" != "" ]; then - echo "$warningChr" + if [ "$gtfChr" == "" ]; then + gtfChrAdd="chr" + fi + elif [ "$fastaChr" == "" ]; then + if [ "$gtfChr" != "" ]; then + gtfChrRemove="chr" + fi fi -else - if [ "$fastaChr" == "" ]; then - echo "$warningChr" - elif [ "$inputChr" == "" ]; then - echo "$warningChr" - fi -fi #prepare splice site intervals from annotation.gtf gtfBasename=$(basename $ANNOTATION) if [ ! -f data/"$gtfBasename"_SpliceSiteIntervals.txt ] || [[ "$ANNOTATION" -nt data/"$gtfBasename"_SpliceSiteIntervals.txt ]] ; then echo "Preparing splice site annotation..." - grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' "$ANNOTATION" | grep -v '^GL000' | - java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt + zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' | grep -v '^GL000' | + sed "s/$gtfChrRemove//" | awk -v var="$gtfChrAdd" '{print var$0}' | java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt fi #for each input VCF/BED file for FILE in $INPUTFILES; do @@ -139,9 +133,39 @@ for FILE in $INPUTFILES; do echo "Input file: $fileID" #remove temp files from any previous run rm temp/"$fileID"* 2> /dev/null + #check input file type + FILETYPE="" + nFields=$(zcat -f $FILE | tail -1 | wc -w) + vcfHeader=$(zcat -f $FILE | head -1 | grep VCF) + if [ "$nFields" -eq 4 ]; then + FILETYPE="TSV" + elif [ ! -z "$vcfHeader" ]; then + FILETYPE="VCF" + else + FILETYPE="BED" + fi + echo "File type: $FILETYPE" + #correct mismatches in "chr" nomenclature among input files + inputChr=$(zcat -f "$FILE" | tail -1 | awk '{print $1}' | grep chr) + inputChrAdd="" + inputChrRemove="UnmatchedString" + if [ "$fastaChr" != "" ]; then + if [ "$inputChr" == "" ]; then + inputChrAdd="chr" + fi + elif [ "$fastaChr" == "" ]; then + if [ "$inputChr" != "" ]; then + inputChrRemove="chr" + fi + fi #sort body of input file - grep "^#" "$FILE" > temp/"$fileID"_sorted - grep -v "^#" "$FILE" | sort -k1,1 -k2,2n >> temp/"$fileID"_sorted + zcat -f "$FILE" | grep "^#" > temp/"$fileID"_sorted + if [ "$FILETYPE" == "TSV" ]; then + zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$1, $2, $2, "x", "1", ".", $3, $4}' >> temp/"$fileID"_sorted + else + #zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | sed "/^/$inputChrAdd/" >> temp/"$fileID"_sorted + zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$0}' >> temp/"$fileID"_sorted + fi #check bedtools is installed bedtoolsLocation=$(which bedtools); if [ "$bedtoolsLocation" == "" ]; then @@ -157,7 +181,7 @@ for FILE in $INPUTFILES; do fi #bedtools intersect to get strand info echo "Retrieving strand info..." - grep '[[:blank:]]gene[[:blank:]]' "$ANNOTATION" | sort -k1,1 -k4,4n | grep -v '^GL000' | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted > temp/"$fileID"unstrandedInput.txt + zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]' | sort -k1,1 -k4,4n | grep -v '^GL000' | awk -v var="$gtfChrAdd" -v OFS="\\t" '{print var$0}' | sed "s/$gtfChrRemove//" | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted > temp/"$fileID"unstrandedInput.txt if [ $? -ne 0 ]; then echo "Warning. Bedtools intersect returned non-zero exit status. Intersection failed between provided variant VCF/BED file and provided GTF. See above error message for more details" fi @@ -166,10 +190,10 @@ for FILE in $INPUTFILES; do exit 1 fi #generate flanking intervals.bed for bedtools getfasta and branchpointer input - if [ "$INPUTVCF" = "TRUE" ]; then + if [ "$FILETYPE" = "VCF" ]; then grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $4, $5}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $4, $5}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed - elif [ "$INPUTBED" = "TRUE" ]; then + elif [ "$FILETYPE" = "TSV" ] || [ "$FILETYPE" = "BED" ] ; then grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $7, $8}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $7, $8}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed fi @@ -185,25 +209,42 @@ for FILE in $INPUTFILES; do fi #seqScan: generates input strings for maxentscan and genesplicer as well as ESRseq scores echo "Scanning for motifs..." - rm output/mesOmmitted/"$fileID" 2> /dev/null + rm output/"$fileID"mesOmmitted.txt 2> /dev/null + rm output/"$fileID"refMismatch.txt 2> /dev/null java -cp bin seqScan temp/"$fileID"seqToScan.FASTA -useESR $fileID 1>&2 + if [ -s output/"$fileID"refMismatch.txt ]; then + refMismatchCount=$(wc -l output/"$fileID"refMismatch.txt | awk '{print $1}') + echo "Note: $refMismatchCount variants were excluded because the provided Reference allele does not match the nucleotide(s) in the provided FASTA. IDs of excluded variant(s) are outputted here: Spliceogen/output/""$fileID""refMismatch.txt" + fi + if [ -s output/"$fileID"mesOmmitted.txt ]; then + mesOmmittedCount=$(wc -l output/"$fileID"mesOmmitted.txt | awk '{print $1}') + echo "Note: $mesOmmittedCount variants were excluded from MaxEntScan because their flanking FASTA sequence contains invalid characters (most commonly \"n\"), which cannot be processed by MaxEntScan. IDs of ommitted variant(s) are listed in: Spliceogen/output/""$fileID""mesOmmitted.txt" + fi #run maxEntScan and confirm non-zero exit, since invalid inputs cause it to exit early - echo "Running MaxEntScan..." - perl score5.pl temp/"$fileID"mesDonorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt - retVal=( ${PIPESTATUS[0]} ) - if [ $retVal -ne 0 ]; then - echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..." - exit $retVal - fi - perl score3.pl temp/"$fileID"mesAcceptorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt - retVal=( ${PIPESTATUS[0]} ) - if [ $retVal -ne 0 ]; then - echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..." - exit $retVal + if [ -s temp/"$fileID"mesDonorInput.txt ] || [ -s temp/"$fileID"mesAcceptorInput.txt ] ; then + echo "Running MaxEntScan..." + perl score5.pl temp/"$fileID"mesDonorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt + retVal=( ${PIPESTATUS[0]} ) + if [ $retVal -ne 0 ]; then + echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..." + exit $retVal + fi + perl score3.pl temp/"$fileID"mesAcceptorInput.txt | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt + retVal=( ${PIPESTATUS[0]} ) + if [ $retVal -ne 0 ]; then + echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..." + exit $retVal + fi + else + echo "No input for MaxEntScan" fi #run genesplicer - echo "Running GeneSplicer..." - bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt + if [ -s temp/"$fileID"gsInput.FASTA ] ; then + echo "Running GeneSplicer..." + bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt + else + echo "No input for GeneSplicer" + fi #run branchpointer SNPs if [ "$USEBP" = "TRUE" -a "$USEBPINDELS" = "FALSE" ]; then echo "Running Branchpointer..." @@ -226,25 +267,35 @@ for FILE in $INPUTFILES; do #awk -v OFS=\\t '{print $2, $3, $4, $8, $9, $15, $16, $22, $23, $24, $25}' output/"$fileID"bpOutputSNPs.txt" > output/bpSNPsSummarised.txt fi #merge scores into one line - echo "Processing scores..." - cat temp/"$fileID"mesDonorScores.txt temp/"$fileID"mesAcceptorScores.txt temp/"$fileID"gsScores.txt temp/"$fileID"ESRoutput.txt data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt | - sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID" - #sort predictions - if [ -s temp/"$fileID"_donorCreating_unsorted.txt ]; then - sort -gr -k11,11 temp/"$fileID"_donorCreating_unsorted.txt >> output/"$fileID"_donorCreating.txt - else - rm output/"$fileID"_donorCreating.txt - fi - if [ -s temp/"$fileID"_acceptorCreating_unsorted.txt ]; then - sort -gr -k11,11 temp/"$fileID"_acceptorCreating_unsorted.txt >> output/"$fileID"_acceptorCreating.txt + scoresToMerge="" + if [ -s temp/"$fileID"gsScores.txt ] ; then + scoresToMerge="temp/"$fileID"gsScores.txt" + fi + if [ -s temp/"$fileID"mesDonorScores.txt ] ; then + scoresToMerge="$scoresToMerge temp/"$fileID"mesDonorScores.txt" + fi + if [ -s temp/"$fileID"mesAcceptorScores.txt ] ; then + scoresToMerge="$scoresToMerge temp/"$fileID"mesAcceptorScores.txt" + fi + if [ -s temp/"$fileID"ESRoutput.txt ] ; then + scoresToMerge="$scoresToMerge temp/"$fileID"ESRoutput.txt" + fi + checkScoresExist=$(echo "$scoresToMerge" | grep "temp") + if [ -z "$checkScoresExist" ]; then + echo "No MaxEntScan/GeneSplicer/ESRseq scores to process" else - rm output/"$fileID"_acceptorCreating.txt + echo "Processing scores..." + cat $(echo "$scoresToMerge") data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt | sort -k1,1 -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID" + fi + #sort predictions + if [ -s temp/"$fileID"_gain_unsorted.txt ]; then + echo -e "#CHR\tSTART\tREF\tALT\tGENE\tdonGainP\taccGainP" > output/"$fileID"_ssGain.txt + sort -gr -k8,8 temp/"$fileID"_gain_unsorted.txt | cut -f1-7 >> output/"$fileID"_ssGain.txt fi - if [ -s temp/"$fileID"_withinSS_unsorted.txt ]; then - sort -gr -k17,17 temp/"$fileID"_withinSS_unsorted.txt | cut -f1-15 >> output/"$fileID"_withinSS.txt - else - rm output/"$fileID"_withinSS.txt + if [ -s temp/"$fileID"_loss_unsorted.txt ]; then + echo -e "#CHR\tSTART\tREF\tALT\tGENE\twithinSS\tdonGainP\taccGainP" > output/"$fileID"_withinSS.txt + sort -gr -k9,9 temp/"$fileID"_loss_unsorted.txt | cut -f1-8 >> output/"$fileID"_withinSS.txt fi #clean up temp files - rm temp/"$fileID"* 2> /dev/null + #rm temp/"$fileID"* 2> /dev/null done diff --git a/bin/mergeOutput.class b/bin/mergeOutput.class index 79c45b5..8a12440 100644 Binary files a/bin/mergeOutput.class and b/bin/mergeOutput.class differ diff --git a/bin/mergeOutput.java b/bin/mergeOutput.java index 585c66a..ec17cbe 100644 --- a/bin/mergeOutput.java +++ b/bin/mergeOutput.java @@ -6,19 +6,17 @@ import java.text.DecimalFormat; public class mergeOutput { - //store output lines in buffers to minimise I/O + //store output lines in buffers to minimise I/O public static String[] avBuffer = new String[30000]; - public static String[] donBuffer = new String[30000]; - public static String[] accBuffer = new String[30000]; + public static String[] gainBuffer = new String[30000]; public static String[] ssBuffer = new String[30000]; + public static int gainBufferIndex = 0; public static int avBufferIndex = 0; - public static int donBufferIndex = 0; - public static int accBufferIndex = 0; public static int ssBufferIndex = 0; public static void main (String[] args) { if (args.length < 1) { - System.out.println("must provide vcf/bed file name as command line argument"); + System.out.println("Must provide input filename arg...exiting"); System.exit(1); } String fileName=args[0]; @@ -43,238 +41,317 @@ public static void main (String[] args) { try { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); while ((s = in.readLine()) != null && s.length() != 0) { - String[] split = s.split("\\s+"); - String chr = split[0]; - int start = Integer.parseInt(split[1]); - String ref = split[2].toUpperCase(); - String alt = split[3].toUpperCase(); - String type = split[4]; - //check if current line is next variant - boolean nextVariant = false; - if (!(chr.equals(prevID[0])&&start==Integer.parseInt(prevID[1]))) { - nextVariant = true; - } else - if (!(ref.equals(prevID[2])&&alt.equals(prevID[3]))) { - nextVariant = true; - } else - if (chr.equals("xxx")) { - nextVariant = true; - } - //print scores of previous variant, if current line is next variant and previous line wasn't a gene annotation - if(nextVariant && !prevID[4].equals("GENE")) { - //update overlapping genes and splice sites - geneID = updateOverlappingGenes(geneID, prevID); - String withinSS = checkForOverlappingSpliceSite(prevID, donNames, accNames, donStart, accStart, donEnd, accEnd); - //format output columns - String[] out = new String[21]; - out[0] = prevID[0]; out[1] = Integer.toString(Integer.parseInt(prevID[1])); - out[3] = prevID[2]; out[4] = prevID[3]; out[5] = geneID[1]; out[6] = withinSS; - //adjust end value for indels - int endPos = Integer.parseInt(prevID[1])+prevID[2].length()-1; - if (prevID[3].equals("*")) { - endPos++; - } - out[2]= Integer.toString(endPos); - //fill scores - for (int i=0; i<12; i++) { - if (scores[i]==-99 || scores[i]==0) { - out[i+7]="."; - } else { - out[i+7]=Double.toString(scores[i]); + String[] split = s.split("\\s+"); + String chr = split[0]; + int start = Integer.parseInt(split[1]); + String ref = split[2].toUpperCase(); + String alt = split[3].toUpperCase(); + String type = split[4]; + //check if current line is next variant + boolean nextVariant = false; + if (!(chr.equals(prevID[0])&&start==Integer.parseInt(prevID[1]))) { + nextVariant = true; + } else + if (!(ref.equals(prevID[2])&&alt.equals(prevID[3]))) { + nextVariant = true; + } else + if (chr.equals("xxx")) { + nextVariant = true; + } + //print scores of previous variant, if current line is next variant and previous line wasn't a gene annotation + if(nextVariant && !prevID[4].equals("GENE")) { + //update overlapping genes and splice sites + geneID = updateOverlappingGenes(geneID, prevID); + String withinSS = checkForOverlappingSpliceSite(prevID, donNames, accNames, donStart, accStart, donEnd, accEnd); + //format output columns + String[] out = new String[23]; + out[0] = prevID[0]; out[1] = Integer.toString(Integer.parseInt(prevID[1])); + out[3] = prevID[2]; out[4] = prevID[3]; out[5] = geneID[1]; out[6] = withinSS; + //adjust end value for indels + int endPos = Integer.parseInt(prevID[1])+prevID[2].length()-1; + if (prevID[3].equals("*")) { + endPos++; } - } - //calculate donor/acceptor creating logistic regression scores - String[] lrScores = calculateLogRegScores(scores).split("\\s+"); - if (!out[6].contains("ENSE")) { - out[19]= lrScores[0]; + out[2]= Integer.toString(endPos); + //fill scores + for (int i=0; i<12; i++) { + if (scores[i]==-99 || scores[i]==0) { + out[i+7]="."; + } else { + out[i+7]=Double.toString(scores[i]); + } + } + //calculate donor/acceptor creating logistic regression scores + String[] lrScores = calculateLogRegScores(scores, out[6]).split("\\s+"); + out[19]= lrScores[0]; out[20]= lrScores[1]; - } else { - out[19]= "."; - out[20]= "."; + out[21]= lrScores[2]; + out[22]= lrScores[3]; + + //add output line to file buffers + outputVariantToBuffers(out); + //reset scores + Arrays.fill(scores, -99.0); + //append to file and empty buffers every ~25000 lines + if (avBufferIndex > 25000) { + appendToFiles(fileName); + resetOutputArrays(); + } } - //add output line to file buffers - outputVariantToBuffers(out); - //reset scores - Arrays.fill(scores, -99.0); - //append to file and empty buffers every ~25000 lines - if (avBufferIndex > 25000) { - appendToFiles(fileName); - resetOutputArrays(); - } - } - //MaxEntScan - if (type.equals("MESDON")) { - scores[0]=Double.parseDouble(split[5]); - scores[1]=Double.parseDouble(split[6]); - } else - if(type.equals("MESACC")){ - scores[2]=Double.parseDouble(split[5]); - scores[3]=Double.parseDouble(split[6]); - } else - //GeneSplicer - if (type.equals("GSREF")) { - if (split[6].equals("donor")) { - scores[4]=Double.parseDouble(split[5]); - } else - if (split[6].equals("acceptor")) { - scores[6]=Double.parseDouble(split[5]); - } - } else - if (type.equals("GSALT")) { - if (split[6].equals("donor")) { - scores[5]=Double.parseDouble(split[5]); - } else - if (split[6].equals("acceptor")) { - scores[7]=Double.parseDouble(split[5]); - } - } else - //ESR - if (type.equals("ESR")) { - scores[8]=Double.parseDouble(split[5]); - scores[9]=Double.parseDouble(split[6]); - scores[10]=Double.parseDouble(split[7]); - scores[11]=Double.parseDouble(split[8]); - } - //GENE - if (type.equals("GENE")) { - //overlapping gene annotations - if (ifGenesOverlap(geneID, chr, start)) { - geneID[1] = geneID[1].concat(";").concat(split[3]); - geneID[2] = geneID[2].concat(";").concat(split[2]); - geneID[0] = split[0]; - if (split.length>11) { - //update splice site pos/name arrays with info from new overlapping gene - String[] donStartStr=split[6].split(","); - String[] donEndStr=split[7].split(","); - String[] accStartStr=split[8].split(","); - String[] accEndStr=split[9].split(","); - String[] donNamesStr=split[10].split(","); - String[] accNamesStr=split[11].split(","); - int accIndex = 0; - int donIndex = 0; - while (donStart[donIndex]>0) { - donIndex++; - } - while (accStart[accIndex]>0) { - accIndex++; - } - for (int k = 0; k < donStartStr.length; k++) { - donStart[donIndex+k] = Integer.parseInt(donStartStr[k]); - donEnd[donIndex+k] = Integer.parseInt(donEndStr[k]); - donNames[donIndex+k] = donNamesStr[k]; - } - for (int k = 0; k < accStartStr.length; k++) { - accStart[accIndex+k] = Integer.parseInt(accStartStr[k]); - accEnd[accIndex+k] = Integer.parseInt(accEndStr[k]); - accNames[accIndex+k] = accNamesStr[k]; + //MaxEntScan + if (type.equals("MESDON")) { + scores[0]=Double.parseDouble(split[5]); + scores[1]=Double.parseDouble(split[6]); + } else + if(type.equals("MESACC")){ + scores[2]=Double.parseDouble(split[5]); + scores[3]=Double.parseDouble(split[6]); + } else + //GeneSplicer + if (type.equals("GSREF")) { + if (split[6].equals("donor")) { + scores[4]=Double.parseDouble(split[5]); + } else + if (split[6].equals("acceptor")) { + scores[6]=Double.parseDouble(split[5]); + } + } else + if (type.equals("GSALT")) { + if (split[6].equals("donor")) { + scores[5]=Double.parseDouble(split[5]); + } else + if (split[6].equals("acceptor")) { + scores[7]=Double.parseDouble(split[5]); + } + } else + //ESR + if (type.equals("ESR")) { + scores[8]=Double.parseDouble(split[5]); + scores[9]=Double.parseDouble(split[6]); + scores[10]=Double.parseDouble(split[7]); + scores[11]=Double.parseDouble(split[8]); + } + //GENE + if (type.equals("GENE")) { + //overlapping gene annotations + if (ifGenesOverlap(geneID, chr, start)) { + geneID[1] = geneID[1].concat(";").concat(split[3]); + geneID[2] = geneID[2].concat(";").concat(split[2]); + geneID[0] = split[0]; + if (split.length>11) { + //update splice site pos/name arrays with info from new overlapping gene + String[] donStartStr=split[6].split(","); + String[] donEndStr=split[7].split(","); + String[] accStartStr=split[8].split(","); + String[] accEndStr=split[9].split(","); + String[] donNamesStr=split[10].split(","); + String[] accNamesStr=split[11].split(","); + int accIndex = 0; + int donIndex = 0; + while (donStart[donIndex]>0) { + donIndex++; + } + while (accStart[accIndex]>0) { + accIndex++; + } + for (int k = 0; k < donStartStr.length; k++) { + donStart[donIndex+k] = Integer.parseInt(donStartStr[k]); + donEnd[donIndex+k] = Integer.parseInt(donEndStr[k]); + donNames[donIndex+k] = donNamesStr[k]; + } + for (int k = 0; k < accStartStr.length; k++) { + accStart[accIndex+k] = Integer.parseInt(accStartStr[k]); + accEnd[accIndex+k] = Integer.parseInt(accEndStr[k]); + accNames[accIndex+k] = accNamesStr[k]; + } + } + //non-overlapping genes + } else { + //reset arrays + Arrays.fill(donStart, 0); + Arrays.fill(donEnd, 0); + Arrays.fill(accStart, 0); + Arrays.fill(accEnd, 0); + Arrays.fill(donNames, null); + Arrays.fill(accNames, null); + geneID[1] = split[3]; + geneID[2] = split[2]; + geneID[0] = split[0]; + if (split.length>11) { + //reset and update splice site pos/name arrays + String[] donStartStr=split[6].split(","); + String[] donEndStr=split[7].split(","); + String[] accStartStr=split[8].split(","); + String[] accEndStr=split[9].split(","); + String[] donNamesStr=split[10].split(","); + String[] accNamesStr=split[11].split(","); + for (int k=0; k11) { - //reset and update splice site pos/name arrays - String[] donStartStr=split[6].split(","); - String[] donEndStr=split[7].split(","); - String[] accStartStr=split[8].split(","); - String[] accEndStr=split[9].split(","); - String[] donNamesStr=split[10].split(","); - String[] accNamesStr=split[11].split(","); - for (int k=0; k 0) { + appendToFiles(fileName); + } + } catch (Exception e) { + e.printStackTrace(); } - //final append to file - if (avBufferIndex > 0) { - appendToFiles(fileName); - } -} catch (Exception e) { - e.printStackTrace(); + } + /* + public static String calculateLogRegScores (double[] s) { +//impute missing values +//maxEntScan +for (int i=0; i<4; i++) { +if (s[i]==-99.0 | s[i]==0.0) { +s[i] = -20.0; +} +} +//gsDonRef +if (s[4]==-99.0 | s[4]==0.0) { +if (s[5]==-99.0 | s[5]==0.0) { +s[4]=0.0; +s[5]=0.0; +} else { +s[4]=-3.0; } +} else +//gsDonAlt +if (s[7]==-99.0 | s[7]==0.0) { +s[7]=-3.0; +} +//gsAccRef +if (s[6]==-99.0 | s[6]==0.0) { +if (s[7]==-99.0 | s[7]==0.0) { +s[6]=0.0; +s[7]=0.0; +} else { +s[6]=-3.0; +} +} else +//gsAccAlt +if (s[7]==-99.0 | s[7]==0.0) { +s[7]=-3.0; +} +double mesDonChange = s[1] - s[0]; +double mesAccChange = s[3] - s[2]; +double gsDonChange = s[5] - s[4]; +double gsAccChange = s[7] - s[6]; +// calculate p = 1/(1+e^-(a + b1X1 + b2X2 + ... + bnXn)) +double pDon = 1/(1 + Math.exp(-(-0.9865 + (mesDonChange * 0.1129) + (gsDonChange * 0.01151) + (s[1] * 0.2076) + (s[5] * 0.4350) ))); +double pAcc = 1/(1 + Math.exp(-(-1.665 + (mesAccChange * 0.3323) + (gsAccChange * 0.05084) + (s[3] * 0.1877) + (s[7] * 0.1730) ))); +String pDonStr = Double.toString(pDon); +String pAccStr = Double.toString(pAcc); +//round scores to 2 decimal places +try { +pDonStr = String.format("%.02f",pDon); +pAccStr = String.format("%.02f",pAcc); +} catch (Exception e) {} +String ret = pDonStr + "\t" + pAccStr; +return ret; +} + */ + +public static String calculateLogRegScores (double[] s, String withinSS) { + //impute missing values + //maxEntScan + for (int i=0; i<4; i++) { + if (s[i]==-99.0 | s[i]==0.0) { + s[i] = -20.0; + } } - - public static String calculateLogRegScores (double[] s) { - //impute missing values - //maxEntScan - for (int i=0; i<4; i++) { - if (s[i]==-99.0 | s[i]==0.0) { - s[i] = -20.0; - } - } - //gsDonRef - if (s[4]==-99.0 | s[4]==0.0) { - if (s[5]==-99.0 | s[5]==0.0) { - s[4]=0.0; - s[5]=0.0; - } else { - s[4]=-3.0; - } - } else + //gsDonRef + if (s[4]==-99.0 | s[4]==0.0) { + if (s[5]==-99.0 | s[5]==0.0) { + s[4]=0.0; + s[5]=0.0; + } else { + s[4]=-3.0; + } + } else //gsDonAlt if (s[7]==-99.0 | s[7]==0.0) { s[7]=-3.0; } - //gsAccRef - if (s[6]==-99.0 | s[6]==0.0) { - if (s[7]==-99.0 | s[7]==0.0) { - s[6]=0.0; - s[7]=0.0; - } else { - s[6]=-3.0; - } - } else + //gsAccRef + if (s[6]==-99.0 | s[6]==0.0) { + if (s[7]==-99.0 | s[7]==0.0) { + s[6]=0.0; + s[7]=0.0; + } else { + s[6]=-3.0; + } + } else //gsAccAlt if (s[7]==-99.0 | s[7]==0.0) { s[7]=-3.0; } - double mesDonChange = s[1] - s[0]; - double mesAccChange = s[3] - s[2]; - double gsDonChange = s[5] - s[4]; - double gsAccChange = s[7] - s[6]; - // calculate p = 1/(1+e^-(a + b1X1 + b2X2 + ... + bnXn)) - double pDon = 1/(1 + Math.exp(-(-0.9865 + (mesDonChange * 0.1129) + (gsDonChange * 0.01151) + (s[1] * 0.2076) + (s[5] * 0.4350) ))); - double pAcc = 1/(1 + Math.exp(-(-1.665 + (mesAccChange * 0.3323) + (gsAccChange * 0.05084) + (s[3] * 0.1877) + (s[7] * 0.1730) ))); - String pDonStr = Double.toString(pDon); - String pAccStr = Double.toString(pAcc); - //round scores to 2 decimal places - try { - pDonStr = String.format("%.02f",pDon); - pAccStr = String.format("%.02f",pAcc); - } catch (Exception e) {} - String ret = pDonStr + "\t" + pAccStr; - return ret; + double mesDonChange = s[1] - s[0]; + double mesAccChange = s[3] - s[2]; + double gsDonChange = s[5] - s[4]; + double gsAccChange = s[7] - s[6]; + double pDonGain = -1; double pAccGain = -1; double pDonLoss = -1; double pAccLoss = -1; + // using within/outside splice site logistic regression models, calculate p = 1/(1+e^-(a + b1X1 + b2X2 + ... + bnXn)) + if (!withinSS.contains("ENSE")) { + pDonGain = 1/(1 + Math.exp(-(-0.9865 + (mesDonChange * 0.1129) + (gsDonChange * 0.01151) + (s[1] * 0.2076) + (s[5] * 0.4350) ))); + pAccGain = 1/(1 + Math.exp(-(-1.665 + (mesAccChange * 0.3323) + (gsAccChange * 0.05084) + (s[3] * 0.1877) + (s[7] * 0.1730) ))); + } + if (withinSS.contains("donor")){ + //pDonLoss = 1/(1 + Math.exp(-(-1.6678 + (mesDonChange * -0.6752) ))); + pDonLoss = 1/(1 + Math.exp(-(-1.1028 + (mesDonChange * -0.4240) ))); + } + if (withinSS.contains("acceptor")){ + //pAccLoss = 1/(1 + Math.exp(-(-0.6209 + (mesAccChange * -0.5100) ))); + pAccLoss = 1/(1 + Math.exp(-(-0.2355 + (mesAccChange * -0.2283) ))); + } + String pDonGainStr = Double.toString(pDonGain); + String pAccGainStr = Double.toString(pAccGain); + String pDonLossStr = Double.toString(pDonLoss); + String pAccLossStr = Double.toString(pAccLoss); + //round scores to 2 decimal places + try { + pDonGainStr = String.format("%.02f",pDonGain); + pAccGainStr = String.format("%.02f",pAccGain); + pDonLossStr = String.format("%.02f",pDonLoss); + pAccLossStr = String.format("%.02f",pAccLoss); + } catch (Exception e) { + //use unrounded score + } + if (pDonGainStr.equals("-1.00")) { + pDonGainStr = "."; + } + if (pAccGainStr.equals("-1.00")) { + pAccGainStr = "."; } + if (pDonLossStr.equals("-1.00")) { + pDonLossStr = "."; + } + if (pAccLossStr.equals("-1.00")) { + pAccLossStr = "."; + } + String ret = pDonGainStr + "\t" + pAccGainStr + "\t" + pDonLossStr + "\t" + pAccLossStr; + return ret; +} public static void appendToFiles (String fileName) { String annovar_file = "output/"+fileName+"_out.txt"; - String acc_fileUnsorted = "temp/"+fileName+"_acceptorCreating_unsorted.txt"; - String don_fileUnsorted = "temp/"+fileName+"_donorCreating_unsorted.txt"; - String ss_fileUnsorted = "temp/"+fileName+"_withinSS_unsorted.txt"; + String gain_file_unsorted = "temp/"+fileName+"_gain_unsorted.txt"; + String ss_file_unsorted = "temp/"+fileName+"_loss_unsorted.txt"; try { //write to _out FileWriter fwAV = new FileWriter(annovar_file, true); @@ -283,22 +360,15 @@ public static void appendToFiles (String fileName) { avWriter.write(avBuffer[i]+"\n"); } avWriter.close(); - //write to _donorCreating - FileWriter fwDon = new FileWriter(don_fileUnsorted, true); - BufferedWriter donWriter = new BufferedWriter(fwDon); - for (int i=0; i=donStart[i]&&startPos<=donEnd[i]) { if (!withinSS.equals("")) { - withinSS=withinSS.concat(","); + withinSS=withinSS.concat(","); } - withinSS = withinSS.concat(donNames[i]).concat("_donor"); + withinSS = withinSS.concat(donNames[i]).concat("_donor"); } } } @@ -413,98 +471,91 @@ public static String checkForOverlappingSpliceSite( String[] prevID, String[] do withinSS="."; } return withinSS; - } - - public static void outputVariantToBuffers(String[] out) { +} + +public static void outputVariantToBuffers(String[] out) { //write full line for annovar String avLine = ""; - for (int i=0; i<20; i++) { + for (int i=0; i<22; i++) { avLine = avLine+out[i]+"\t"; } - avLine = avLine+out[20]; + avLine = avLine+out[22]; avBuffer[avBufferIndex] = avLine; avBufferIndex++; //write withinSS if (out[6].contains("ENSE")) { - String ssLine = ""; - for (int i=0; i<16; i++) { - ssLine = ssLine+out[i]+"\t"; - } - double maxScoreDecrease = -99; - //if donorSS - if (out[6].contains("_donor")) { - if (!out[7].equals(".") && !out[8].equals(".")) { - maxScoreDecrease = Double.parseDouble(out[7]) - Double.parseDouble(out[8]); + String ssLine = out[0]+"\t"+out[1]+"\t"+out[3]+"\t"+out[4]+"\t"+out[5]+"\t"+out[6]+"\t"+out[21]+"\t"+out[22]; + Double lossMax = 0.00 ; + try { + if (!out[21].equals(".")) { + lossMax = Double.parseDouble(out[21]); } - } - //if acceptorSS - if (out[6].contains("_acceptor")) { - if (!out[9].equals(".") && !out[10].equals(".")) { - if (maxScoreDecrease < Double.parseDouble(out[9]) - Double.parseDouble(out[10]) ) { - maxScoreDecrease = Double.parseDouble(out[9]) - Double.parseDouble(out[10]); + if (!out[22].equals(".")) { + if (lossMax < Double.parseDouble(out[22])) { + lossMax = Double.parseDouble(out[22]); } } - } - ssLine = ssLine+maxScoreDecrease; + } catch (Exception e) {} + ssLine = ssLine+"\t"+lossMax; ssBuffer[ssBufferIndex] = ssLine; ssBufferIndex++; } else { - //write donCreating - if (Double.parseDouble(out[19])>=0.7) { - String donLine = ""; - for (int i=0; i<6; i++) { - donLine= donLine+out[i]+"\t"; + //write gain + String gainLine = out[0]+"\t"+out[1]+"\t"+out[3]+"\t"+out[4]+"\t"+out[5]+"\t"+out[19]+"\t"+out[20]; + Double gainMax = 0.00 ; + try { + if (!out[19].equals(".")) { + gainMax = Double.parseDouble(out[19]); } - donLine = donLine+out[7]+"\t"+out[8]+"\t"+out[11]+"\t"+out[12]+"\t"+out[19]; - donBuffer[donBufferIndex] = donLine; - donBufferIndex++; - } - //write accCreating - if (Double.parseDouble(out[20])>=0.7) { - String accLine = ""; - for (int i=0; i<6; i++) { - accLine= accLine+out[i]+"\t"; + if (!out[20].equals(".")) { + if (gainMax < Double.parseDouble(out[20])) { + gainMax = Double.parseDouble(out[20]); + } } - accLine = accLine+out[9]+"\t"+out[10]+"\t"+out[13]+"\t"+out[14]+"\t"+out[20]; - accBuffer[accBufferIndex] = accLine; - accBufferIndex++; + } catch (Exception e) {} + if (gainMax>=0.7) { + gainLine = gainLine+"\t"+Double.toString(gainMax); + gainBuffer[gainBufferIndex] = gainLine; + gainBufferIndex++; } } } - - public static boolean ifGenesOverlap(String[] geneID, String chr, int start) { - //check for overlapping genes - int overlapTest = 0; - //check individually for already overlapping genes - if (geneID[1].concat(geneID[2]).contains(";")) { - String[] geneEndSplit = geneID[2].split(";"); - String[] geneNameSplit = geneID[1].split(";"); - String updatedGeneName = ""; - String updatedGeneEnd = ""; - int geneIndex = Math.min(geneEndSplit.length, geneNameSplit.length); - for (int i=0; i= start) { - overlapTest = Integer.parseInt(geneEndSplit[i]); - if (!updatedGeneName.equals("")) { - updatedGeneName = updatedGeneName.concat(";"); - updatedGeneEnd = updatedGeneEnd.concat(";"); - } - updatedGeneName = updatedGeneName.concat(geneNameSplit[i]); - updatedGeneEnd = updatedGeneEnd.concat(geneEndSplit[i]); +// avWriter.write("#CHR\tSTART\tEND\tREF\tALT\tGENE\twithinSite\tmesDonRef\tmesDonAlt\tmesAccRef\tmesAccAlt\tgsDonRef\tgsDonAlt\tgsAccRef\tgsAccAlt\tESEmaxRef\tESEmaxAlt\tESSminRef\tESSminAlt\tdonGainP\taccGainP\tdonLossP\taccLossP"+"\n"); +// String ret = pDonGainStr + "\t" + pAccGainStr + "\t" + pDonLossStr + "\t" + pAccLossStr; + +public static boolean ifGenesOverlap(String[] geneID, String chr, int start) { + //check for overlapping genes + int overlapTest = 0; + //check individually for already overlapping genes + if (geneID[1].concat(geneID[2]).contains(";")) { + String[] geneEndSplit = geneID[2].split(";"); + String[] geneNameSplit = geneID[1].split(";"); + String updatedGeneName = ""; + String updatedGeneEnd = ""; + int geneIndex = Math.min(geneEndSplit.length, geneNameSplit.length); + for (int i=0; i= start) { + overlapTest = Integer.parseInt(geneEndSplit[i]); + if (!updatedGeneName.equals("")) { + updatedGeneName = updatedGeneName.concat(";"); + updatedGeneEnd = updatedGeneEnd.concat(";"); } + updatedGeneName = updatedGeneName.concat(geneNameSplit[i]); + updatedGeneEnd = updatedGeneEnd.concat(geneEndSplit[i]); } - geneID[1] = updatedGeneName; - geneID[2] = updatedGeneEnd; - } else if (geneID[2].equals(".")) { - overlapTest = 0; - geneID[2]="0"; - } else if (!geneID[2].equals("")) { - overlapTest = Integer.parseInt(geneID[2]); } - if (start <= overlapTest && geneID[0].equals(chr)) { - return true; - } - return false; - } + geneID[1] = updatedGeneName; + geneID[2] = updatedGeneEnd; + } else if (geneID[2].equals(".")) { + overlapTest = 0; + geneID[2]="0"; + } else if (!geneID[2].equals("")) { + overlapTest = Integer.parseInt(geneID[2]); + } + if (start <= overlapTest && geneID[0].equals(chr)) { + return true; + } + return false; +} } diff --git a/bin/mergeOutput2.java b/bin/mergeOutput2.java new file mode 100644 index 0000000..5e78649 --- /dev/null +++ b/bin/mergeOutput2.java @@ -0,0 +1,435 @@ +//Description: Merging GeneSplicer, MaxEntScan and ESRseq scores along with annotation information to be outputted together on one line per variant. +import java.io.*; +import java.util.Arrays; +import java.util.jar.Attributes.Name; +import java.lang.*; +import java.text.DecimalFormat; +public class mergeOutput2 { + + //store output lines in buffers to minimise I/O + public static String[] avBuffer = new String[30000]; + public static int avBufferIndex = 0; + + public static void main (String[] args) { + if (args.length < 1) { + System.out.println("must provide vcf/bed file name as command line argument"); + System.exit(1); + } + String fileName=args[0]; + writeHeaders(fileName); + //initialise score tracking variables + int[] donStart = new int[10000]; + int[] donEnd = new int[10000]; + int[] accStart = new int[10000]; + int[] accEnd = new int[10000]; + String[] donNames = new String[10000]; + String[] accNames = new String[10000]; + //Note array elements: + //scores[]...mesDonRef(0);mesDonAlt(1);mesAccRef(2);mesAccAlt(3);gsDonRef(4);gsDonAlt(5);gsAccRef(6);gsAccAlt(7);ESEmaxRef(8);ESEmaxAlt(9);ESSminRef(10);ESSminAlt(11); + //geneID[]...chr(0), name(1), end(2) + //prevID[]...chr(0), start(1), ref(2), alt(3), type(4) + double[] scores = new double[12]; + Arrays.fill(scores, -99.0); + String[] geneID = { "", ".", ""}; + String[] prevID = { "", "-99", "", "", "GENE"}; + String s = ""; + //process lines + try { + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + while ((s = in.readLine()) != null && s.length() != 0) { + String[] split = s.split("\\s+"); + String chr = split[0]; + int start = Integer.parseInt(split[1]); + String ref = split[2].toUpperCase(); + String alt = split[3].toUpperCase(); + String type = split[4]; + //check if current line is next variant + boolean nextVariant = false; + if (!(chr.equals(prevID[0])&&start==Integer.parseInt(prevID[1]))) { + nextVariant = true; + } else + if (!(ref.equals(prevID[2])&&alt.equals(prevID[3]))) { + nextVariant = true; + } else + if (chr.equals("xxx")) { + nextVariant = true; + } + //print scores of previous variant, if current line is next variant and previous line wasn't a gene annotation + if(nextVariant && !prevID[4].equals("GENE")) { + //update overlapping genes and splice sites + geneID = updateOverlappingGenes(geneID, prevID); + String withinSS = checkForOverlappingSpliceSite(prevID, donNames, accNames, donStart, accStart, donEnd, accEnd); + //format output columns + String[] out = new String[23]; + out[0] = prevID[0]; out[1] = Integer.toString(Integer.parseInt(prevID[1])); + out[3] = prevID[2]; out[4] = prevID[3]; out[5] = geneID[1]; out[6] = withinSS; + //adjust end value for indels + int endPos = Integer.parseInt(prevID[1])+prevID[2].length()-1; + if (prevID[3].equals("*")) { + endPos++; + } + out[2]= Integer.toString(endPos); + //fill scores + for (int i=0; i<12; i++) { + if (scores[i]==-99 || scores[i]==0) { + out[i+7]="."; + } else { + out[i+7]=Double.toString(scores[i]); + } + } + //calculate donor/acceptor creating logistic regression scores + String[] lrScores = calculateLogRegScores(scores, out[6]).split("\\s+"); + out[19]= lrScores[0]; + out[20]= lrScores[1]; + out[21]= lrScores[2]; + out[22]= lrScores[3]; + //add output line to file buffers + outputVariantToBuffers(out); + //reset scores + Arrays.fill(scores, -99.0); + //append to file and empty buffers every ~25000 lines + if (avBufferIndex > 25000) { + appendToFiles(fileName); + resetOutputArrays(); + } + } + //MaxEntScan + if (type.equals("MESDON")) { + scores[0]=Double.parseDouble(split[5]); + scores[1]=Double.parseDouble(split[6]); + } else + if(type.equals("MESACC")){ + scores[2]=Double.parseDouble(split[5]); + scores[3]=Double.parseDouble(split[6]); + } else + //GeneSplicer + if (type.equals("GSREF")) { + if (split[6].equals("donor")) { + scores[4]=Double.parseDouble(split[5]); + } else + if (split[6].equals("acceptor")) { + scores[6]=Double.parseDouble(split[5]); + } + } else + if (type.equals("GSALT")) { + if (split[6].equals("donor")) { + scores[5]=Double.parseDouble(split[5]); + } else + if (split[6].equals("acceptor")) { + scores[7]=Double.parseDouble(split[5]); + } + } else + //ESR + if (type.equals("ESR")) { + scores[8]=Double.parseDouble(split[5]); + scores[9]=Double.parseDouble(split[6]); + scores[10]=Double.parseDouble(split[7]); + scores[11]=Double.parseDouble(split[8]); + } + //GENE + if (type.equals("GENE")) { + //overlapping gene annotations + if (ifGenesOverlap(geneID, chr, start)) { + geneID[1] = geneID[1].concat(";").concat(split[3]); + geneID[2] = geneID[2].concat(";").concat(split[2]); + geneID[0] = split[0]; + if (split.length>11) { + //update splice site pos/name arrays with info from new overlapping gene + String[] donStartStr=split[6].split(","); + String[] donEndStr=split[7].split(","); + String[] accStartStr=split[8].split(","); + String[] accEndStr=split[9].split(","); + String[] donNamesStr=split[10].split(","); + String[] accNamesStr=split[11].split(","); + int accIndex = 0; + int donIndex = 0; + while (donStart[donIndex]>0) { + donIndex++; + } + while (accStart[accIndex]>0) { + accIndex++; + } + for (int k = 0; k < donStartStr.length; k++) { + donStart[donIndex+k] = Integer.parseInt(donStartStr[k]); + donEnd[donIndex+k] = Integer.parseInt(donEndStr[k]); + donNames[donIndex+k] = donNamesStr[k]; + } + for (int k = 0; k < accStartStr.length; k++) { + accStart[accIndex+k] = Integer.parseInt(accStartStr[k]); + accEnd[accIndex+k] = Integer.parseInt(accEndStr[k]); + accNames[accIndex+k] = accNamesStr[k]; + } + } + //non-overlapping genes + } else { + //reset arrays + Arrays.fill(donStart, 0); + Arrays.fill(donEnd, 0); + Arrays.fill(accStart, 0); + Arrays.fill(accEnd, 0); + Arrays.fill(donNames, null); + Arrays.fill(accNames, null); + geneID[1] = split[3]; + geneID[2] = split[2]; + geneID[0] = split[0]; + if (split.length>11) { + //reset and update splice site pos/name arrays + String[] donStartStr=split[6].split(","); + String[] donEndStr=split[7].split(","); + String[] accStartStr=split[8].split(","); + String[] accEndStr=split[9].split(","); + String[] donNamesStr=split[10].split(","); + String[] accNamesStr=split[11].split(","); + for (int k=0; k 0) { + appendToFiles(fileName); + } + } catch (Exception e) { + e.printStackTrace(); + } + } + + public static String calculateLogRegScores (double[] s, String withinSS) { + //impute missing values + //maxEntScan + for (int i=0; i<4; i++) { + if (s[i]==-99.0 | s[i]==0.0) { + s[i] = -20.0; + } + } + //gsDonRef + if (s[4]==-99.0 | s[4]==0.0) { + if (s[5]==-99.0 | s[5]==0.0) { + s[4]=0.0; + s[5]=0.0; + } else { + s[4]=-3.0; + } + } else + //gsDonAlt + if (s[7]==-99.0 | s[7]==0.0) { + s[7]=-3.0; + } + //gsAccRef + if (s[6]==-99.0 | s[6]==0.0) { + if (s[7]==-99.0 | s[7]==0.0) { + s[6]=0.0; + s[7]=0.0; + } else { + s[6]=-3.0; + } + } else + //gsAccAlt + if (s[7]==-99.0 | s[7]==0.0) { + s[7]=-3.0; + } + double mesDonChange = s[1] - s[0]; + double mesAccChange = s[3] - s[2]; + double gsDonChange = s[5] - s[4]; + double gsAccChange = s[7] - s[6]; + double pDonGain = -1; double pAccGain = -1; double pDonLoss = -1; double pAccLoss = -1; + // using within/outside splice site logistic regression models, calculate p = 1/(1+e^-(a + b1X1 + b2X2 + ... + bnXn)) + if (!withinSS.contains("ENSE")) { + pDonGain = 1/(1 + Math.exp(-(-0.9865 + (mesDonChange * 0.1129) + (gsDonChange * 0.01151) + (s[1] * 0.2076) + (s[5] * 0.4350) ))); + pAccGain = 1/(1 + Math.exp(-(-1.665 + (mesAccChange * 0.3323) + (gsAccChange * 0.05084) + (s[3] * 0.1877) + (s[7] * 0.1730) ))); + } + if (withinSS.contains("donor")){ + //pDonLoss = 1/(1 + Math.exp(-(-1.6678 + (mesDonChange * -0.6752) ))); + pDonLoss = 1/(1 + Math.exp(-(-1.1028 + (mesDonChange * -0.4240) ))); + } + if (withinSS.contains("acceptor")){ + //pAccLoss = 1/(1 + Math.exp(-(-0.6209 + (mesAccChange * -0.5100) ))); + pAccLoss = 1/(1 + Math.exp(-(-0.2355 + (mesAccChange * -0.2283) ))); + } + String pDonGainStr = Double.toString(pDonGain); + String pAccGainStr = Double.toString(pAccGain); + String pDonLossStr = Double.toString(pDonLoss); + String pAccLossStr = Double.toString(pAccLoss); + //round scores to 2 decimal places + try { + pDonGainStr = String.format("%.02f",pDonGain); + pAccGainStr = String.format("%.02f",pAccGain); + pDonLossStr = String.format("%.02f",pDonLoss); + pAccLossStr = String.format("%.02f",pAccLoss); + } catch (Exception e) { + //use unrounded score + } + if (pDonGainStr.equals("-1.00")) { + pDonGainStr = "."; + } + if (pAccGainStr.equals("-1.00")) { + pAccGainStr = "."; + } + if (pDonLossStr.equals("-1.00")) { + pDonLossStr = "."; + } + if (pAccLossStr.equals("-1.00")) { + pAccLossStr = "."; + } + String ret = pDonGainStr + "\t" + pAccGainStr + "\t" + pDonLossStr + "\t" + pAccLossStr; + return ret; + } + + public static void appendToFiles (String fileName) { + String annovar_file = "output/"+fileName+"_out.txt"; + try { + //write to _out + FileWriter fwAV = new FileWriter(annovar_file, true); + BufferedWriter avWriter = new BufferedWriter(fwAV); + for (int i=0; i= Integer.parseInt(prevID[1])) { + if (!geneID[1].equals("")) { + geneID[1] = geneID[1].concat(";"); + geneID[2] = geneID[2].concat(";"); + } + geneID[1] = geneID[1].concat(geneNameSplit[i]); + geneID[2] = geneID[2].concat(geneEndSplit[i]); + } + } + } + if ( geneID[1].equals("") || !geneID[0].equals(prevID[0])) { + geneID[1]= "."; + geneID[2]= "."; + } + return geneID; + } + + public static String checkForOverlappingSpliceSite( String[] prevID, String[] donNames, String[] accNames, int[] donStart, int[] accStart, int[] donEnd, int[] accEnd) { + String withinSS=""; + int startPos = Integer.parseInt(prevID[1]); + int endPos = Integer.parseInt(prevID[1])+prevID[2].length()-1; + if (prevID[3].equals("*")) { + endPos++; + } + for (int i=0; i=donStart[i]&&startPos<=donEnd[i]) { + if (!withinSS.equals("")) { + withinSS=withinSS.concat(","); + } + withinSS = withinSS.concat(donNames[i]).concat("_donor"); + } + } + } + for (int i=0; i=accStart[i]&&startPos<=accEnd[i]) { + if (!withinSS.equals("")) { + withinSS=withinSS.concat(","); + } + withinSS = withinSS.concat(accNames[i]).concat("_acceptor"); + } + } + } + if (withinSS.equals("")) { + withinSS="."; + } + return withinSS; + } + + public static void outputVariantToBuffers(String[] out) { + //write full line for annovar + String avLine = out[0]+"\t"+out[1]+"\t"+out[3]+"\t"+out[4]+"\t"+out[5]+"\t"+out[6]+"\t"+out[15]+"\t"+out[16]+"\t"+out[17]+"\t"+out[18]+"\t"+out[19]+"\t"+out[20]+"\t"+out[21]+"\t"+out[22]; + avBuffer[avBufferIndex] = avLine; + avBufferIndex++; + } + + public static boolean ifGenesOverlap(String[] geneID, String chr, int start) { + //check for overlapping genes + int overlapTest = 0; + //check individually for already overlapping genes + if (geneID[1].concat(geneID[2]).contains(";")) { + String[] geneEndSplit = geneID[2].split(";"); + String[] geneNameSplit = geneID[1].split(";"); + String updatedGeneName = ""; + String updatedGeneEnd = ""; + int geneIndex = Math.min(geneEndSplit.length, geneNameSplit.length); + for (int i=0; i= start) { + overlapTest = Integer.parseInt(geneEndSplit[i]); + if (!updatedGeneName.equals("")) { + updatedGeneName = updatedGeneName.concat(";"); + updatedGeneEnd = updatedGeneEnd.concat(";"); + } + updatedGeneName = updatedGeneName.concat(geneNameSplit[i]); + updatedGeneEnd = updatedGeneEnd.concat(geneEndSplit[i]); + } + } + geneID[1] = updatedGeneName; + geneID[2] = updatedGeneEnd; + } else if (geneID[2].equals(".")) { + overlapTest = 0; + geneID[2]="0"; + } else if (!geneID[2].equals("")) { + overlapTest = Integer.parseInt(geneID[2]); + } + if (start <= overlapTest && geneID[0].equals(chr)) { + return true; + } + return false; + } + +} diff --git a/bin/refCheck.java b/bin/refCheck.java new file mode 100644 index 0000000..8bdf9a0 --- /dev/null +++ b/bin/refCheck.java @@ -0,0 +1,22 @@ +//Description: for generating bed intervals as input to bedtools getfasta +import java.io.*; + +class refCheck { + +public static void main(String[] args) { + try { + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String s = ""; + //process lines + while ((s = in.readLine()) != null) { + String[] split = s.split("\\s+"); + int prefixStart= Integer.parseInt(split[2]) - 91; + int suffixEnd = Integer.parseInt(split[2]) + 89 + split[4].length(); + System.out.println(split[1]+"\t"+Integer.toString(prefixStart)+"\t"+Integer.toString(suffixEnd)+"\t"+split[1]+";"+split[2]+";"+split[4]+";"+split[5]+";"+split[3]+";\t1\t"+split[3]); + } + in.close(); + } catch (Exception e) { + System.err.println("Error: " + e.getMessage()); + } +} +} diff --git a/bin/seqScan.class b/bin/seqScan.class index 31d48fa..cc44b90 100644 Binary files a/bin/seqScan.class and b/bin/seqScan.class differ diff --git a/bin/seqScan.java b/bin/seqScan.java index 60f19e4..a6001f7 100644 --- a/bin/seqScan.java +++ b/bin/seqScan.java @@ -18,464 +18,482 @@ class seqScan { public static double[] ESStable = createESRarrays("data/ESS.txt"); public static boolean useESR= true; public static String fileID=""; - public static boolean invalidFastaFound = false; public static boolean unsplitVCFfound = false; -public static void main(String[] args) { - - String header = ""; - String s = ""; - if (args.length < 3) { - System.out.println("must provide input file and -useESR/skipESR and fileID as command line argument"); - System.exit(1); - } - if (args[1]=="-skipESR") { - useESR = false; - } - fileID=args[2]; - try{ - fillESRtables(); - //open input file - FileInputStream fstream = new FileInputStream(args[0]); - DataInputStream in = new DataInputStream(fstream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - //process file 2 lines at a time - while ((header = br.readLine()) != null) { - s = br.readLine(); - try { - scanStrings(header, s, args[2]); + public static void main(String[] args) { + + String header = ""; + String s = ""; + if (args.length < 3) { + System.out.println("must provide input file and -useESR/skipESR and fileID as command line argument"); + System.exit(1); } - catch (Exception e){ - e.printStackTrace(); - } - } - in.close(); - }catch (Exception e){ - System.err.println("Error: " + e.getMessage()); + if (args[1]=="-skipESR") { + useESR = false; + } + fileID=args[2]; + try{ + fillESRtables(); + //open input file + FileInputStream fstream = new FileInputStream(args[0]); + DataInputStream in = new DataInputStream(fstream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + //process file 2 lines at a time + while ((header = br.readLine()) != null) { + s = br.readLine(); + try { + scanStrings(header, s, args[2]); + } + catch (Exception e){ + e.printStackTrace(); + } + } + in.close(); + }catch (Exception e){ + System.err.println("Error: " + e.getMessage()); + } + //write output to files + appendToFiles(); } - //write output to files - appendToFiles(); -} -public static void scanStrings (String header, String s, String fileID) { - /* SCANNING EXPLANATION - For MaxEntScan, a region of 45bp with variant at middle nucleotide 23 is needed (anywhere ref/alt can contribute to the motif). - Scan for acceptor AG from nucleotide pos 19 to 41, and for donor GT from pos 18 to 26 and generate query strings for scoring - The additional 68bp flanking sequence is necessary for GeneSplicer and filling out MaxEntScan alt strings shortened by deletions. - For indels, alt strings have a different length. Adjustment "correct1" is made so that only ref/alt equivalent positions are scanned. - correct1: only needed on ref string in event of deletions on antisense strand, because start value is sense oriented eg. GAT>GT is represented as 5'-GA>G-3' not 3'-TA>T-5'. - correct2: adjusts for empty alt strings, arising from deletions denoted for example as ref=A, alt=-or* - */ - int flank = 68; - int correct1 = 0; - s = s.toLowerCase(); - String id = header.substring(0, header.indexOf(":")); - String[] sep = id.split(";"); - String chr = sep[0].substring(1); - String start = sep[1]; - String ref = sep[2].toUpperCase(); - String alt = sep[3].toUpperCase(); - String strand = sep[4]; - String mesAdd = ""; - double maxRefESE = 0; - double maxAltESE = 0; - double minRefESS = 0; - double minAltESS = 0; - //ignore deletions larger than flanking region - if (ref.length()>35 || alt.length()>35) { - return; - } - if (strand.equals("+")) { - strand = "POS"; - } - //index correction for deletions notated as alt="*" - String refName = ref; - String altName = alt; - int correct2=0; - if (alt.equals("*")) { - alt=""; - correct2=1; - } - //catch unsplit VCFs (ie. multiple alt alleles represented on one comma separated line) - if (alt.contains(",")) { - if (!unsplitVCFfound) { - System.out.println("Note: unsplit VCF/BED record(s) were detected, ie. variants with multiple alt alleles represented on a single line. Such records are susceptible to false negatives, so we recommend splitting these records onto multiple lines"); - unsplitVCFfound = true; + public static void scanStrings (String header, String s, String fileID) { + /* SCANNING EXPLANATION + For MaxEntScan, a region of 45bp with variant at middle nucleotide 23 is needed (anywhere ref/alt can contribute to the motif). + Scan for acceptor AG from nucleotide pos 19 to 41, and for donor GT from pos 18 to 26 and generate query strings for scoring + The additional 68bp flanking sequence is necessary for GeneSplicer and filling out MaxEntScan alt strings shortened by deletions. + For indels, alt strings have a different length. Adjustment "correct1" is made so that only ref/alt equivalent positions are scanned. +correct1: only needed on ref string in event of deletions on antisense strand, because start value is sense oriented eg. GAT>GT is represented as 5'-GA>G-3' not 3'-TA>T-5'. +correct2: adjusts for empty alt strings, arising from deletions denoted for example as ref=A, alt=-or* + */ + int flank = 68; + int correct1 = 0; + s = s.toLowerCase(); + String id = header.substring(0, header.indexOf(":")); + String[] sep = id.split(";"); + String chr = sep[0].substring(1); + String start = sep[1]; + String ref = sep[2].toUpperCase(); + String alt = sep[3].toUpperCase(); + String strand = sep[4]; + String mesAdd = ""; + double maxRefESE = 0; + double maxAltESE = 0; + double minRefESS = 0; + double minAltESS = 0; + //ignore deletions larger than flanking region + if (ref.length()>35 || alt.length()>35) { + return; } - String[] splitAlt = alt.split(""); - alt = splitAlt[0]; - } - //if negative strand - if (strand.equals("-")) { - //reverse complement alt - id = id.concat("(-)"); - ref = getRC(ref); - alt = getRC(alt); - correct1 = ref.length()-1; - strand = "NEG"; - } - //generate alt string - String altSeq = s.substring(0, 22 + flank) + alt + s.substring(22 + flank + ref.length()); - altSeq = altSeq.toLowerCase(); - //genesplicer fasta strings - gsOutput[gsIndex]=">".concat(strand).concat(chr).concat(":").concat(start).concat(":").concat(refName).concat(":").concat(altName).concat(":GSREF"); - gsIndex++; - gsOutput[gsIndex]=s; - gsIndex++; - gsOutput[gsIndex]=">".concat(strand).concat(chr).concat(":").concat(start).concat(":").concat(refName).concat(":").concat(altName).concat(":GSALT"); - gsIndex++; - gsOutput[gsIndex]=altSeq; - gsIndex++; - //ESRseq - if(useESR) { - for (int i=17+flank; i<23+flank; i++) { - int refIndex = getESRindex(s.substring(i, i+6)); - int altIndex = getESRindex(altSeq.substring(i, i+6)); - if (ESEtable[refIndex]>maxRefESE) { - maxRefESE = ESEtable[refIndex]; - } - if (ESEtable[altIndex]>maxAltESE) { - maxAltESE = ESEtable[altIndex]; - } - if (ESStable[refIndex]1 || correct2==1) { - int j=1; - do { - refIndex = getESRindex(s.substring(i+j, i+j+6)); - if (ESEtable[refIndex]>maxRefESE) { - maxRefESE = ESEtable[refIndex]; - } - if (ESStable[refIndex]maxRefESE) { + maxRefESE = ESEtable[refIndex]; } - while (j1) { - int j=1; - do { - altIndex = getESRindex(altSeq.substring(i+j, i+j+6)); - if (ESEtable[altIndex]>maxAltESE) { - maxAltESE = ESEtable[altIndex]; + if (ESEtable[altIndex]>maxAltESE) { + maxAltESE = ESEtable[altIndex]; + } + if (ESStable[refIndex]1 || correct2==1) { + int j=1; + do { + refIndex = getESRindex(s.substring(i+j, i+j+6)); + if (ESEtable[refIndex]>maxRefESE) { + maxRefESE = ESEtable[refIndex]; + } + if (ESStable[refIndex]1) { + int j=1; + do { + altIndex = getESRindex(altSeq.substring(i+j, i+j+6)); + if (ESEtable[altIndex]>maxAltESE) { + maxAltESE = ESEtable[altIndex]; + } + if (ESStable[altIndex]1 || correct2==1) { - int j=1; - do { - mesOutputAcc[mesIndexAcc]=altSeq.substring(i-18-j, i+5-j).concat(id).concat("ACC").concat("ALT"); + //MaxEntScan + if (containsIllegalFastaChar(nTest, nTest.length())) { + appendOmmittedInvalidFasta(id); + } else { + //scan ref sequence + for (int i=17+flank-correct1; i<41+flank-correct1+ref.length()-1; i++) { + //if acceptor + if (s.substring(i, i+2).equals("ag") && i!=17+flank-correct1) { + //output ref and alt acceptor strings + mesOutputAcc[mesIndexAcc]=s.substring(i-18, i+5).concat(id).concat("ACC").concat("REF"); mesIndexAcc++; - j++; - } - while (j1) { - int j=1; - do { - mesOutputAcc[mesIndexAcc]=altSeq.substring(i-18+j, i+5+j).concat(id).concat("ACC").concat("ALT"); + mesOutputAcc[mesIndexAcc]=altSeq.substring(i-18, i+5).concat(id).concat("ACC").concat("ALT"); mesIndexAcc++; - j++; - } - while (j1 || correct2==1) { - int j=1; - do { - mesOutputDon[mesIndexDon]=altSeq.substring(i-3-j, i+6-j).concat(id).concat("DON").concat("ALT"); + //for indels, output offset alt acceptor string + if (ref.length()>1 || correct2==1) { + int j=1; + do { + mesOutputAcc[mesIndexAcc]=altSeq.substring(i-18-j, i+5-j).concat(id).concat("ACC").concat("ALT"); + mesIndexAcc++; + j++; + } + while (j1) { + int j=1; + do { + mesOutputAcc[mesIndexAcc]=altSeq.substring(i-18+j, i+5+j).concat(id).concat("ACC").concat("ALT"); + mesIndexAcc++; + j++; + } + while (j1) { - int j=1; - do { - mesOutputDon[mesIndexDon]=altSeq.substring(i-3+j, i+6+j).concat(id).concat("DON").concat("ALT"); + mesOutputDon[mesIndexDon]=altSeq.substring(i-3, i+6).concat(id).concat("DON").concat("ALT"); mesIndexDon++; - j++; - } - while (j1) { - int j=1; - do { - mesOutputAcc[mesIndexAcc]=s.substring(i-18-j, i+5-j).concat(id).concat("ACC").concat("REF"); - mesIndexAcc++; - j++; - } - while (j1 || correct2==1) { + int j=1; + do { + mesOutputDon[mesIndexDon]=altSeq.substring(i-3-j, i+6-j).concat(id).concat("DON").concat("ALT"); + mesIndexDon++; + j++; + } + while (j1) { + int j=1; + do { + mesOutputDon[mesIndexDon]=altSeq.substring(i-3+j, i+6+j).concat(id).concat("DON").concat("ALT"); + mesIndexDon++; + j++; + } + while (j1||correct2==1) { - int j=1; - do { - mesOutputAcc[mesIndexAcc]=s.substring(i-18+j, i+5+j).concat(id).concat("ACC").concat("REF"); + //scan alt sequence + for (int i=17+flank; i<41+flank+alt.length()-1; i++) { + //if acceptor + if (altSeq.substring(i, i+2).equals("ag") && i!=17+flank) { + //output ref and alt acceptor strings + mesOutputAcc[mesIndexAcc]=altSeq.substring(i-18, i+5).concat(id).concat("ACC").concat("ALT"); mesIndexAcc++; - j++; + mesOutputAcc[mesIndexAcc]=s.substring(i-18, i+5).concat(id).concat("ACC").concat("REF"); + mesIndexAcc++; + //for indels, output offset ref acceptor string + if (alt.length()>1) { + int j=1; + do { + mesOutputAcc[mesIndexAcc]=s.substring(i-18-j, i+5-j).concat(id).concat("ACC").concat("REF"); + mesIndexAcc++; + j++; + } + while (j1||correct2==1) { + int j=1; + do { + mesOutputAcc[mesIndexAcc]=s.substring(i-18+j, i+5+j).concat(id).concat("ACC").concat("REF"); + mesIndexAcc++; + j++; + } + while (j1) { - int j=1; - do { - mesOutputDon[mesIndexDon]=s.substring(i-3-j, i+6-j).concat(id).concat("DON").concat("REF"); + //if donor + if (altSeq.substring(i, i+2).equals("gt") && i<26+correct1+flank+alt.length()-1) { + //output ref and alt donor strings + mesOutputDon[mesIndexDon]=altSeq.substring(i-3, i+6).concat(id).concat("DON").concat("ALT"); mesIndexDon++; - j++; - } - while (j1 || correct2==1) { - int j=1; - do { - mesOutputDon[mesIndexDon]=s.substring(i-3+j, i+6+j).concat(id).concat("DON").concat("REF"); + mesOutputDon[mesIndexDon]=s.substring(i-3, i+6).concat(id).concat("DON").concat("REF"); mesIndexDon++; - j++; + //for indels, output offset ref donor string + if (alt.length()>1) { + int j=1; + do { + mesOutputDon[mesIndexDon]=s.substring(i-3-j, i+6-j).concat(id).concat("DON").concat("REF"); + mesIndexDon++; + j++; + } + while (j1 || correct2==1) { + int j=1; + do { + mesOutputDon[mesIndexDon]=s.substring(i-3+j, i+6+j).concat(id).concat("DON").concat("REF"); + mesIndexDon++; + j++; + } + while (j 25000 || mesIndexDon > 9000 ) { - appendToFiles(); - resetOutputArrays(); - } else - if (gsIndex > 13000 || ESRoutputIndex > 3250 ) { - appendToFiles(); - resetOutputArrays(); + //Format ESR scores and ouput + String[] ESRoutputScores = { Double.toString(maxRefESE), Double.toString(maxAltESE), Double.toString(minRefESS), Double.toString(minAltESS) }; + for (int k=0; k<4; k++) { + if (ESRoutputScores[k].equals("0.0")) { + ESRoutputScores[k]="."; + } + } + ESRoutput[ESRoutputIndex]=chr+"\t"+start+"\t"+refName+"\t"+altName+"\t"+"ESR"+"\t"+Double.toString(maxRefESE)+"\t"+Double.toString(maxAltESE)+"\t"+Double.toString(minRefESS)+"\t"+Double.toString(minAltESS); + ESRoutputIndex++; + //append arrays to file then reset + if (mesIndexAcc > 25000 || mesIndexDon > 9000 ) { + appendToFiles(); + resetOutputArrays(); + } else + if (gsIndex > 13000 || ESRoutputIndex > 3250 ) { + appendToFiles(); + resetOutputArrays(); + } } -} - -//get reverse complement -public static String getRC (String query) { - String rc = ""; - for (int i=query.length()-1; i>-1; i--) { - if (query.charAt(i)=='A') { - rc = rc.concat("T"); - } - if (query.charAt(i)=='C') { - rc = rc.concat("G"); - } - if (query.charAt(i)=='G') { - rc = rc.concat("C"); - } - if (query.charAt(i)=='T') { - rc = rc.concat("A"); - } - } - return rc; -} -//generate reference arrays for ESRseq scores, with all potential hexamers mapping to a unique index assigned by getESRindex() -public static double[] createESRarrays (String fileName) { - double[] ESRscores = new double[4096]; - try { - File ESRfile = new File(fileName); - BufferedReader br = new BufferedReader(new FileReader(ESRfile)); - String line = ""; - while ((line = br.readLine()) != null) { - String[] split = line.split("\\s+"); - ESRscores[getESRindex(split[0])]=Double.parseDouble(split[1]); + //get reverse complement + public static String getRC (String query) { + String rc = ""; + for (int i=query.length()-1; i>-1; i--) { + if (query.charAt(i)=='A') { + rc = rc.concat("T"); + } + if (query.charAt(i)=='C') { + rc = rc.concat("G"); + } + if (query.charAt(i)=='G') { + rc = rc.concat("C"); + } + if (query.charAt(i)=='T') { + rc = rc.concat("A"); + } } - br.close(); - }catch (Exception e){ - System.err.println("Error: " + e.getMessage()); + return rc; } - return ESRscores; -} -public static void fillESRtables() { - try { - //ESE - File ESEfile = new File("data/ESE.txt"); - BufferedReader ESEbr = new BufferedReader(new FileReader(ESEfile)); - String line = ""; - while ((line = ESEbr.readLine()) != null) { - String[] split = line.split("\\s+"); - ESEtable[getESRindex(split[0])]=Double.parseDouble(split[1]); + //generate reference arrays for ESRseq scores, with all potential hexamers mapping to a unique index assigned by getESRindex() + public static double[] createESRarrays (String fileName) { + double[] ESRscores = new double[4096]; + try { + File ESRfile = new File(fileName); + BufferedReader br = new BufferedReader(new FileReader(ESRfile)); + String line = ""; + while ((line = br.readLine()) != null) { + String[] split = line.split("\\s+"); + ESRscores[getESRindex(split[0])]=Double.parseDouble(split[1]); + } + br.close(); + }catch (Exception e){ + System.err.println("Error: " + e.getMessage()); } - ESEbr.close(); - //ESS - File ESSfile = new File("data/ESS.txt"); - BufferedReader ESSbr = new BufferedReader(new FileReader(ESSfile)); - line = ""; - while ((line = ESSbr.readLine()) != null) { - String[] split = line.split("\\s+"); - ESStable[getESRindex(split[0])]=Double.parseDouble(split[1]); + return ESRscores; + } + + public static void fillESRtables() { + try { + //ESE + File ESEfile = new File("data/ESE.txt"); + BufferedReader ESEbr = new BufferedReader(new FileReader(ESEfile)); + String line = ""; + while ((line = ESEbr.readLine()) != null) { + String[] split = line.split("\\s+"); + ESEtable[getESRindex(split[0])]=Double.parseDouble(split[1]); + } + ESEbr.close(); + //ESS + File ESSfile = new File("data/ESS.txt"); + BufferedReader ESSbr = new BufferedReader(new FileReader(ESSfile)); + line = ""; + while ((line = ESSbr.readLine()) != null) { + String[] split = line.split("\\s+"); + ESStable[getESRindex(split[0])]=Double.parseDouble(split[1]); + } + ESSbr.close(); + }catch (Exception e){ + System.err.println("Error: " + e.getMessage()); } - ESSbr.close(); - }catch (Exception e){ - System.err.println("Error: " + e.getMessage()); } -} -//convert hexamer sequence to integer position in ESR reference score array -public static int getESRindex (String seq) { - seq = seq.toUpperCase(); - double val=0; - for (int i=0; i<6; i++) { - char c = seq.charAt(i); - switch(c) { - case 'A': val = val + 0*Math.pow(4,i); - break; - case 'C': val = val + 1*Math.pow(4,i); - break; - case 'G': val = val + 2*Math.pow(4,i); - break; - case 'T': val = val + 3*Math.pow(4,i); - break; + //convert hexamer sequence to integer position in ESR reference score array + public static int getESRindex (String seq) { + seq = seq.toUpperCase(); + double val=0; + for (int i=0; i<6; i++) { + char c = seq.charAt(i); + switch(c) { + case 'A': val = val + 0*Math.pow(4,i); + break; + case 'C': val = val + 1*Math.pow(4,i); + break; + case 'G': val = val + 2*Math.pow(4,i); + break; + case 'T': val = val + 3*Math.pow(4,i); + break; + } } + return (int)val; } - return (int)val; -} -public static void resetOutputArrays() { - mesOutputAcc = new String[40000]; - mesOutputDon = new String[15000]; - gsOutput = new String[30000]; - ESRoutput = new String[10000]; - gsIndex = 0; mesIndexDon = 0; mesIndexAcc = 0; ESRoutputIndex = 0; - System.gc(); -} + public static void resetOutputArrays() { + mesOutputAcc = new String[40000]; + mesOutputDon = new String[15000]; + gsOutput = new String[30000]; + ESRoutput = new String[10000]; + gsIndex = 0; mesIndexDon = 0; mesIndexAcc = 0; ESRoutputIndex = 0; + System.gc(); + } -public static boolean containsIllegalFastaChar(String s, int i) { - for (int k=0; k -##FILTER= 54.69"> -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##GATKCommandLine= -##GATKCommandLine= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##reference=file:///g/data3/a32/References_and_Databases/hg38.noalt.decoy.bwa/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fa -##source=ApplyVQSR -##source=SelectVariants -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 11182 11194 11195 13245 13530 13531 14225 14226 14227 150790745 152140225 161480689 162100702 162420370 162590491 162590498 162850325 163050830 163050831 163340643 163340842 170370829 170730795 171010857 171010862 172240507 172550744 172790785 172790804 172820818 172820819 37_1 37_2 37_3 39_1 39_2 39_3 46840 46842 46843 46844 47483 7634956 7634962 8181716 F7514182 -chr1 65510 . C G 117.57 PASS AC=1;AF=0.014;AN=70;BaseQRankSum=1.28;ClippingRankSum=0;DP=2027;ExcessHet=6.9804;FS=0;InbreedingCoeff=-0.1494;MLEAC=2;MLEAF=0.029;MQ=8.59;MQRankSum=0.617;NEGATIVE_TRAIN_SITE;QD=4.9;ReadPosRankSum=1.78;SOR=0.976;VQSLOD=-2.904;culprit=MQRankSum;ANNOVAR_DATE=2017-07-17;Func.refGene=intergenic;Gene.refGene=NONE,DDX11L1;GeneDetail.refGene=dist,NONE,dist,1697;ExonicFunc.refGene=-999;AAChange.refGene=-999;EHRF_r90=-999;cytoBand=1p36.33;genomicSuperDups=0.991969,chr15:101906152;E_AF_POPMAX=0;E_AF_TOT=0;G_AF_POPMAX=0.0058064516129;G_AF_TOT=0.00244910454615;C_AF_TOT=0.00244910454615;C_Hom_TOT=7;ExAC_ALL=-999;ExAC_AFR=-999;ExAC_AMR=-999;ExAC_EAS=-999;ExAC_FIN=-999;ExAC_NFE=-999;ExAC_OTH=-999;ExAC_SAS=-999;ExAC_HIGH=-999;ExAC_HOMO=-999;VC_het_cnt=-999;VC_hom_cnt=-999;VC_freq=-999;VC_Total_freq=-999;VC_Total_variant_cnt=-999;VC_Total_allele_cnt=-999;esp6500siv2_all=-999;1000g2015aug_all=-999;clinvar_20150629=-999;UK10K=-999;SIFT_score=-999;SIFT_pred=-999;Polyphen2_HDIV_score=-999;Polyphen2_HDIV_pred=-999;Polyphen2_HVAR_score=-999;Polyphen2_HVAR_pred=-999;LRT_score=-999;LRT_pred=-999;MutationTaster_score=-999;MutationTaster_pred=-999;MutationAssessor_score=-999;MutationAssessor_pred=-999;FATHMM_score=-999;FATHMM_pred=-999;PROVEAN_score=-999;PROVEAN_pred=-999;VEST3_score=-999;CADD_raw=-999;CADD_phred=-999;DANN_score=-999;fathmm-MKL_coding_score=-999;fathmm-MKL_coding_pred=-999;MetaSVM_score=-999;MetaSVM_pred=-999;MetaLR_score=-999;MetaLR_pred=-999;integrated_fitCons_score=-999;integrated_confidence_value=-999;GERP++_RS=-999;phyloP7way_vertebrate=-999;phyloP20way_mammalian=-999;phastCons7way_vertebrate=-999;phastCons20way_mammalian=-999;SiPhy_29way_logOdds=-999;avsnp144=-999;MCAP=-999;gwava_ed=0.49;BP_numREF=-999;BP_numALT=-999;BP_scoreREF=-999;BP_scoreALT=-999;BP_U2REF=-999;BP_U2ALT=-999;GS_REF=-999;GS_ALT=-999;GS_TYPE=-999;MESaccRef=-999;MESaccAlt=-999;MESdonRef=-999;MESdonAlt=-999;spidex_mt=-999;spidex_zs=-999;dbscSNV_ada=-999;dbscSNV_rf=-999;LINSIGHT_score=-999;VARIANT_TYPE=intergenic_._;ALLELE_END GT:AD:DP:GQ:PL ./.:33,0:33:.:0,0,0 ./.:14,0:14:.:0,0,0 0/0:31,0:31:15:0,15,225 0/0:32,0:32:21:0,21,315 0/0:37,0:37:0:0,0,139 0/0:53,0:53:0:0,0,579 0/0:36,0:36:0:0,0,300 0/1:10,14:24:99:126,0,181 0/0:27,0:27:0:0,0,180 0/0:41,0:41:0:0,0,116 0/0:51,0:51:48:0,48,720 0/0:44,0:44:30:0,30,450 ./.:53,0:53:.:0,0,0 0/0:55,0:55:42:0,42,630 0/0:38,0:38:0:0,0,344 ./.:45,0:45:.:0,0,0 0/0:60,0:60:14:0,14,1203 ./.:44,0:44:.:0,0,0 0/0:52,0:52:4:0,4,960 ./.:35,0:35:.:0,0,0 ./.:42,0:42:.:0,0,0 0/0:55,0:55:5:0,5,995 0/0:44,0:44:0:0,0,293 ./.:41,0:41:.:0,0,0 0/0:27,0:27:12:0,12,548 0/0:48,0:48:41:0,41,992 0/0:54,0:54:12:0,12,1098 0/0:39,0:39:0:0,0,13 0/0:36,0:36:0:0,0,648 0/0:30,0:30:0:0,0,262 0/0:25,0:25:0:0,0,266 0/0:62,0:62:0:0,0,706 ./.:58,0:58:.:0,0,0 0/0:53,0:53:0:0,0,126 0/0:59,0:59:15:0,15,1099 0/0:50,0:50:37:0,37,1034 0/0:39,0:39:0:0,0,504 0/0:58,0:58:30:0,30,450 0/0:53,0:53:24:0,24,360 0/0:56,0:56:48:0,48,720 0/0:44,0:44:0:0,0,537 0/0:43,0:43:21:0,21,315 ./.:53,0:53:.:0,0,0 0/0:51,0:51:39:0,39,585 ./.:56,0:56:.:0,0,0 0/0:35,0:35:9:0,9,135 -chr1 66808 rs201752861 T G 190.06 VQSRTrancheSNP99.95to100.00 AC=1;AF=0.024;AN=42;BaseQRankSum=-0.328;ClippingRankSum=0;DB;DP=1850;ExcessHet=4.7712;FS=0;InbreedingCoeff=0.0386;MLEAC=6;MLEAF=0.143;MQ=3.51;MQRankSum=-0.484;OLD_MULTIALLELIC=chr1:10177:A/*/C;QD=6.13;ReadPosRankSum=-0.652;SOR=0.763;VQSLOD=-18.46;culprit=MQ;ANNOVAR_DATE=2017-07-17;Func.refGene=intergenic;Gene.refGene=NONE,DDX11L1;GeneDetail.refGene=dist,NONE,dist,1697;ExonicFunc.refGene=-999;AAChange.refGene=-999;EHRF_r90=-999;cytoBand=1p36.33;genomicSuperDups=0.991969,chr15:101906152;E_AF_POPMAX=-999;E_AF_TOT=-999;G_AF_POPMAX=-999;G_AF_TOT=-999;C_AF_TOT=-999;C_Hom_TOT=-999;ExAC_ALL=-999;ExAC_AFR=-999;ExAC_AMR=-999;ExAC_EAS=-999;ExAC_FIN=-999;ExAC_NFE=-999;ExAC_OTH=-999;ExAC_SAS=-999;ExAC_HIGH=-999;ExAC_HOMO=-999;VC_het_cnt=-999;VC_hom_cnt=-999;VC_freq=-999;VC_Total_freq=-999;VC_Total_variant_cnt=-999;VC_Total_allele_cnt=-999;esp6500siv2_all=-999;1000g2015aug_all=-999;clinvar_20150629=-999;UK10K=-999;SIFT_score=-999;SIFT_pred=-999;Polyphen2_HDIV_score=-999;Polyphen2_HDIV_pred=-999;Polyphen2_HVAR_score=-999;Polyphen2_HVAR_pred=-999;LRT_score=-999;LRT_pred=-999;MutationTaster_score=-999;MutationTaster_pred=-999;MutationAssessor_score=-999;MutationAssessor_pred=-999;FATHMM_score=-999;FATHMM_pred=-999;PROVEAN_score=-999;PROVEAN_pred=-999;VEST3_score=-999;CADD_raw=-999;CADD_phred=-999;DANN_score=-999;fathmm-MKL_coding_score=-999;fathmm-MKL_coding_pred=-999;MetaSVM_score=-999;MetaSVM_pred=-999;MetaLR_score=-999;MetaLR_pred=-999;integrated_fitCons_score=-999;integrated_confidence_value=-999;GERP++_RS=-999;phyloP7way_vertebrate=-999;phyloP20way_mammalian=-999;phastCons7way_vertebrate=-999;phastCons20way_mammalian=-999;SiPhy_29way_logOdds=-999;avsnp144=-999;MCAP=-999;gwava_ed=0.49;BP_numREF=-999;BP_numALT=-999;BP_scoreREF=-999;BP_scoreALT=-999;BP_U2REF=-999;BP_U2ALT=-999;GS_REF=-999;GS_ALT=-999;GS_TYPE=-999;MESaccRef=-999;MESaccAlt=-999;MESdonRef=-999;MESdonAlt=-999;spidex_mt=-999;spidex_zs=-999;dbscSNV_ada=-999;dbscSNV_rf=-999;LINSIGHT_score=-999;VARIANT_TYPE=intergenic_._;ALLELE_END GT:AD:DP:GQ:PL ./.:33,0:33:.:0,0,0 ./.:14,0:14:.:0,0,0 ./.:31,0:31:.:0,0,0 0/0:29,0:29:0:0,0,99 0/0:37,0:37:0:0,0,139 0/0:53,0:53:0:0,0,41 ./.:28,0:28:.:0,0,0 0/1:10,14:24:99:126,0,181 0/0:25,0:25:0:0,0,57 0/0:41,0:41:0:0,0,116 0/0:49,0:49:0:0,0,122 ./.:34,0:34:.:0,0,0 ./.:53,0:53:.:0,0,0 0/0:50,0:50:0:0,0,142 0/0:34,0:34:0:0,0,175 ./.:45,0:45:.:0,0,0 0/0:50,0:50:0:0,0,67 ./.:44,0:44:.:0,0,0 0/.:4,0:7:36:36,47,126 ./.:35,0:35:.:0,0,0 ./.:42,0:42:.:0,0,0 ./.:45,0:45:.:0,0,0 ./.:40,0:40:.:0,0,0 ./.:41,0:41:.:0,0,0 ./.:24,0:24:.:0,0,0 0/0:47,0:47:0:0,0,8 0/0:48,0:48:0:0,0,231 0/0:39,0:39:0:0,0,13 ./.:29,0:29:.:0,0,0 0/0:24,0:24:0:0,0,62 0/0:21,0:21:0:0,0,24 ./.:54,0:54:.:0,0,0 ./.:58,0:58:.:0,0,0 0/0:53,0:53:0:0,0,126 ./.:56,0:56:.:0,0,0 0/0:43,0:43:0:0,0,133 0/0:38,0:38:0:0,0,105 ./.:49,0:49:.:0,0,0 ./.:46,0:46:.:0,0,0 0/0:45,0:45:0:0,0,216 ./.:40,0:40:.:0,0,0 ./.:43,0:43:.:0,0,0 ./.:53,0:53:.:0,0,0 ./.:50,0:50:.:0,0,0 ./.:56,0:56:.:0,0,0 0/0:34,0:34:0:0,0,73 -chr1 67519 rs201752861 A G 190.06 VQSRTrancheSNP99.95to100.00 AC=1;AF=0.024;AN=42;BaseQRankSum=-0.328;ClippingRankSum=0;DB;DP=1850;ExcessHet=4.7712;FS=0;InbreedingCoeff=0.0386;MLEAC=6;MLEAF=0.143;MQ=3.51;MQRankSum=-0.484;OLD_MULTIALLELIC=chr1:10177:A/*/C;QD=6.13;ReadPosRankSum=-0.652;SOR=0.763;VQSLOD=-18.46;culprit=MQ;ANNOVAR_DATE=2017-07-17;Func.refGene=intergenic;Gene.refGene=NONE,DDX11L1;GeneDetail.refGene=dist,NONE,dist,1697;ExonicFunc.refGene=-999;AAChange.refGene=-999;EHRF_r90=-999;cytoBand=1p36.33;genomicSuperDups=0.991969,chr15:101906152;E_AF_POPMAX=-999;E_AF_TOT=-999;G_AF_POPMAX=-999;G_AF_TOT=-999;C_AF_TOT=-999;C_Hom_TOT=-999;ExAC_ALL=-999;ExAC_AFR=-999;ExAC_AMR=-999;ExAC_EAS=-999;ExAC_FIN=-999;ExAC_NFE=-999;ExAC_OTH=-999;ExAC_SAS=-999;ExAC_HIGH=-999;ExAC_HOMO=-999;VC_het_cnt=-999;VC_hom_cnt=-999;VC_freq=-999;VC_Total_freq=-999;VC_Total_variant_cnt=-999;VC_Total_allele_cnt=-999;esp6500siv2_all=-999;1000g2015aug_all=-999;clinvar_20150629=-999;UK10K=-999;SIFT_score=-999;SIFT_pred=-999;Polyphen2_HDIV_score=-999;Polyphen2_HDIV_pred=-999;Polyphen2_HVAR_score=-999;Polyphen2_HVAR_pred=-999;LRT_score=-999;LRT_pred=-999;MutationTaster_score=-999;MutationTaster_pred=-999;MutationAssessor_score=-999;MutationAssessor_pred=-999;FATHMM_score=-999;FATHMM_pred=-999;PROVEAN_score=-999;PROVEAN_pred=-999;VEST3_score=-999;CADD_raw=-999;CADD_phred=-999;DANN_score=-999;fathmm-MKL_coding_score=-999;fathmm-MKL_coding_pred=-999;MetaSVM_score=-999;MetaSVM_pred=-999;MetaLR_score=-999;MetaLR_pred=-999;integrated_fitCons_score=-999;integrated_confidence_value=-999;GERP++_RS=-999;phyloP7way_vertebrate=-999;phyloP20way_mammalian=-999;phastCons7way_vertebrate=-999;phastCons20way_mammalian=-999;SiPhy_29way_logOdds=-999;avsnp144=-999;MCAP=-999;gwava_ed=0.49;BP_numREF=-999;BP_numALT=-999;BP_scoreREF=-999;BP_scoreALT=-999;BP_U2REF=-999;BP_U2ALT=-999;GS_REF=-999;GS_ALT=-999;GS_TYPE=-999;MESaccRef=-999;MESaccAlt=-999;MESdonRef=-999;MESdonAlt=-999;spidex_mt=-999;spidex_zs=-999;dbscSNV_ada=-999;dbscSNV_rf=-999;LINSIGHT_score=-999;VARIANT_TYPE=intergenic_._;ALLELE_END GT:AD:DP:GQ:PL ./.:33,0:33:.:0,0,0 ./.:14,0:14:.:0,0,0 ./.:31,0:31:.:0,0,0 0/0:29,0:29:0:0,0,99 0/0:37,0:37:0:0,0,139 0/0:53,0:53:0:0,0,41 ./.:28,0:28:.:0,0,0 0/1:10,14:24:99:126,0,181 0/0:25,0:25:0:0,0,57 0/0:41,0:41:0:0,0,116 0/0:49,0:49:0:0,0,122 ./.:34,0:34:.:0,0,0 ./.:53,0:53:.:0,0,0 0/0:50,0:50:0:0,0,142 0/0:34,0:34:0:0,0,175 ./.:45,0:45:.:0,0,0 0/0:50,0:50:0:0,0,67 ./.:44,0:44:.:0,0,0 0/.:4,0:7:36:36,47,126 ./.:35,0:35:.:0,0,0 ./.:42,0:42:.:0,0,0 ./.:45,0:45:.:0,0,0 ./.:40,0:40:.:0,0,0 ./.:41,0:41:.:0,0,0 ./.:24,0:24:.:0,0,0 0/0:47,0:47:0:0,0,8 0/0:48,0:48:0:0,0,231 0/0:39,0:39:0:0,0,13 ./.:29,0:29:.:0,0,0 0/0:24,0:24:0:0,0,62 0/0:21,0:21:0:0,0,24 ./.:54,0:54:.:0,0,0 ./.:58,0:58:.:0,0,0 0/0:53,0:53:0:0,0,126 ./.:56,0:56:.:0,0,0 0/0:43,0:43:0:0,0,133 0/0:38,0:38:0:0,0,105 ./.:49,0:49:.:0,0,0 ./.:46,0:46:.:0,0,0 0/0:45,0:45:0:0,0,216 ./.:40,0:40:.:0,0,0 ./.:43,0:43:.:0,0,0 ./.:53,0:53:.:0,0,0 ./.:50,0:50:.:0,0,0 ./.:56,0:56:.:0,0,0 0/0:34,0:34:0:0,0,73 -chr1 69015 . C T 117.57 PASS AC=1;AF=0.014;AN=70;BaseQRankSum=1.28;ClippingRankSum=0;DP=2027;ExcessHet=6.9804;FS=0;InbreedingCoeff=-0.1494;MLEAC=2;MLEAF=0.029;MQ=8.59;MQRankSum=0.617;NEGATIVE_TRAIN_SITE;QD=4.9;ReadPosRankSum=1.78;SOR=0.976;VQSLOD=-2.904;culprit=MQRankSum;ANNOVAR_DATE=2017-07-17;Func.refGene=intergenic;Gene.refGene=NONE,DDX11L1;GeneDetail.refGene=dist,NONE,dist,1697;ExonicFunc.refGene=-999;AAChange.refGene=-999;EHRF_r90=-999;cytoBand=1p36.33;genomicSuperDups=0.991969,chr15:101906152;E_AF_POPMAX=0;E_AF_TOT=0;G_AF_POPMAX=0.0058064516129;G_AF_TOT=0.00244910454615;C_AF_TOT=0.00244910454615;C_Hom_TOT=7;ExAC_ALL=-999;ExAC_AFR=-999;ExAC_AMR=-999;ExAC_EAS=-999;ExAC_FIN=-999;ExAC_NFE=-999;ExAC_OTH=-999;ExAC_SAS=-999;ExAC_HIGH=-999;ExAC_HOMO=-999;VC_het_cnt=-999;VC_hom_cnt=-999;VC_freq=-999;VC_Total_freq=-999;VC_Total_variant_cnt=-999;VC_Total_allele_cnt=-999;esp6500siv2_all=-999;1000g2015aug_all=-999;clinvar_20150629=-999;UK10K=-999;SIFT_score=-999;SIFT_pred=-999;Polyphen2_HDIV_score=-999;Polyphen2_HDIV_pred=-999;Polyphen2_HVAR_score=-999;Polyphen2_HVAR_pred=-999;LRT_score=-999;LRT_pred=-999;MutationTaster_score=-999;MutationTaster_pred=-999;MutationAssessor_score=-999;MutationAssessor_pred=-999;FATHMM_score=-999;FATHMM_pred=-999;PROVEAN_score=-999;PROVEAN_pred=-999;VEST3_score=-999;CADD_raw=-999;CADD_phred=-999;DANN_score=-999;fathmm-MKL_coding_score=-999;fathmm-MKL_coding_pred=-999;MetaSVM_score=-999;MetaSVM_pred=-999;MetaLR_score=-999;MetaLR_pred=-999;integrated_fitCons_score=-999;integrated_confidence_value=-999;GERP++_RS=-999;phyloP7way_vertebrate=-999;phyloP20way_mammalian=-999;phastCons7way_vertebrate=-999;phastCons20way_mammalian=-999;SiPhy_29way_logOdds=-999;avsnp144=-999;MCAP=-999;gwava_ed=0.49;BP_numREF=-999;BP_numALT=-999;BP_scoreREF=-999;BP_scoreALT=-999;BP_U2REF=-999;BP_U2ALT=-999;GS_REF=-999;GS_ALT=-999;GS_TYPE=-999;MESaccRef=-999;MESaccAlt=-999;MESdonRef=-999;MESdonAlt=-999;spidex_mt=-999;spidex_zs=-999;dbscSNV_ada=-999;dbscSNV_rf=-999;LINSIGHT_score=-999;VARIANT_TYPE=intergenic_._;ALLELE_END GT:AD:DP:GQ:PL ./.:33,0:33:.:0,0,0 ./.:14,0:14:.:0,0,0 0/0:31,0:31:15:0,15,225 0/0:32,0:32:21:0,21,315 0/0:37,0:37:0:0,0,139 0/0:53,0:53:0:0,0,579 0/0:36,0:36:0:0,0,300 0/1:10,14:24:99:126,0,181 0/0:27,0:27:0:0,0,180 0/0:41,0:41:0:0,0,116 0/0:51,0:51:48:0,48,720 0/0:44,0:44:30:0,30,450 ./.:53,0:53:.:0,0,0 0/0:55,0:55:42:0,42,630 0/0:38,0:38:0:0,0,344 ./.:45,0:45:.:0,0,0 0/0:60,0:60:14:0,14,1203 ./.:44,0:44:.:0,0,0 0/0:52,0:52:4:0,4,960 ./.:35,0:35:.:0,0,0 ./.:42,0:42:.:0,0,0 0/0:55,0:55:5:0,5,995 0/0:44,0:44:0:0,0,293 ./.:41,0:41:.:0,0,0 0/0:27,0:27:12:0,12,548 0/0:48,0:48:41:0,41,992 0/0:54,0:54:12:0,12,1098 0/0:39,0:39:0:0,0,13 0/0:36,0:36:0:0,0,648 0/0:30,0:30:0:0,0,262 0/0:25,0:25:0:0,0,266 0/0:62,0:62:0:0,0,706 ./.:58,0:58:.:0,0,0 0/0:53,0:53:0:0,0,126 0/0:59,0:59:15:0,15,1099 0/0:50,0:50:37:0,37,1034 0/0:39,0:39:0:0,0,504 0/0:58,0:58:30:0,30,450 0/0:53,0:53:24:0,24,360 0/0:56,0:56:48:0,48,720 0/0:44,0:44:0:0,0,537 0/0:43,0:43:21:0,21,315 ./.:53,0:53:.:0,0,0 0/0:51,0:51:39:0,39,585 ./.:56,0:56:.:0,0,0 0/0:35,0:35:9:0,9,135 +##fileformat=VCFv4.0 +##assembly=GRCh38/hg38 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 65510 . T G 117.57 PASS . +chr1 66808 . A G 117.57 PASS . +chr1 69015 . C T 117.57 PASS .