#!/bin/bash
#
# buildtreea  (NOTE(review): presumably this script's name -- confirm)


# Want to handle three cases:
#  - All SNPs (provided in a single file)
#  - Core SNPs (provided in a single file)
#  - SNPs that are in a user-selectable fraction of the genomes (provided in a single file)
#
# In the latter two cases, we want to run the second half of the script
# Do this by checking to see if the filename is that of the canonical output for all of the SNPs

### 
### # Dependencies
### # Files
### ${1}
### ${1}_matrix
### ${1}_matrix.fasta
### 
### # Variables
### SNPLOCISUFFIX

#########################
#########################
### Variables setup
#########################
#########################

# Wrapper through which most tool invocations below are run.  Point this at
# a full path if perfer is not in the $PATH, or substitute your performance
# analysis tool of choice.
PERFER='perfer'

# Positional arguments: $1 is the SNP file, $2 the kind of tree to build
# (the tree-building step accepts NJ, ML or parsimony).
SNPSFILE=$1
TREETYPE=$2

# Identifier built from the input file and the tree type; it is embedded in
# most of the output filenames below.
RUNID="$SNPSFILE.$TREETYPE"

# SNP matrix and its FASTA companion.  SNPLOCISUFFIX is not assigned in this
# part of the file; presumably it comes from the caller's environment.
SNPSMATRIX="${SNPSFILE}_matrix"
SNPSMATRIXFASTA="$SNPSMATRIX.$SNPLOCISUFFIX"

# Distance matrix written for the NJ tree.
NJDISTMATRIX='NJ.dist.matrix'

# Files written by parsimonator.  The names are chosen by the program itself;
# the only part under our control is the run id handed over with -n.
PARSIMONATORINFO="RAxML_info.$RUNID"
PARSIMONATORTREE="RAxML_parsimonyTree.$RUNID"

# List of files holding input trees; their contents must be concatenated to
# make $INTREE.
INTREESOURCE='intree.source'

# Fixed filenames used by the 'consense' tool.
INTREE='intree'                 # Input it expects; it prompts the user if the file does not exist.
OUTFILE='outfile'               # Output it generates; it bails if this file already exists.
OUTTREE='outtree'               # Output it generates; it bails if this file already exists.
CONSENSEOUTPUT='consense.out'   # Holds consense's terminal output, which is not useful to the user 99% of the time.

# Consensus tree after forced binary resolution.
RESOLVEDTREE='outtree.resolved'

# Final tree for this run.
TREEOUTPUT="tree.$RUNID.tre"

# Node labelling outputs.
NODELABELFILE="tree_nodeLabel.$RUNID.tre"
NODELABELFILEREROOTED="$NODELABELFILE.rerooted"
NODEFILE="nodes.$RUNID"
NODEPERLHASH="nodes.$RUNID.perlhash"

# SNP count tables.
NODESNPCOUNTS="Node_SNP_counts.$RUNID"
TIPSNPCOUNTS="tip_SNP_counts.$RUNID"

# Trees annotated with allele counts, without and with node labels.
TIPALLELECOUNTSTREE="tree_tipAlleleCounts.$RUNID.tre"
ALLELECOUNTSTREE="tree_AlleleCounts.$RUNID.tre"
TIPALLELECOUNTSNODETREE="tree_tipAlleleCounts.$RUNID.NodeLabel.tre"
ALLELECOUNTSNODETREE="tree_AlleleCounts.$RUNID.NodeLabel.tre"


#########################
#########################
### Execution
#########################
#########################

echo "Proccessing SNPs from ${SNPSFILE}"

## Create a SNP matrix and fasta, for inputting to PHYLIP, FastTreeMP or other tools like SplitsTree.
# The conversion is skipped when both output files already exist.  Two
# separate tests joined with && are used instead of the obsolescent
# `[ A -a B ]` form, which is ambiguous for some operand values.
if [ -e "${SNPSMATRIXFASTA}" ] && [ -e "${SNPSMATRIX}" ]; then
    echo "Matrix already created, skipping"
else
    # Arguments: input SNP file, then the FASTA and matrix files to produce.
    "${PERFER}" "${SNPSTOFASTAMATRIX}" "${SNPSFILE}" "${SNPSMATRIXFASTA}" "${SNPSMATRIX}"
fi

echo "Building ${TREETYPE} tree"

# Build a tree of the requested type and leave it in ${TREEOUTPUT}.
# Supported values of ${TREETYPE}: NJ, ML, parsimony.  Any other value prints
# an error and terminates the script with status 1.
case "${TREETYPE}" in
    NJ)
        # NOTE:  This next line can take a long time if there are million+ SNP loci and 100+ genomes.
        # SNP_matrix2dist_matrix does loops, so it's slow and should be parallelized.  The PHYLIP
        # program is a possible alternative, although scores might differ: here genomes count as
        # somewhat closer if they share a locus but not the allele than if they don't even share
        # the locus.  NJ SNP trees are not accurate anyway, so no one should use this option.
        "${PERFER}" "${SNPMATRIXTODIST}" "${SNPSMATRIX}" > "${NJDISTMATRIX}"
        # NOTE(review): no input is passed here; presumably the program picks up
        # ${NJDISTMATRIX} by its fixed name -- confirm.
        "${PERFER}" "${DISTANCETREE}" > "${TREEOUTPUT}"
        ;;

    ML)
        "${PERFER}" "${FASTTREEMP}" -nt -pseudo -gamma -gtr "${SNPSMATRIXFASTA}" > "${TREEOUTPUT}"
        ;;

    parsimony)
        # Build parsimony trees.
        # From _the_source_code_ (axml.c, get_args()):
        # (side note, filenames are limited to 1k characters, so overflow is possible)
        #  -s <sequence file> (no info on file format here)
        #  -p <parsimony seed>
        #  -n <runID>  (Output goes to RAxML_parsimonyTree.<runID> and RAxML_info.<runID>)
        #  -N <number of trees> (Number of parsimony trees to compute)
        # ${RUNID} is the same "<SNP file>.<tree type>" string that the RAxML_* filename
        # variables are derived from, so it is passed directly.
        "${PERFER}" "${PARSIMONATOR}" -s "${SNPSMATRIX}" -n "${RUNID}" -N "${NUMTREES}" -p "${PARSIMONYSEED}"

        # Parse the parsimonator info file to report on the generated trees.
        # Field 6 of each "Parsimony tree" line is the score; the lowest one is the best.
        BESTPARSIMONYSCORE=$(grep "Parsimony tree" "${PARSIMONATORINFO}" | sort -k6,6n | head -n 1 | awk '{print $6}')
        # Collect the filenames (field 14) of the trees with the best score in ${INTREESOURCE}.
        grep "Parsimony tree" "${PARSIMONATORINFO}" | awk -v "score=${BESTPARSIMONYSCORE}" '$6==score {print $14}' > "${INTREESOURCE}"
        # awk strips the padding some wc implementations put before the count.
        NUMBESTPARSIMONYTREES=$(wc -l < "${INTREESOURCE}" | awk '{print $1}')
        echo "Most parsimonious (shortest, best) trees generated have length: ${BESTPARSIMONYSCORE}"
        echo "Number of most parsimonious trees from SNPs_all: ${NUMBESTPARSIMONYTREES}"

        # Combine all of the best generated trees into one file named ${INTREE}.
        # read -r keeps backslashes in filenames intact.
        while read -r FILENAME; do
            cat "${FILENAME}"
        done < "${INTREESOURCE}" > "${INTREE}"

        # Ensure no errors from the 'consense' tool due to pre-existing output files.
        rm -f "${OUTTREE}" "${OUTFILE}"

        # Get majority consensus tree.
        #
        # PHYLIP consense was the only tool found that forced resolution of every branch.  FastTree,
        # used below to give it branch lengths, will crash if some nodes have splits to >2 children.
        # seq.h and phylip.h must be modified before compiling consense to allow longer names so
        # they don't get truncated.
        # consense asks for confirmation in the default case and prints configuration details our
        # users don't care about, so answer "Y" on stdin and send all of its output to
        # ${CONSENSEOUTPUT}.  The file redirection must come before 2>&1, otherwise stderr still
        # goes to the terminal.  printf is used because echo's handling of "\n" varies between shells.
        printf 'Y\n' | "${CONSENSE}" > "${CONSENSEOUTPUT}" 2>&1

        # Give it branch lengths, optimized for the consensus parsimony tree.
        # Input to force_binary_tree is one tree per file; ${RESOLVEDTREE} is its output.
        "${PERFER}" "${FORCEBINARYTREE}" "${OUTTREE}" "${RESOLVEDTREE}"
        "${PERFER}" "${FASTTREEMP}" -nt -pseudo -nome -mllen -gamma -gtr -intree "${RESOLVEDTREE}" "${SNPSMATRIXFASTA}" > "${TREEOUTPUT}"
        ;;

    *)
        echo "ERROR: Don't know how to make a tree of type: ${TREETYPE}."
        exit 1
        ;;
esac

echo "Finding nodes"

# Label the nodes of the tree built above, then write the node file from the labelled tree.
# NOTE(review): ${TREENODES} runs before the non-empty check below, so it is also
# invoked when labelling produced nothing -- confirm whether that is intended.
"${PERFER}" "${LABELTREENODES}" "${TREEOUTPUT}" > "${NODELABELFILE}"
"${PERFER}" "${TREENODES}" "${NODELABELFILE}" "${NODEFILE}"

# Only continue when the node label file exists and is non-empty; there is no
# sense proceeding otherwise.
if [ -s "${NODELABELFILE}" ]; then

    echo "Placing SNPs on nodes ${SNPSFILE} tree"
    "${PERFER}" "${SNPSTONODES}" "${SNPSFILE}" "${NODEPERLHASH}" "${NODELABELFILE}" "${NODESNPCOUNTS}"

    # Rename these fixed-name output files, when present, so the name reflects
    # the source file and tree type used to generate the data.
    for FILE in COUNT_Homoplastic_SNPs ClusterInfo Homoplasy_groups; do
        if [ -e "${FILE}" ]; then
            mv "${FILE}" "${FILE}.${RUNID}"
        fi
    done

    echo "Finished placing SNPs on nodes ${SNPSFILE} ${TREETYPE} tree"

    # Tab-separated table of per-tip SNP counts: a header line, then fields 2 and 6
    # of every "node: " line that also contains "NumberTargets: 1".
    # printf is used for the header because `echo -e` is not portable: a POSIX sh
    # would print a literal "-e" into the file.
    printf 'name_on_tree\tSNP_counts\n' > "${TIPSNPCOUNTS}"
    grep "node: " "${NODESNPCOUNTS}" | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> "${TIPSNPCOUNTS}"

    # If we have a rerooted output, replace the non-rerooted file with it.
    if [ -s "${NODELABELFILEREROOTED}" ]; then
        rm -f "${NODELABELFILE}"
        mv -f "${NODELABELFILEREROOTED}" "${NODELABELFILE}"
    fi

    #rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # don't overwrite tree.$t.tre anymore since we want the support values in original file.

    # Write the allele-count trees twice; the calls differ in the output
    # filenames and in the final argument (0, then 1 for the *.NodeLabel.tre outputs).
    "${PERFER}" "${LABELTREEALLELECOUNT}" "${NODELABELFILE}" "${NODESNPCOUNTS}" "${TIPALLELECOUNTSTREE}" "${ALLELECOUNTSTREE}" 0
    "${PERFER}" "${LABELTREEALLELECOUNT}" "${NODELABELFILE}" "${NODESNPCOUNTS}" "${TIPALLELECOUNTSNODETREE}" "${ALLELECOUNTSNODETREE}" 1
else
    echo "ERROR: Node label file ${NODELABELFILE} not found!  Skipped tree labeling."
fi

# Files generated that are no longer needed:
# nodes.*
# tree_tipAlleleCounts.*.NodeLabel.tre
# tree_nodeLabel.*