-
Notifications
You must be signed in to change notification settings - Fork 0
/
buildtreea
executable file
·200 lines (155 loc) · 7.87 KB
/
buildtreea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#!/bin/bash
# Want to handle three cases:
# - All SNPs (provided in a single file)
# - Core SNPs (provided in a single file)
# - SNPs that are in a user-selectable fraction of the genomes (provided in a single file)
#
# In the latter two cases, we want to run the second half of the script
# Do this by checking to see if the filename is that of the canonical output for all of the SNPs
###
### # Dependencies
### # Files
### ${1}
### ${1}_matrix
### ${1}_matrix.fasta
###
### # Variables
### SNPLOCISUFFIX
#########################
#########################
### Variables setup
#########################
#########################
# If perfer is not in the $PATH, set it here, or use your performance analysis tool of choice.
PERFER=perfer
SNPSFILE="${1}"
TREETYPE="${2}"
RUNID="${SNPSFILE}.${TREETYPE}"
SNPSMATRIX="${SNPSFILE}_matrix"
SNPSMATRIXFASTA="${SNPSMATRIX}.${SNPLOCISUFFIX}"
NJDISTMATRIX="NJ.dist.matrix"
# These are files generated by parsimonator; names chosen by the program / not under our control except
# that we pass ${SNPSALLOUTPUT} as a commandline option.
PARSIMONATORINFO="RAxML_info.${RUNID}"
PARSIMONATORTREE="RAxML_parsimonyTree.${RUNID}"
INTREESOURCE="intree.source" # File to collect a list of files where input trees are stored. Contents must be concatenated to make $INTREE
INTREE="intree" # Filename expected by 'consense' tool. It will prompt the user if it does not exist.
OUTFILE="outfile" # Filename generated by 'consense' tool. It will bail if this file exists.
OUTTREE="outtree" # Filename generated by 'consense' tool. It will bail if this file exists.
CONSENSEOUTPUT="consense.out" # Filename to hold the terminal output of the 'consense' tool; it is not useful to the user 99% of the time.
RESOLVEDTREE="outtree.resolved"
TREEOUTPUT="tree.${RUNID}.tre"
NODELABELFILE="tree_nodeLabel.${RUNID}.tre"
NODEFILE="nodes.${RUNID}"
NODESNPCOUNTS="Node_SNP_counts.${RUNID}"
NODEPERLHASH="nodes.${RUNID}.perlhash"
TIPSNPCOUNTS="tip_SNP_counts.${RUNID}"
NODELABELFILEREROOTED="${NODELABELFILE}.rerooted"
TIPALLELECOUNTSTREE="tree_tipAlleleCounts.${RUNID}.tre"
ALLELECOUNTSTREE="tree_AlleleCounts.${RUNID}.tre"
TIPALLELECOUNTSNODETREE="tree_tipAlleleCounts.${RUNID}.NodeLabel.tre"
ALLELECOUNTSNODETREE="tree_AlleleCounts.${RUNID}.NodeLabel.tre"
#########################
#########################
### Execution
#########################
#########################
echo "Proccessing SNPs from ${SNPSFILE}"
## Create a SNP matrix and fasta, for inputting to PHYLIP, FastTreeMP or other tools like SplitsTree
if [ -e "${SNPSMATRIXFASTA}" -a -e "${SNPSMATRIX}" ] # if this work is already done...
then
echo "Matrix already created, skipping"
else
"${PERFER}" "${SNPSTOFASTAMATRIX}" "${SNPSFILE}" "${SNPSMATRIXFASTA}" "${SNPSMATRIX}"
fi
echo "Building ${TREETYPE} tree"
case "${TREETYPE}" in
NJ)
# NOTE: This next line can take a long time if there are million+ SNP loci and 100+ genomes.
# SNP_matrix2dist_matrix does loops, so it's slow, should be parallelized. Probably should try
# the PHYLIP program, although scores might be different since i count them as somewhat closer
# if they share a locus but not the allele than if they don't even share the locus. But since
# NJ SNP trees are not accurate anyway, i'm not inclined to spend anymore time since no one should
# use this option.
#
# Above seems clear - JN
"${PERFER}" "${SNPMATRIXTODIST}" "${SNPSMATRIX}" > "${NJDISTMATRIX}"
# Does this program take $NJDISTMATRIX as input? Guessing it does....
"${PERFER}" "${DISTANCETREE}" > "${TREEOUTPUT}"
;;
ML)
"${PERFER}" "${FASTTREEMP}" -nt -pseudo -gamma -gtr "${SNPSMATRIXFASTA}" > "${TREEOUTPUT}"
;;
parsimony)
# Build parsimony tree
# From _the_source_code_ (axml.c, get_args()):
# (side note, filenames are limited to 1k characters. possibility of overflow here, yay C!)
# -s <sequence file> (no info on file format here)
# -p <parsimony seed> (used for randomness?!??)
# -n <runID> (Causes run <runID> to be analyzed. Output data is put into two files: RAxML_parsimonyTree.<runID> and RAxML_info.<info>)
# -N <number of trees> (Number of parsimony trees to compute)
"${PERFER}" "${PARSIMONATOR}" -s "${SNPSMATRIX}" -n "${SNPSFILE}.${TREETYPE}" -N "${NUMTREES}" -p "${PARSIMONYSEED}"
# Next we parse the output of $PARSIMONATOR to report on the generated trees.
BESTPARSIMONYSCORE=`grep "Parsimony tree" "${PARSIMONATORINFO}" | sort -k6 -n | head -1 | awk '{print $6}'`
# Create a list of the best parsimony trees in $INFILE
grep "Parsimony tree" "${PARSIMONATORINFO}" | awk -v "score=${BESTPARSIMONYSCORE}" '$6==score {print $14}' > "${INTREESOURCE}"
NUMBESTPARSIMONYTREES=`cat "${INTREESOURCE}" | wc -l | awk '{print $1}'`
echo "Most parsimonious (shortest, best) trees generated have length: ${BESTPARSIMONYSCORE}"
echo "Number of most parsimonious trees from SNPs_all: ${NUMBESTPARSIMONYTREES}"
# Combine all of the best generated trees into one file named $INTREE
cat "${INTREESOURCE}" | while read FILENAME ; do cat "${FILENAME}" ; done > "${INTREE}"
# Ensure no errors from 'consense' tool due to pre-existing output files.
for FILE in "${OUTTREE}" "${OUTFILE}"
do
[ -e "${FILE}" ] && rm "${FILE}"
done
# Get majority consensus tree
#PHYLIP consense was the only tool i found that forced resolution of every branch. FastTree to give it branch
# lengths will crash if some notes have splits to >2 children. But you need to modify seq.h and phylip.h before
# compiling consense to allow longer names so they don't get truncated
# This command isn't polite; it requires user confirmation in the default case and outputs config details to the
# screen that our users don't care about. - JN
echo "Y\n" | "${CONSENSE}" 2>&1 > "${CONSENSEOUTPUT}"
# Give it branch lengths, optimized for the consensus parsimony tree.
# Input to force_binary_tree is one tree per file.
"${PERFER}" "${FORCEBINARYTREE}" "${OUTTREE}" "${RESOLVEDTREE}" # Resolved tree is the output.
"${PERFER}" "${FASTTREEMP}" -nt -pseudo -nome -mllen -gamma -gtr -intree "${RESOLVEDTREE}" "${SNPSMATRIXFASTA}" > "${TREEOUTPUT}"
;;
*)
echo "ERROR: Don't know how to make a tree of type: ${TREETYPE}."
exit 1
;;
esac
echo "Finding nodes"
"${PERFER}" "${LABELTREENODES}" "${TREEOUTPUT}" > "${NODELABELFILE}"
"${PERFER}" "${TREENODES}" "${NODELABELFILE}" "${NODEFILE}"
if [ -s "${NODELABELFILE}" ] # If the node label file was generated successfully... (no sense proceeding otherwise)
then
echo "Placing SNPs on nodes ${SNPSFILE} tree"
"${PERFER}" "${SNPSTONODES}" "${SNPSFILE}" "${NODEPERLHASH}" "${NODELABELFILE}" "${NODESNPCOUNTS}"
# Rename these output files with a name reflecting the source file used to generate the data.
for FILE in COUNT_Homoplastic_SNPs ClusterInfo Homoplasy_groups
do
if [ -e "${FILE}" ]
then
mv "${FILE}" "${FILE}.${RUNID}"
fi
done
echo "Finished placing SNPs on nodes ${SNPSFILE} ${TREETYPE} tree"
echo -e "name_on_tree\tSNP_counts" > "${TIPSNPCOUNTS}"
grep "node: " "${NODESNPCOUNTS}" | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> "${TIPSNPCOUNTS}"
if [ -s "${NODELABELFILEREROOTED}" ] # If we have a rerooted output, replace the non-rerooted file with it.
then
rm -f "${NODELABELFILE}"
mv -f "${NODELABELFILEREROOTED}" "${NODELABELFILE}"
fi
#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # don't overwrite tree.$t.tre anymore since we want the support values in original file.
"${PERFER}" "${LABELTREEALLELECOUNT}" "${NODELABELFILE}" "${NODESNPCOUNTS}" "${TIPALLELECOUNTSTREE}" "${ALLELECOUNTSTREE}" 0
"${PERFER}" "${LABELTREEALLELECOUNT}" "${NODELABELFILE}" "${NODESNPCOUNTS}" "${TIPALLELECOUNTSNODETREE}" "${ALLELECOUNTSNODETREE}" 1
else
echo "ERROR: Node label file ${NODELABELFILE} not found! Skipped tree labeling."
fi
# Files generated that are no longer needed:
# nodes.*
# tree_typAlleleCounts.*.NodeLabel.tre
# tree_nodeLabel.*