-
Notifications
You must be signed in to change notification settings - Fork 0
/
SNPs2nodes.py
executable file
·563 lines (401 loc) · 20.8 KB
/
SNPs2nodes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
#!/usr/bin/python3
# Input:
# List of SNPs
# Tree information, including clades (either implicitly or explicitly)
# Tree file (newick format)
#
# Process the above information to identify the most likely root node
# for the phylogenic tree, and information about SNPs that occur between
# multiple phylogenic branches.
#
# Output:
# Homoplasy groups
# ClusterInfo
# "NodeSigCounts" (file referenced on the commandline as last argument)
#
# Import standard Python libraries for argument parsing, interacting with the
# filesystem, and environment and time / date processing.
import argparse as argparse
# Permit use of stdout
import sys
# Help with debug data:
import logging as logging
# Phylogenic tree operations
import Bio.Phylo as Phylo
# Regular expressions for input parsing ... Not a huge deal as long
# as our files aren't insane.
HomoplasticSNPsCountFile = 'COUNT_Homoplastic_SNPs'
HomoplasyGroupsFile = 'Homoplasy_groups'
ClusterInfoFile = 'ClusterInfo'
'''
Template for buffered reading of files:
# Setup for buffered reading of input file
bufferSize = 32 * 1024 # Make this a global, or commandline argument?
inputFile = open(cladesFile, "r")
nextLines = inputFile.readlines(bufferSize)
# Loop setup.
while nextLines != []:
# This line should be all alone ; no other lines indented the same
for line in nextLines:
# This line should be all alone ; no other lines indented the same
# Inner per-line loop.
# This line should be all alone ; no other lines indented the same
nextLines = inputFile.readlines(bufferSize)
# This line should be all alone ; no other lines indented the same
inputFile.close()
'''
def parseCommandline(override=None):
description = '''Generate phylogenetic trees based on SNP data and generated trees. Determine most likely root
of the tree, and output informaiton about that tree and any identified homoplasy groups.'''
example = ''''''
parser = argparse.ArgumentParser(description= description, epilog=example,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('SNPsFile', metavar='SNPsFile', nargs=1,
help='The filename where discovered SNPs can be found.')
parser.add_argument('Tree', metavar='Tree', nargs=1,
help='File containing the computed tree in Newick format. The newly computed tree will use the same filename, with a .rerooted suffix.')
parser.add_argument('TreeOutput', metavar='TreeOutput', nargs=1,
help='The filename for writing the rerooted tree.')
parser.add_argument('OutputSigCounts', metavar='SigCounts', nargs=1,
help='Output filename for information about the tree nodes.')
parser.add_argument('--debug', action='store_true', help='Output diagnostic data to the screen using STDERR')
parser.add_argument('--info', action='store_true', help='Output status information on STDERR', default=True)
parser.add_argument('--quiet', action='store_true', help='Silence debug and other messages. (warnings only)')
if override is None:
return parser.parse_args()
else:
return parser.parse_args(override)
def labelTree(tree):
# Add labels (names) to any tree nodes that are as yet unlabeled. This helps us consistently
# reference nodes even after re-rooting.
counter = 1 # starting label
for node in tree.find_clades():
if node.name is None:
node.name = str(counter)
counter = counter + 1
return tree
def generateClades(treeFile):
# From a given tree, generate all possible clades
# This is an alternative to inputClades()
# This is probably(?) faster than inputClades(), as it doesn't require I/O.
# Load the tree from the source file.
tree = Phylo.read(treeFile, "newick")
logging.debug("Loaded tree from %s", treeFile)
# Generate genomeBits for return.
genomesList = [ node.name for node in tree.find_elements() if node.name is not None ]
genomeBits = {}
maxGenomeBit = 1
for genome in genomesList:
genomeBits[genome] = maxGenomeBit
maxGenomeBit = maxGenomeBit * 2
# Label the tree so we have some sort of (unique, unless a genome has a low integer
# as a name) identifier for each node.
labeledTree = labelTree(tree)
# Generate a list of clades
clades = {}
IDsInNode = {}
allClades = [ node.name for node in labeledTree.find_clades() ]
for root in allClades:
clades[root] = {}
labeledTree.root_with_outgroup(root)
# logging.debug("Tree with root %s:", root)
# logging.debug("%s", str(labeledTree))
for subClade in allClades: # This lets us do root:root, which is the same as _all_... so there's that.
# Add up the genomeBits for all sub-clades that are actually genomes (terminal / leaf nodes)
cladeGenome = sum([ genomeBits.get(clade.name, 0) for clade in labeledTree.find_any(subClade).get_terminals() ])
clades[root][subClade] = cladeGenome
IDsInNode.setdefault(cladeGenome, {})[root] = subClade
# Restore the tree to some semblance of the original state.
labeledTree.root_with_outgroup(labeledTree.find_clades('1')) # There _should_ be a '1'?
logging.debug("Length(clades): %s", len(clades.keys()))
logging.debug("Length(IDsInNode): %s", len(IDsInNode.keys()))
logging.debug("Done reading nodes list")
return clades, IDsInNode, genomeBits, labeledTree
def groupNodes(SNPsFile, clades, idsInNodes, genomeBits):
logging.debug("Starting to group nodes")
### Output variables:
core = {}
group = {}
gl = {}
rootScore = {}
# An overall count (probably equal to greatest SNP loci ID)
lociAddedCount = 0
locusToLocusID = {}
### Symbols
# Loop setup.
tab = '\t'
# Column identifiers
lociID = 0 # The numeric ID used for this loci.
center = 2
loci = 1
position = 3 # Maybe we don't need this?
genome = 4
# This should set every genome bit. If python didn't have unlimited
# width integers, this would probably overflow quickly.
allGenomes = sum(genomeBits.values())
### Internal state
# Ensure these variables have scope to maintain state by declaring
# them outside of the loop.
currentLocus = {}
lastSnpLocus = None
allVariants = {}
# A queue of SNPs pending processing.
snpQueue = {}
# Setup for buffered reading of input file
bufferSize = 32 * 1024 # Make this a global, or commandline argument?
inputFile = open(SNPsFile, "r")
nextLines = inputFile.readlines(bufferSize)
while nextLines != []:
for SNP in nextLines:
# logging.debug("SNP: %s",SNP)
# Inner per-line loop.
snpData = SNP.strip().split(tab)
if len(snpData) == 1:
# Empty line
continue
snpCenter = snpData[center]
snpLocus = snpData[loci]
snpLocusID = snpData[lociID]
snpPosition = snpData[position]
snpGenome = snpData[genome]
thisBit = genomeBits[snpGenome]
# Add the current SNP to our queue of SNPs to handle.
snpQueue.setdefault(snpLocus,[]).append([snpCenter, snpLocusID, snpPosition, snpGenome, thisBit])
nextLines = inputFile.readlines(bufferSize)
# See if we have reached the end of the file. If so, there is no "current" SNP, so
# we will finish out the queue. We do this by setting snpLocus to a value not in the queue
# so that we match all of the queue items.
if nextLines == []:
snpLocus = ''
# Process all of the SNPs we've fully read so far.
# logging.debug("This queue is: %s", snpQueue)
# We will save the in-progress SNP into the next queue.
newQueue = {}
while snpQueue:
snp, snpData = snpQueue.popitem()
if snp == snpLocus:
newQueue[snp] = snpData
continue # skip over the "current" SNP; it isn't done yet (probably).
# Bookkeeping / counter (maybe grab this from some other data?
lociAddedCount = lociAddedCount + 1
if ( ( lociAddedCount % 10000 ) == 0 ):
logging.debug("Processed %s SNPs, currently on %s", lociAddedCount, snp)
##### Reset for the current (new) SNP.
currentLocus = {}
# We haven't seen any SNPs yet, so the center-to-genome map is empty.
allVariants = {
'A': 0,
'C': 0,
'G': 0,
'T': 0,
'-': 0 # Not sure we keep this one...?
}
# For each genome found for this SNP...
# logging.debug("This snpData is: %s", snpData)
for entry in snpData:
##### Handle the current (new) locus
# currentLocus is a dictionary that for each genome maps a position to a central mer.
currentLocus.setdefault(entry[4], {})[entry[2]] = entry[0]
if entry[0] == '':
entry[0] = '-'
# allVariants is a dictionary that maps a central mer to all genomes sharing that mer.
allVariants[entry[0]] = allVariants.get(entry[0],0) | entry[4]
# If this SNP appears in all genomes, it is a core genome; otherwise
# it is False (not)
representedGenomes = 0
matchingRoots = {}
for variantGenomes in allVariants.values():
if variantGenomes == 0:
continue
representedGenomes = representedGenomes | variantGenomes
group.setdefault(variantGenomes,{})[snp] = True
gl.setdefault(snp,{})[variantGenomes] = True
# Identify roots that have at least one clade that
# matches at least one variant of this locus.
for root in idsInNodes.get(variantGenomes,{}).keys():
matchingRoots[root] = True
core[snp] = representedGenomes == allGenomes
# This is just storing a counting integer; it feels wrong.
# We pull the locus ID out of the first entry since we know we got at least one entry.
locusToLocusID[snp] = snpData[0][1]
for root in matchingRoots.keys():
rootScore[root] = rootScore.get(root,0) + 1
if newQueue:
snpQueue = newQueue
if snpQueue:
logging.debug("Odd; our snpQueue should be empty if we got here... %s", snpQueue)
inputFile.close()
logging.debug("Done grouping nodes")
logging.debug("Core SNPs: %s", len([ value for value in core.values() if value ]))
logging.debug("Groups: %s", len(group.keys()))
logging.debug("Loci pointing to groups: %s", len(gl.keys()))
logging.debug("Roots: %s", len(rootScore.keys()))
logging.debug("Loci: %s", lociAddedCount)
# Returns core, group, gl, rootScore (was: num_map_to_node) and SNPsCount (# of SNPs)
return core, group, gl, rootScore, lociAddedCount, locusToLocusID
def findBestRoot(rootScore):
logging.debug("Starting to find best root")
# This tells python to:
# Break the dict into a list of key, value (root, score) tuples,
# Examine the score (the second ([1]) item of each tuple), and, for the
# largest value, return the entire tuple.
bestScore = max(rootScore.items(), key=lambda item: item[1])
# Then we extract the root (first ([0]) item in the tuple, and
bestRoot = bestScore[0]
# The score (second ([1]) item in the tuple)
bestRootScore = bestScore[1]
logging.debug("Best root: %s", bestRoot)
logging.debug("Best score: %s", bestRootScore)
logging.debug("Done finding best root")
# Returns the best root, and that root's score (or the maximum number
# of node loci associated with a given root)
return bestRoot, bestRootScore
def writeRerootedTree(tree, TreeFileOut, root):
logging.debug("Starting to write rerooted tree")
# tree = Phylo.read(TreeFile, "newick")
# Reroot the tree; this is a part I'm not 100% clear on.
newroot = tree.find_clades(root)
# This is under question.
tree.root_with_outgroup(newroot)
logging.debug("Tree: %s", str(tree))
Phylo.write(tree, TreeFileOut, "newick")
logging.debug("Done writing rerooted tree")
def writeHomoplasticSNPCounts(HomoplasticSNPsCountFile, SNPsCount, maxRootScore):
logging.debug("Starting to write homoplastic SNP counts")
outputFile = open(HomoplasticSNPsCountFile, "w")
outputFile.write("Number_Homoplastic_SNPs: %s\n" % (SNPsCount - maxRootScore))
logging.debug("Number homoplastic SNPs: %s", SNPsCount - maxRootScore)
outputFile.close()
logging.debug("Done writing homoplastic SNP counts")
def genomeCount(genomesId):
# Count the number of bits that are set (one bit represents one genome)
return bin(genomesId).count("1")
def getSortedGenomeBits(genomeBits):
# Retrn a list of genome, bit value pairs, sorted by genome name.
# This is a one-time operation that can permit resolving a genome bit string
# in sorted-genome order in the same amount of time as the msb to lsb or
# lsb to msb order.
return sorted(genomeBits.items(),key = lambda x: x[0])
def getGenomes(genomesId, sortedGenomeBits):
# Return a list of every genome for which the bit is set on in genomesId.
return [genome for genome, bit in sortedGenomeBits if bit & genomesId]
def writeNodeSigCounts(SigCountsFile, clades, root, group, sortedGenomeBits):
logging.debug("Starting to write node sig counts")
outputFile = open(SigCountsFile, "w")
clusters = {}
# This is where we make sure the nodes come out in the correct
# order. This is a string sort, not a numeric sort, so ...? - JN
nodesToWrite = sorted(clades[root].keys())
for node in nodesToWrite:
# logging.debug("writeNodeSigCounts: root: %s => node: %s", root, node)
nodeGenomes = clades[root][node]
numNodeGenomes = genomeCount(nodeGenomes)
if numNodeGenomes == 1:
clusters[nodeGenomes] = "Leaf.Node." + node
else:
clusters[nodeGenomes] = "Internal.Node." + node
# If this set of genomes has no SNPs that map to it, the count is zero.
SNPsCount = len(group.get(nodeGenomes,{}).keys())
outputFile.write("node: %s\tNumberTargets: %s\tNumberSNPs: %s\n" % (node, numNodeGenomes, SNPsCount))
# Write all genome names under this node.
for genome in getGenomes(nodeGenomes, sortedGenomeBits):
outputFile.write(genome + "\n")
outputFile.write("\n")
outputFile.close()
logging.debug("Done writing node sig counts")
return clusters
def writeHomoplasyGroups(HomoplasyGroupsFile, group, IDsInNode, root, clusters, nodeCount, sortedGenomeBits):
logging.debug("Starting to figure out homoplasy groups")
# Start the homoplasy group IDs at the number of nodes so they don't overlap.
homoplasyGroupId = nodeCount
hpIds = {}
hpCount = {}
hpGroup = {}
genomeGroupsByNumberSNPs = [ ( genomes, len(SNPs.keys()) ) for genomes, SNPs in group.items() ]
# Sort the genome groups by number of matching SNPs
genomeGroupsByNumberSNPs.sort(reverse=True, key=lambda genomeInfo: genomeInfo[1])
logging.debug("Starting to write homoplastic SNP counts")
outputFile = open(HomoplasyGroupsFile, "w")
# We needed to sort this.
for groupGenomes, matchingSNPs in genomeGroupsByNumberSNPs:
if IDsInNode.get(groupGenomes,{}).get(root) is None:
# Then this group of genomes is homoplastic (doesn't appear in an existing clade)
if matchingSNPs == 0:
# I don't think we can have an entry in group that doesn't have any SNPs associated.
logging.debug("How did we get here? - JN")
exit(1)
homoplasyGroupId = homoplasyGroupId + 1
idString = "Group." + str(homoplasyGroupId)
clusters[groupGenomes] = idString
outputFile.write("Group: %s\tNumberTargets: %s\tNumberSNPs: %s\n" % (idString, genomeCount(groupGenomes), matchingSNPs ))
for genome in getGenomes(groupGenomes, sortedGenomeBits):
outputFile.write(genome + "\n")
outputFile.write("\n")
if ( ( homoplasyGroupId % 10000 ) == 0 ):
logging.debug("Procesed homoplasy group ID %s with %s SNPs", homoplasyGroupId, matchingSNPs)
outputFile.close()
logging.debug("Done writing homoplasy groups")
return clusters
def writeClusterInfo(ClusterInfoFile, gl, clusters, core, locusToLocusID):
logging.debug("Starting to write cluster info")
outputFile = open(ClusterInfoFile, "w")
outputFile.write("LocusNumber\tContextSeq\tCore\tClusters\n")
# Fun fact; Since these keys were inserted in sorted order, this should output them in sorted order.
# Funner fact: That's BS.
for sequence, locusID in sorted(locusToLocusID.items(), key = lambda x: int(x[1])):
# Below is broken; wrong order.
# for sequence in gl.keys():
printableClusters = []
for genomeGroup in gl[sequence]:
printableClusters.append(clusters[genomeGroup])
outputFile.write("%s\t%s\t%s\t%s\n" % (locusID, sequence, 1 if core[sequence] else 0, " ".join(printableClusters)))
outputFile.close()
logging.debug("Done writing cluster info")
if __name__ == "__main__":
options = parseCommandline()
if options.debug:
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', datefmt='%Y-%m-%dT%H:%M:%S', level=logging.DEBUG)
elif options.info:
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', datefmt='%Y-%m-%dT%H:%M:%S', level=logging.INFO)
else:
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARN)
logging.debug('Commandline options, as parsed: %s', str(options))
# Input
SNPsFile = options.SNPsFile[0]
# CladeStructuresFile = options.NodeCladeStructures[0]
TreeFile = options.Tree[0]
TreeFileOut = options.TreeOutput[0]
# Output
SigCountsFile = options.OutputSigCounts[0]
# Processing
logging.info("SNPs2nodes starting.")
### Data structures in use:
# clades: A dictionary that maps a root + a node on the tree with that root to the list of genomes under that node.
# IDsInNodes:
# A dictionary that maps a list of genomes + a root node to the node that is the node farthest from the root that
# is between those nodes (and no others) and the root. It is the inverse of the above dictionary in the sense that
# it permits mapping from lsit of genomes to a node, given a root.
# genomeBits: A mapping from a genome name to a bit field value; Permits translating from a bit field to a list of
# genomes, and back.
# clades, IDsInNodes, genomeBits = inputClades(CladeStructuresFile)
# logging.info("Loaded nodes for tree of %s genomes", len(genomeBits))
clades, IDsInNodes, genomeBits, labeledTree = generateClades(TreeFile) # Should be the same results as above inputClades, but maybe faster?
logging.info("Loaded nodes for tree of %s genomes from %s", len(genomeBits), TreeFile)
core, group, gl, rootScore, SNPsCount, locusToLocusID = groupNodes(SNPsFile, clades, IDsInNodes, genomeBits)
logging.info("Loaded %s SNPs, %s possible roots scored.", SNPsCount, len(rootScore))
root, maxScore = findBestRoot(rootScore)
logging.info("Found the best root (%s) with score %s.", root, maxScore)
writeHomoplasticSNPCounts(HomoplasticSNPsCountFile, SNPsCount, maxScore)
logging.info("Finished writing %s", HomoplasticSNPsCountFile)
sortedGenomeBits = getSortedGenomeBits(genomeBits)
logging.debug("Finished sorting genomes for ordered display.")
clusters = writeNodeSigCounts(SigCountsFile, clades, root, group, sortedGenomeBits)
logging.info("Finished writing %s", SigCountsFile)
nodeCount = len(clades[root]) # The number of distinct nodes within the tree.
clusters = writeHomoplasyGroups(HomoplasyGroupsFile, group, IDsInNodes, root, clusters, nodeCount, sortedGenomeBits)
logging.info("Finished writing %s", HomoplasyGroupsFile)
writeClusterInfo(ClusterInfoFile, gl, clusters, core, locusToLocusID)
logging.info("Finished writing %s", ClusterInfoFile)
writeRerootedTree(labeledTree, TreeFileOut, root)
logging.info("Finished writing %s", TreeFileOut)
logging.info("SNPs2nodes finished.")