SNPs2nodes.py

#!/usr/bin/python3

# Input:
# List of SNPs
# Tree information, including clades (either implicitly or explicitly)
# Tree file (newick format)
#
# Process the above information to identify the most likely root node
# for the phylogenic tree, and information about SNPs that occur between
# multiple phylogenic branches.
# 
# Output:
# Homoplasy groups
# ClusterInfo
# "NodeSigCounts" (file referenced on the commandline as last argument)
#

# Import standard Python libraries for argument parsing, interacting with the
# filesystem, and environment and time / date processing.
import argparse as argparse

# Permit use of stdout
import sys

# Help with debug data:
import logging as logging

# Phylogenic tree operations
import Bio.Phylo as Phylo

# Regular expressions for input parsing ... Not a huge deal as long
# as our files aren't insane.

HomoplasticSNPsCountFile = 'COUNT_Homoplastic_SNPs'
HomoplasyGroupsFile = 'Homoplasy_groups'
ClusterInfoFile = 'ClusterInfo'

'''
Template for buffered reading of files:
    # Setup for buffered reading of input file
    bufferSize = 32 * 1024 # Make this a global, or commandline argument?
    inputFile = open(cladesFile, "r")
    nextLines = inputFile.readlines(bufferSize)

    # Loop setup.
    
    while nextLines != []:
        # This line should be all alone ; no other lines indented the same
        for line in nextLines:
        # This line should be all alone ; no other lines indented the same

            # Inner per-line loop.

        # This line should be all alone ; no other lines indented the same
        nextLines = inputFile.readlines(bufferSize)
        # This line should be all alone ; no other lines indented the same

    inputFile.close()

'''


def parseCommandline(override=None):

    description = '''Generate phylogenetic trees based on SNP data and generated trees.  Determine most likely root
of the tree, and output informaiton about that tree and any identified  homoplasy groups.'''
    
    example = ''''''
    
    parser = argparse.ArgumentParser(description= description, epilog=example,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('SNPsFile', metavar='SNPsFile', nargs=1,
                        help='The filename where discovered SNPs can be found.')
    
    parser.add_argument('Tree', metavar='Tree', nargs=1,
                        help='File containing the computed tree in Newick format.  The newly computed tree will use the same filename, with a .rerooted suffix.')

    parser.add_argument('TreeOutput', metavar='TreeOutput', nargs=1,
                        help='The filename for writing the rerooted tree.')
    
    parser.add_argument('OutputSigCounts', metavar='SigCounts', nargs=1,
                        help='Output filename for information about the tree nodes.')
    
    parser.add_argument('--debug', action='store_true', help='Output diagnostic data to the screen using STDERR')
    parser.add_argument('--info', action='store_true', help='Output status information on STDERR', default=True)
    parser.add_argument('--quiet', action='store_true', help='Silence debug and other messages. (warnings only)')

    if override is None:
        return parser.parse_args()
    else:
        return parser.parse_args(override)


def labelTree(tree):
    # Add labels (names) to any tree nodes that are as yet unlabeled.  This helps us consistently
    # reference nodes even after re-rooting.
    counter = 1 # starting label
    
    for node in tree.find_clades():
        if node.name is None:
            node.name = str(counter)
            counter = counter + 1

    return tree


def generateClades(treeFile):
    # From a given tree, generate all possible clades
    # This is an alternative to inputClades()
    # This is probably(?) faster than inputClades(), as it doesn't require I/O.

    
    # Load the tree from the source file.
    tree = Phylo.read(treeFile, "newick")
    logging.debug("Loaded tree from %s", treeFile)

    
    # Generate genomeBits for return.
    genomesList = [ node.name for node in tree.find_elements() if node.name is not None ]

    genomeBits = {}
    maxGenomeBit = 1    
    for genome in genomesList:
        genomeBits[genome] = maxGenomeBit
        maxGenomeBit = maxGenomeBit * 2


    # Label the tree so we have some sort of (unique, unless a genome has a low integer
    # as a name) identifier for each node.
    labeledTree = labelTree(tree)


    # Generate a list of clades
    clades = {}
    IDsInNode = {}

    allClades = [ node.name for node in labeledTree.find_clades() ]
    for root in allClades:
        clades[root] = {}
        labeledTree.root_with_outgroup(root)
        # logging.debug("Tree with root %s:", root)
        # logging.debug("%s", str(labeledTree))
        for subClade in allClades:  # This lets us do root:root, which is the same as _all_... so there's that.

            # Add up the genomeBits for all sub-clades that are actually genomes (terminal / leaf nodes)
            cladeGenome = sum([ genomeBits.get(clade.name, 0) for clade in labeledTree.find_any(subClade).get_terminals() ])

            clades[root][subClade] = cladeGenome
            IDsInNode.setdefault(cladeGenome, {})[root] = subClade
    
    # Restore the tree to some semblance of the original state.
    labeledTree.root_with_outgroup(labeledTree.find_clades('1')) # There _should_ be a '1'?

    
    logging.debug("Length(clades): %s", len(clades.keys()))
    
    logging.debug("Length(IDsInNode): %s", len(IDsInNode.keys()))
    
    logging.debug("Done reading nodes list")
    return clades, IDsInNode, genomeBits, labeledTree
    
    
def groupNodes(SNPsFile, clades, idsInNodes, genomeBits):
    logging.debug("Starting to group nodes")

    ### Output variables:
    core = {}
    group = {}
    gl = {}
    rootScore = {}
    # An overall count (probably equal to greatest SNP loci ID)
    lociAddedCount = 0
    locusToLocusID = {}

    ### Symbols
    
    # Loop setup.
    tab = '\t'

    # Column identifiers
    lociID = 0 # The numeric ID used for this loci.
    center = 2
    loci = 1
    position = 3 # Maybe we don't need this?
    genome = 4

    # This should set every genome bit.  If python didn't have unlimited
    # width integers, this would probably overflow quickly.
    allGenomes = sum(genomeBits.values())


    ### Internal state
    
    # Ensure these variables have scope to maintain state by declaring
    # them outside of the loop.
    currentLocus = {}
    lastSnpLocus = None
    allVariants = {}
    # A queue of SNPs pending processing.
    snpQueue = {}
 
    # Setup for buffered reading of input file
    bufferSize = 32 * 1024 # Make this a global, or commandline argument?
    inputFile = open(SNPsFile, "r")
    nextLines = inputFile.readlines(bufferSize)

    while nextLines != []:
        for SNP in nextLines:

            # logging.debug("SNP: %s",SNP)
            # Inner per-line loop.
            snpData = SNP.strip().split(tab)
            if len(snpData) == 1:
                # Empty line
                continue
            snpCenter = snpData[center]
            snpLocus = snpData[loci]
            snpLocusID = snpData[lociID]
            snpPosition = snpData[position]
            snpGenome = snpData[genome]
            thisBit = genomeBits[snpGenome]

            # Add the current SNP to our queue of SNPs to handle.
            snpQueue.setdefault(snpLocus,[]).append([snpCenter, snpLocusID, snpPosition, snpGenome, thisBit])
            
        nextLines = inputFile.readlines(bufferSize)


        # See if we have reached the end of the file.  If so, there is no "current" SNP, so
        # we will finish out the queue.  We do this by setting snpLocus to a value not in the queue
        # so that we match all of the queue items.
        if nextLines == []:
            snpLocus = ''

        # Process all of the SNPs we've fully read so far.
        # logging.debug("This queue is: %s", snpQueue)

        # We will save the in-progress SNP into the next queue.
        newQueue = {}
        while snpQueue:
            snp, snpData = snpQueue.popitem()
            if snp == snpLocus:
                newQueue[snp] = snpData
                continue # skip over the "current" SNP; it isn't done yet (probably).

            # Bookkeeping / counter (maybe grab this from some other data?
            lociAddedCount = lociAddedCount + 1
            if ( ( lociAddedCount % 10000 ) == 0 ):
                logging.debug("Processed %s SNPs, currently on %s", lociAddedCount, snp)


            ##### Reset for the current (new) SNP.
            currentLocus = {}
            
            # We haven't seen any SNPs yet, so the center-to-genome map is empty.
            allVariants = {
                'A': 0,
                'C': 0,
                'G': 0,
                'T': 0,
                '-': 0 # Not sure we keep this one...?
            }

            # For each genome found for this SNP...
            # logging.debug("This snpData is: %s", snpData)
            for entry in snpData:

                ##### Handle the current (new) locus
                # currentLocus is a dictionary that for each genome maps a position to a central mer.
                currentLocus.setdefault(entry[4], {})[entry[2]] = entry[0]
                if entry[0] == '':
                    entry[0] = '-'

                # allVariants is a dictionary that maps a central mer to all genomes sharing that mer.
                allVariants[entry[0]] = allVariants.get(entry[0],0) | entry[4]

            # If this SNP appears in all genomes, it is a core genome; otherwise
            # it is False (not) 
            representedGenomes = 0
            matchingRoots = {}
            for variantGenomes in allVariants.values():
                if variantGenomes == 0:
                    continue

                representedGenomes = representedGenomes | variantGenomes

                group.setdefault(variantGenomes,{})[snp] = True
                gl.setdefault(snp,{})[variantGenomes] = True

                # Identify roots that have at least one clade that
                # matches at least one variant of this locus.
                for root in idsInNodes.get(variantGenomes,{}).keys():
                    matchingRoots[root] = True

                
            core[snp] = representedGenomes == allGenomes

            # This is just storing a counting integer; it feels wrong.
            # We pull the locus ID out of the first entry since we know we got at least one entry.
            locusToLocusID[snp] = snpData[0][1]

            for root in matchingRoots.keys():
                rootScore[root] = rootScore.get(root,0) + 1

        if newQueue:
            snpQueue = newQueue


    if snpQueue:
        logging.debug("Odd; our snpQueue should be empty if we got here... %s", snpQueue)
    inputFile.close()

    logging.debug("Done grouping nodes")

    logging.debug("Core SNPs: %s", len([ value for value in core.values() if value ]))

    logging.debug("Groups: %s", len(group.keys()))

    logging.debug("Loci pointing to groups: %s", len(gl.keys()))

    logging.debug("Roots: %s", len(rootScore.keys()))

    logging.debug("Loci: %s", lociAddedCount)
                  
    
    # Returns core, group, gl, rootScore (was: num_map_to_node) and SNPsCount (# of SNPs)
    return core, group, gl, rootScore, lociAddedCount, locusToLocusID

def findBestRoot(rootScore):
    logging.debug("Starting to find best root")

    # This tells python to:
    #   Break the dict into a list of key, value (root, score) tuples,
    #   Examine the score (the second ([1]) item of each tuple), and, for the
    #     largest value, return the entire tuple.
    bestScore = max(rootScore.items(), key=lambda item: item[1])

    # Then we extract the root (first ([0]) item in the tuple, and
    bestRoot = bestScore[0]
    # The score (second ([1]) item in the tuple)
    bestRootScore = bestScore[1]

    logging.debug("Best root: %s", bestRoot)
    logging.debug("Best score: %s", bestRootScore)
    logging.debug("Done finding best root")
    # Returns the best root, and that root's score (or the maximum number
    # of node loci associated with a given root)

    return bestRoot, bestRootScore

def writeRerootedTree(tree, TreeFileOut, root):
    logging.debug("Starting to write rerooted tree")
    
    # tree = Phylo.read(TreeFile, "newick")

    # Reroot the tree; this is a part I'm not 100% clear on.
    newroot = tree.find_clades(root)
    # This is under question.
    tree.root_with_outgroup(newroot)

    logging.debug("Tree: %s", str(tree))

    Phylo.write(tree, TreeFileOut, "newick")
    
    logging.debug("Done writing rerooted tree")
    
def writeHomoplasticSNPCounts(HomoplasticSNPsCountFile, SNPsCount, maxRootScore):
    logging.debug("Starting to write homoplastic SNP counts")
    outputFile = open(HomoplasticSNPsCountFile, "w")
    outputFile.write("Number_Homoplastic_SNPs: %s\n" % (SNPsCount - maxRootScore))
    logging.debug("Number homoplastic SNPs: %s", SNPsCount - maxRootScore)
    outputFile.close()
    logging.debug("Done writing homoplastic SNP counts")


def genomeCount(genomesId):
    # Count the number of bits that are set (one bit represents one genome)
    return bin(genomesId).count("1")

def getSortedGenomeBits(genomeBits):
    # Retrn a list of genome, bit value pairs, sorted by genome name.
    # This is a one-time operation that can permit resolving a genome bit string
    # in sorted-genome order in the same amount of time as the msb to lsb or
    # lsb to msb order.
    return sorted(genomeBits.items(),key = lambda x: x[0])

def getGenomes(genomesId, sortedGenomeBits):
    # Return a list of every genome for which the bit is set on in genomesId.
    return [genome for genome, bit in sortedGenomeBits if bit & genomesId]


def writeNodeSigCounts(SigCountsFile, clades, root, group, sortedGenomeBits):
    logging.debug("Starting to write node sig counts")
    outputFile = open(SigCountsFile, "w")
    clusters = {}

    # This is where we make sure the nodes come out in the correct
    # order.  This is a string sort, not a numeric sort, so ...? - JN
    nodesToWrite = sorted(clades[root].keys())
    
    for node in nodesToWrite:
        # logging.debug("writeNodeSigCounts: root: %s => node: %s", root, node)
        nodeGenomes = clades[root][node]
        numNodeGenomes = genomeCount(nodeGenomes)
        if numNodeGenomes == 1:
            clusters[nodeGenomes] = "Leaf.Node." + node
        else:
            clusters[nodeGenomes] = "Internal.Node." + node
        # If this set of genomes has no SNPs that map to it, the count is zero.
        SNPsCount = len(group.get(nodeGenomes,{}).keys())
        outputFile.write("node: %s\tNumberTargets: %s\tNumberSNPs: %s\n" % (node, numNodeGenomes, SNPsCount))
        # Write all genome names under this node.
        for genome in getGenomes(nodeGenomes, sortedGenomeBits):
            outputFile.write(genome + "\n")
        outputFile.write("\n")
    
    outputFile.close()
    logging.debug("Done writing node sig counts")
    return clusters
    
def writeHomoplasyGroups(HomoplasyGroupsFile, group, IDsInNode, root, clusters, nodeCount, sortedGenomeBits):
    logging.debug("Starting to figure out homoplasy groups")
    # Start the homoplasy group IDs at the number of nodes so they don't overlap.
    homoplasyGroupId = nodeCount
    hpIds = {}
    hpCount = {}
    hpGroup = {}

    genomeGroupsByNumberSNPs = [ ( genomes, len(SNPs.keys()) ) for genomes, SNPs in group.items() ]

    # Sort the genome groups by number of matching SNPs
    genomeGroupsByNumberSNPs.sort(reverse=True, key=lambda genomeInfo: genomeInfo[1])

    logging.debug("Starting to write homoplastic SNP counts")
    outputFile = open(HomoplasyGroupsFile, "w")

    # We needed to sort this.
    for groupGenomes, matchingSNPs in genomeGroupsByNumberSNPs:
        if IDsInNode.get(groupGenomes,{}).get(root) is None:
            # Then this group of genomes is homoplastic (doesn't appear in an existing clade)
            
            if matchingSNPs == 0:
                # I don't think we can have an entry in group that doesn't have any SNPs associated.
                logging.debug("How did we get here? - JN")
                exit(1)
                
            homoplasyGroupId = homoplasyGroupId + 1
            idString = "Group." + str(homoplasyGroupId)

            clusters[groupGenomes] = idString
            outputFile.write("Group: %s\tNumberTargets: %s\tNumberSNPs: %s\n" % (idString, genomeCount(groupGenomes), matchingSNPs ))
            for genome in getGenomes(groupGenomes, sortedGenomeBits):
                outputFile.write(genome + "\n")
            outputFile.write("\n")             
            if ( ( homoplasyGroupId % 10000 ) == 0 ):
                logging.debug("Procesed homoplasy group ID %s with %s SNPs", homoplasyGroupId, matchingSNPs)

            
    outputFile.close()
    logging.debug("Done writing homoplasy groups")
    return clusters
    

def writeClusterInfo(ClusterInfoFile, gl, clusters, core, locusToLocusID):
    logging.debug("Starting to write cluster info")
    outputFile = open(ClusterInfoFile, "w")
    outputFile.write("LocusNumber\tContextSeq\tCore\tClusters\n")

    # Fun fact; Since these keys were inserted in sorted order, this should output them in sorted order.
    # Funner fact: That's BS.
    for sequence, locusID in sorted(locusToLocusID.items(), key = lambda x: int(x[1])):
        # Below is broken; wrong order.
        # for sequence in gl.keys():
        printableClusters = []
        for genomeGroup in gl[sequence]:
            printableClusters.append(clusters[genomeGroup])
        outputFile.write("%s\t%s\t%s\t%s\n" % (locusID, sequence, 1 if core[sequence] else 0, " ".join(printableClusters)))
    
    outputFile.close()
    logging.debug("Done writing cluster info")
    
                         
if __name__ == "__main__":

    options = parseCommandline()

    if options.debug:
        logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', datefmt='%Y-%m-%dT%H:%M:%S', level=logging.DEBUG)
    elif options.info:
        logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', datefmt='%Y-%m-%dT%H:%M:%S', level=logging.INFO)
    else:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARN)
            

    logging.debug('Commandline options, as parsed: %s', str(options))

    # Input
    SNPsFile = options.SNPsFile[0]
    # CladeStructuresFile = options.NodeCladeStructures[0]
    TreeFile = options.Tree[0]
    TreeFileOut = options.TreeOutput[0]

    # Output
    SigCountsFile = options.OutputSigCounts[0]

          
    # Processing
    logging.info("SNPs2nodes starting.")

    ### Data structures in use:
    # clades:  A dictionary that maps a root + a node on the tree with that root to the list of genomes under that node.
    # IDsInNodes:
    #    A dictionary that maps a list of genomes + a root node to the node that is the node farthest from the root that
    #    is between those nodes (and no others) and the root.  It is the inverse of the above dictionary in the sense that
    #    it permits mapping from lsit of genomes to a node, given a root.
    # genomeBits:  A mapping from a genome name to a bit field value;  Permits translating from a bit field to a list of
    #    genomes, and back.

    # clades, IDsInNodes, genomeBits = inputClades(CladeStructuresFile)
    # logging.info("Loaded nodes for tree of %s genomes",  len(genomeBits))

    clades, IDsInNodes, genomeBits, labeledTree = generateClades(TreeFile) # Should be the same results as above inputClades, but maybe faster?
    logging.info("Loaded nodes for tree of %s genomes from %s",  len(genomeBits), TreeFile)
   

    core, group, gl, rootScore, SNPsCount, locusToLocusID = groupNodes(SNPsFile, clades, IDsInNodes, genomeBits)
    logging.info("Loaded %s SNPs, %s possible roots scored.", SNPsCount, len(rootScore))
          
    root, maxScore = findBestRoot(rootScore)
    logging.info("Found the best root (%s) with score %s.", root, maxScore)
    
    writeHomoplasticSNPCounts(HomoplasticSNPsCountFile, SNPsCount, maxScore)
    logging.info("Finished writing %s", HomoplasticSNPsCountFile)

    sortedGenomeBits = getSortedGenomeBits(genomeBits)
    logging.debug("Finished sorting genomes for ordered display.")

    clusters = writeNodeSigCounts(SigCountsFile, clades, root, group, sortedGenomeBits)
    logging.info("Finished writing %s", SigCountsFile)

    nodeCount = len(clades[root]) # The number of distinct nodes within the tree.

    clusters = writeHomoplasyGroups(HomoplasyGroupsFile, group, IDsInNodes, root, clusters, nodeCount, sortedGenomeBits)
    logging.info("Finished writing %s", HomoplasyGroupsFile)

    writeClusterInfo(ClusterInfoFile, gl, clusters, core, locusToLocusID)
    logging.info("Finished writing %s", ClusterInfoFile)

    writeRerootedTree(labeledTree, TreeFileOut, root)
    logging.info("Finished writing %s", TreeFileOut)

    logging.info("SNPs2nodes finished.")