From 09eb545f574b37fc8dff34bf17b18215c6451aef Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 16 Apr 2023 13:37:01 +0200 Subject: [PATCH 1/2] feat: add new stochastic way of resolving polytomies --- treetime/argument_parser.py | 2 + treetime/merger_models.py | 5 ++ treetime/treetime.py | 135 +++++++++++++++++++++++++++++++++--- treetime/wrappers.py | 4 ++ 4 files changed, 137 insertions(+), 9 deletions(-) diff --git a/treetime/argument_parser.py b/treetime/argument_parser.py index c3871684..f2012e95 100644 --- a/treetime/argument_parser.py +++ b/treetime/argument_parser.py @@ -180,6 +180,8 @@ def add_timetree_args(parser): "distribution in the final round.") parser.add_argument('--keep-polytomies', default=False, action='store_true', help="Don't resolve polytomies using temporal information.") + parser.add_argument('--stochastic-resolve', default=False, action='store_true', + help="Resolve polytomies using a random coalescent tree.") # parser.add_argument('--keep-node-order', default=False, action='store_true', # help="Don't ladderize the tree.") parser.add_argument('--relax',nargs=2, type=float, diff --git a/treetime/merger_models.py b/treetime/merger_models.py index f210648c..27247a67 100644 --- a/treetime/merger_models.py +++ b/treetime/merger_models.py @@ -388,3 +388,8 @@ def skyline_inferred(self, gen=1.0, confidence=False): return skyline, conf else: return skyline, None + + + + + diff --git a/treetime/treetime.py b/treetime/treetime.py index 1872c0bc..5883bbfa 100644 --- a/treetime/treetime.py +++ b/treetime/treetime.py @@ -77,12 +77,13 @@ def _run(self, root=None, infer_gtr=True, relaxed_clock=None, n_iqd = None, resolve_polytomies=True, max_iter=0, Tc=None, fixed_clock_rate=None, time_marginal='never', sequence_marginal=False, branch_length_mode='auto', vary_rate=False, use_covariation=False, tracelog_file=None, - method_anc = 'probabilistic', assign_gamma=None, **kwargs): + method_anc = 'probabilistic', assign_gamma=None, stochastic_resolve=False, + **kwargs): """ Run TreeTime reconstruction. Based on the input parameters, it divides the analysis into semi-independent jobs and conquers them one-by-one, - gradually optimizing the tree given the temporal constarints and leaf + gradually optimizing the tree given the temporal constraints and leaf node sequences. Parameters @@ -111,6 +112,9 @@ def _run(self, root=None, infer_gtr=True, relaxed_clock=None, n_iqd = None, resolve_polytomies : bool If True, attempt to resolve multiple mergers + stochastic_resolve : bool (default False) + Resolve multiple mergers via a random coalescent tree (True) or via greedy optimization + max_iter : int Maximum number of iterations to optimize the tree @@ -149,7 +153,7 @@ def _run(self, root=None, infer_gtr=True, relaxed_clock=None, n_iqd = None, use_covariation : bool, optional default False, if False, rate estimates will be performed using simple - regression ignoring phylogenetic covaration between nodes. If vary_rate is True, + regression ignoring phylogenetic covariation between nodes. If vary_rate is True, use_covariation is true by default method_anc: str, optional @@ -167,7 +171,7 @@ def _run(self, root=None, infer_gtr=True, relaxed_clock=None, n_iqd = None, Returns ------- - TreeTime error/succces code : str + TreeTime error/success code : str return value depending on success or error @@ -279,7 +283,7 @@ def _run(self, root=None, infer_gtr=True, relaxed_clock=None, n_iqd = None, n_resolved=0 if resolve_polytomies: # if polytomies are found, rerun the entire procedure - n_resolved = self.resolve_polytomies() + n_resolved = self.resolve_polytomies(stochastic_resolve=stochastic_resolve) if n_resolved: seq_kwargs['prune_short']=False self.prepare_tree() @@ -567,7 +571,7 @@ def reroot(self, root='least-squares', force_positive=True, covariation=None, cl return new_root - def resolve_polytomies(self, merge_compressed=False, resolution_threshold=0.05): + def resolve_polytomies(self, merge_compressed=False, resolution_threshold=0.05, stochastic_resolve=False): """ Resolve the polytomies on the tree. @@ -581,8 +585,13 @@ def resolve_polytomies(self, merge_compressed=False, resolution_threshold=0.05): Parameters ---------- merge_compressed : bool - If True, keep compressed branches as polytomies. If False, - return a strictly binary tree. + If True, keep compressed branches as polytomies. Applies to greedy resolve + resolution_threshold : float + minimal delta LH to consider for polytomy resolution. Otherwise, keep parent as polytomy + stochastic_resolve : bool + generate a stochastic binary coalescent tree with mutation from the children of + a polytomy. Doesn't necessarily resolve the node fully. This step is stochastic + and different runs will result in different outcomes. Returns -------- @@ -596,7 +605,11 @@ def resolve_polytomies(self, merge_compressed=False, resolution_threshold=0.05): for n in self.tree.find_clades(): if len(n.clades) > 2: prior_n_clades = len(n.clades) - self._poly(n, merge_compressed, resolution_threshold=resolution_threshold) + if stochastic_resolve: + self.generate_subtree(n) + else: + self._poly(n, merge_compressed, resolution_threshold=resolution_threshold) + poly_found+=prior_n_clades - len(n.clades) obsolete_nodes = [n for n in self.tree.find_clades() if len(n.clades)==1 and n.up is not None] @@ -760,6 +773,110 @@ def merge_nodes(source_arr, isall=False): return LH + def generate_subtree(self, parent): + from numpy.random import exponential as exp_dis + L = self.data.full_length + mutation_rate = self.gtr.mu*L + + tmax = parent.time_before_present + branches_by_time = sorted(parent.clades, key=lambda x:x.time_before_present) + # calculate the mutations on branches leading to nodes from the mutation length + # this excludes state chances to ambiguous states + mutations_per_branch = {b.name:round(b.mutation_length*L) for b in branches_by_time} + + branches_alive=branches_by_time[:1] + branches_to_come = branches_by_time[1:] + t = branches_alive[-1].time_before_present + if t>=tmax: + # no time left -- keep everything as individual children. + return + + # if there is no coalescent model, assume a rate that would typically coalesce all tips + # in the time window between the latest and the parent node. + dummy_coalescent_rate = 2.0/(tmax-t) + self.logger(f"TreeTime.generate_subtree: node {parent.name} has {len(branches_by_time)} children." + +f" {len([b for b,k in mutations_per_branch.items() if k>0])} have mutations." + +f" The time window for coalescence is {tmax-t:1.4e}",3) + + # loop until time collides with the parent node or all but two branches have been dealt with + # the remaining two would be the children of the parent + while len(branches_alive)+len(branches_to_come)>2 and t advance to next branch + if (total_mut_rate + total_coalescent_rate)==0 and len(branches_to_come): + branches_alive.append(branches_to_come.pop(0)) + t = branches_alive[-1].time_before_present + continue + + # determine the next time step + total_rate_inv = 1.0/(total_mut_rate + total_coalescent_rate) + dt = exp_dis(total_rate_inv) + t+=dt + # if the time advanced past the next branch in the branches_to_come list + # add this branch to branches alive and re-renter the loop + if len(branches_to_come) and t>branches_to_come[0].time_before_present: + while len(branches_to_come) and t>branches_to_come[0].time_before_present: + branches_alive.append(branches_to_come.pop(0)) + # else mutate or coalesce + else: + # determine whether to mutate or coalesce + p = np.random.random() + mut_or_coal = p Date: Sun, 16 Apr 2023 13:48:04 +0200 Subject: [PATCH 2/2] docs: add to change-log --- changelog.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index 0f0a04a2..e54bf507 100644 --- a/changelog.md +++ b/changelog.md @@ -1,7 +1,8 @@ -# 0.9.6: bug fixes +# 0.9.6: bug fixes and new mode of polytomy resolution * in cases when very large polytomies are resolved, the multiplication of the discretized message results in messages/distributions of length 1. This resulted in an error, since interpolation objects require at least two points. This is now caught and a small discrete grid created. * increase recursion limit to 10000 by default. The recursion limit can now also be set via the environment variable `TREETIME_RECURSION_LIMIT`. * removed unused imports, fixed typos + * add new way to resolve polytomies. the previous polytomy resolution greedily pulled out pairs of child-clades at a time and merged then into a single clade. This often results in atypical caterpillar like subtrees. This is undesirable since it (i) is very atypical, (ii) causes numerical issues due to repeated convolutions, and (iii) triggers recursion errors during newick export. The new optional way of resolving replaces a multi-furcation by a randomly generated coalescent tree that backwards in time mutates (all mutations are singletons and need to 'go' before coalescence), and merges lineages. Lineages that remain when time reaches the time of the parent remain as children of the parent. This new way of resolving is much faster for large polytomies. This experimental feature can be used via the flag `--stochastic-resolve`. Note that the outcome of this stochastic resolution is stochastic! # 0.9.5: load custom GTR via CLI