From a33fd2bb8748bafb07c1673c4ef217a2a545243b Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Tue, 4 Jun 2024 16:10:07 +0100 Subject: [PATCH] Document (and undocument) standard params & tidy calling conventions --- CHANGELOG.md | 3 + docs/historical_samples.md | 2 +- docs/priors.md | 13 +++- docs/python-api.md | 2 +- docs/usage.md | 12 ++- docs/variable_popsize.md | 4 +- tests/test_functions.py | 4 +- tests/test_inference.py | 4 +- tsdate/cli.py | 5 +- tsdate/core.py | 150 +++++++++++++++++++++++++------------ 10 files changed, 132 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index abf4e581..4dd91513 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ - Json metadata for mean time and variance in the mutation and node tables is now saved with a suitable schema. This means `json.loads()` is no longer needed to read it. +- The `mutation_rate` and `population_size` parameters are now keyword-only, and + therefore these parameter names need to be explicitly typed out. + ## [0.1.6] - 2024-01-07 **Breaking changes** diff --git a/docs/historical_samples.md b/docs/historical_samples.md index c959e14b..a4bc0b32 100644 --- a/docs/historical_samples.md +++ b/docs/historical_samples.md @@ -24,7 +24,7 @@ data which includes *historical samples*, whose time is older that the current generation (i.e. sample nodes with times > 0). -The output of [`tsinfer`](tsinfer:sec_introduction) is valid regardless +The output of [`tsinfer`](https://tskit.dev/tsinfer/) is valid regardless of the inclusion of historical samples, but *dating* such a tree sequence is more complicated. This is because the time scale of a tsinferred tree sequence is uncalibrated, so it is unclear where in time to diff --git a/docs/priors.md b/docs/priors.md index ddfd7526..ef700a90 100644 --- a/docs/priors.md +++ b/docs/priors.md @@ -41,9 +41,14 @@ prior1 = tsdate.build_prior_grid(ts, population_size=N) prior2 = tsdate.build_prior_grid(ts, population_size=N, timepoints=40) prior3 = tsdate.build_prior_grid(ts, population_size=N, prior_distribution="gamma") -ts1 = tsdate.date(ts, mu, priors=prior1) # Identical to tsdate.date(ts, mu, population_size=N) -ts2 = tsdate.date(ts, mu, priors=prior2) # Uses a denser timegrid -ts3 = tsdate.date(ts, mu, priors=prior3) # Approximates the discrete-time priors with a gamma +# Equiv to tsdate.date(ts, mutation_rate=mu, population_size=N) +ts1 = tsdate.date(ts, mutation_rate=mu, priors=prior1) + +# Uses a denser timegrid +ts2 = tsdate.date(ts, mutation_rate=mu, priors=prior2) + +# Approximate discrete-time priors with a gamma +ts3 = tsdate.date(ts, mutation_rate=mu, priors=prior3) ``` See below for more explanation of the interpretation of the parameters passed to @@ -76,7 +81,7 @@ be larger than this number. For instance ```{code-cell} ipython3 timepoints = 10 prior = tsdate.build_prior_grid(ts, population_size=N, timepoints=timepoints) -dated_ts = tsdate.date(ts, mu, priors=prior) +dated_ts = tsdate.date(ts, mutation_rate=mu, priors=prior) print( f"`timepoints`={timepoints}, producing a total of {len(prior.timepoints)}", diff --git a/docs/python-api.md b/docs/python-api.md index 638b5df2..e449e190 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -27,9 +27,9 @@ This page provides formal documentation for the `tsdate` Python API. .. autofunction:: tsdate.date .. autodata:: tsdate.core.estimation_methods :no-value: +.. autofunction:: tsdate.variational_gamma .. autofunction:: tsdate.inside_outside .. autofunction:: tsdate.maximization -.. autofunction:: tsdate.variational_gamma ``` ## Prior and Time Discretisation Options diff --git a/docs/usage.md b/docs/usage.md index 548eb1bb..26bfca88 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -215,15 +215,19 @@ per-bp per-year rate, you will also need to modify the effective population size ```{code-cell} ipython3 import numpy as np popsize = 100 # Diploid population size -mutation_rate_per_gen = 1e-8 +mu = 1e-8 # per generation # By default, dates are in generations -ts_generations = tsdate.date(ts, mutation_rate_per_gen, popsize) +ts_generations = tsdate.date(ts, mutation_rate=mu, population_size=popsize) # To infer dates in years, adjust both the rates and the population size: generation_time = 30 # Years -mutation_rate_per_year = mutation_rate_per_gen / generation_time +mutation_rate_per_year = mu / generation_time ts_years = tsdate.date( - ts, mutation_rate_per_year, popsize * generation_time, time_units="years") + ts, + mutation_rate=mutation_rate_per_year, + population_size=popsize * generation_time, + time_units="years" +) # Check that the inferred node times are identical, just on different scales assert np.allclose(ts_generations.nodes_time, ts_years.nodes_time / generation_time, 5) diff --git a/docs/variable_popsize.md b/docs/variable_popsize.md index 7c3efc1f..d3d6ef13 100644 --- a/docs/variable_popsize.md +++ b/docs/variable_popsize.md @@ -64,7 +64,7 @@ get a poor fit to the true times: ```{code-cell} ipython3 import tsdate -redated_ts = tsdate.date(ts, mutation_rate, population_size=1e4) +redated_ts = tsdate.date(ts, mutation_rate=mutation_rate, population_size=1e4) plot_real_vs_tsdate_times(ts.nodes_time, redated_ts.nodes_time, ts, redated_ts, delta=1000, alpha=0.1) ``` @@ -88,7 +88,7 @@ gives a much better fit to the true times: ```{code-cell} ipython3 prior = tsdate.build_prior_grid(ts, popsize) -redated_ts = tsdate.date(ts, mutation_rate, priors=prior) +redated_ts = tsdate.date(ts, mutation_rate=mutation_rate, priors=prior) plot_real_vs_tsdate_times(ts.nodes_time, redated_ts.nodes_time, ts, redated_ts, delta=1000, alpha=0.1) ``` diff --git a/tests/test_functions.py b/tests/test_functions.py index a6a3d7c7..0757d08b 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -2040,7 +2040,7 @@ def test_historical_samples(self): samples = tsinfer.formats.SampleData.from_tree_sequence(ts) inferred = tsinfer.infer(samples).simplify(filter_sites=False) - dated = date(inferred, 10000, 1e-8) + dated = date(inferred, mutation_rate=1e-8, population_size=10000) sites_time = tsdate.sites_time_from_ts(dated) # Add in the original individual times ind_dated_sd = samples.copy() @@ -2074,7 +2074,7 @@ def test_sampledata(self): ts, use_sites_time=False ) inferred = tsinfer.infer(samples).simplify() - dated = date(inferred, 10000, 1e-8) + dated = date(inferred, population_size=10000, mutation_rate=1e-8) sites_time = tsdate.sites_time_from_ts(dated) sites_bound = samples.min_site_times(individuals_only=True) check_sites_time = np.maximum(sites_time, sites_bound) diff --git a/tests/test_inference.py b/tests/test_inference.py index 54478efc..ba0f377a 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2021-23 Tskit Developers +# Copyright (c) 2021-24 Tskit Developers # Copyright (c) 2020 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -78,7 +78,7 @@ def test_bad_population_size(self): def test_both_ne_and_population_size_specified(self): ts = utility_functions.two_tree_mutation_ts() - with pytest.raises(ValueError, match="may be specified"): + with pytest.raises(ValueError, match="Only provide one of Ne"): tsdate.date( ts, mutation_rate=None, population_size=PopulationSizeHistory(1), Ne=1 ) diff --git a/tsdate/cli.py b/tsdate/cli.py index b30aef00..e6ef4b5f 100644 --- a/tsdate/cli.py +++ b/tsdate/cli.py @@ -1,6 +1,7 @@ # MIT License # -# Copyright (c) 2020 University of Oxford +# Copyright (c) 2024 Tskit Developers +# Copyright (c) 2020-2024 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -281,7 +282,7 @@ def run_date(args): params["ignore_oldest_root"] = args.ignore_oldest # For backwards compat # TODO: remove and error out if ignore_oldest_root is set, # see https://github.com/tskit-dev/tsdate/issues/262 - dated_ts = tsdate.date(ts, args.mutation_rate, **params) + dated_ts = tsdate.date(ts, mutation_rate=args.mutation_rate, **params) dated_ts.dump(args.output) diff --git a/tsdate/core.py b/tsdate/core.py index 819dbc59..af026079 100644 --- a/tsdate/core.py +++ b/tsdate/core.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2021-23 Tskit Developers +# Copyright (c) 2021-24 Tskit Developers # Copyright (c) 2020-21 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -1295,15 +1295,22 @@ def run( def maximization( tree_sequence, *, + mutation_rate, + population_size=None, + priors=None, eps=None, num_threads=None, probability_space=None, # below deliberately undocumented cache_inside=None, + Ne=None, # Other params documented in `.date()` **kwargs, ): """ + maximization(tree_sequence, *, mutation_rate, population_size=None, priors=None,\ + eps=None, num_threads=None, probability_space=None, **kwargs) + Infer dates for nodes in a genealogical graph using the "outside maximization" algorithm. This approximates the marginal posterior distribution of a node's age using an atomic discretization of time (e.g. point masses at particular @@ -1337,6 +1344,24 @@ def maximization( tree sequence, and that they are contemporaneous. :param ~tskit.TreeSequence tree_sequence: The input tree sequence to be dated. + :param float mutation_rate: The estimated mutation rate per unit of genome per + unit time. If provided, the dating algorithm will use a mutation rate clock to + help estimate node dates. Default: ``None`` + :param float or ~demography.PopulationSizeHistory population_size: The estimated + (diploid) effective population size used to construct the (default) conditional + coalescent prior. For a population with constant size, this can be given as a + single value (for example, as commonly estimated by the observed genetic + diversity of the sample divided by four-times the expected mutation rate). + Alternatively, for a population with time-varying size, this can be given + directly as a :class:`~demography.PopulationSizeHistory` object or a parameter + dictionary passed to initialise a :class:`~demography.PopulationSizeHistory` + object. The ``population_size`` parameter is only used when ``priors`` is + ``None``. Conversely, if ``priors`` is not ``None``, no ``population_size`` + value should be specified. + :param tsdate.base.NodeGridValues priors: NodeGridValues object containing the prior + parameters for each node-to-be-dated. Note that different estimation methods may + require different types of prior, as described in the documentation for each + estimation method. :param float eps: The error factor in time difference calculations, and the minimum distance separating parent and child ages in the returned tree sequence. Default: None, treated as 1e-6. @@ -1360,12 +1385,23 @@ def maximization( ``return_likelihood`` is ``True``) The marginal likelihood of the mutation data given the inferred node times. """ + if Ne is not None: + if population_size is not None: + raise ValueError("Only provide one of Ne (deprecated) or population_size") + else: + population_size = Ne if eps is None: eps = 1e-6 if probability_space is None: probability_space = base.LOG - dating_method = MaximizationMethod(tree_sequence, **kwargs) + dating_method = MaximizationMethod( + tree_sequence, + mutation_rate=mutation_rate, + population_size=population_size, + priors=priors, + **kwargs, + ) result = dating_method.run( eps=eps, num_threads=num_threads, @@ -1378,6 +1414,9 @@ def maximization( def inside_outside( tree_sequence, *, + mutation_rate, + population_size=None, + priors=None, eps=None, num_threads=None, outside_standardize=None, @@ -1385,10 +1424,16 @@ def inside_outside( probability_space=None, # below deliberately undocumented cache_inside=False, + # Deprecated params + Ne=None, # Other params documented in `.date()` **kwargs, ): """ + inside_outside(tree_sequence, *, mutation_rate, population_size=None, priors=None,\ + eps=None, num_threads=None, outside_standardize=None, ignore_oldest_root=None,\ + probability_space=None, **kwargs) + Infer dates for nodes in a genealogical graph using the "inside outside" algorithm. This approximates the marginal posterior distribution of a node's age using an atomic discretization of time (e.g. point masses at particular timepoints). @@ -1422,6 +1467,24 @@ def inside_outside( tree sequence, and that they are contemporaneous. :param ~tskit.TreeSequence tree_sequence: The input tree sequence to be dated. + :param float mutation_rate: The estimated mutation rate per unit of genome per + unit time. If provided, the dating algorithm will use a mutation rate clock to + help estimate node dates. Default: ``None`` + :param float or ~demography.PopulationSizeHistory population_size: The estimated + (diploid) effective population size used to construct the (default) conditional + coalescent prior. For a population with constant size, this can be given as a + single value (for example, as commonly estimated by the observed genetic + diversity of the sample divided by four-times the expected mutation rate). + Alternatively, for a population with time-varying size, this can be given + directly as a :class:`~demography.PopulationSizeHistory` object or a parameter + dictionary passed to initialise a :class:`~demography.PopulationSizeHistory` + object. The ``population_size`` parameter is only used when ``priors`` is + ``None``. Conversely, if ``priors`` is not ``None``, no ``population_size`` + value should be specified. + :param tsdate.base.NodeGridValues priors: NodeGridValues object containing the prior + parameters for each node-to-be-dated. Note that different estimation methods may + require different types of prior, as described in the documentation for each + estimation method. :param float eps: The error factor in time difference calculations, and the minimum distance separating parent and child ages in the returned tree sequence. Default: None, treated as 1e-6. @@ -1464,6 +1527,11 @@ def inside_outside( ``return_likelihood`` is ``True``) The marginal likelihood of the mutation data given the inferred node times. """ + if Ne is not None: + if population_size is not None: + raise ValueError("Only provide one of Ne (deprecated) or population_size") + else: + population_size = Ne if eps is None: eps = 1e-6 if probability_space is None: @@ -1472,7 +1540,13 @@ def inside_outside( outside_standardize = True if ignore_oldest_root is None: ignore_oldest_root = False - dating_method = InsideOutsideMethod(tree_sequence, **kwargs) + dating_method = InsideOutsideMethod( + tree_sequence, + mutation_rate=mutation_rate, + population_size=population_size, + priors=priors, + **kwargs, + ) result = dating_method.run( eps=eps, num_threads=num_threads, @@ -1489,16 +1563,21 @@ def inside_outside( def variational_gamma( tree_sequence, *, + mutation_rate, eps=None, max_iterations=None, - max_shape=None, rescaling_intervals=None, - match_central_moments=None, # undocumented - match_segregating_sites=None, # undocumented - regularise_roots=None, # undocumented + # deliberately undocumented parameters below. We may eventually document these + max_shape=None, + match_central_moments=None, + match_segregating_sites=None, + regularise_roots=None, **kwargs, ): """ + variational_gamma(tree_sequence, *, mutation_rate, eps=None, max_iterations=None,\ + rescaling_intervals=None, **kwargs) + Infer dates for nodes in a tree sequence using expectation propagation, which approximates the marginal posterior distribution of a given node's age with a gamma distribution. Convergence to the correct posterior moments @@ -1509,26 +1588,24 @@ def variational_gamma( new_ts = tsdate.variational_gamma(ts, mutation_rate=1e-8, max_iterations=10) - An piecewise-constant uniform distribution is used as a prior for each + A piecewise-constant uniform distribution is used as a prior for each node, that is updated via expectation maximization in each iteration. Node-specific priors are not currently supported. :param ~tskit.TreeSequence tree_sequence: The input tree sequence to be dated. + :param float mutation_rate: The estimated mutation rate per unit of genome per + unit time. :param float eps: The minimum distance separating parent and child ages in the returned tree sequence. Default: None, treated as 1e-6 :param int max_iterations: The number of iterations used in the expectation propagation algorithm. Default: None, treated as 10. - :param float max_shape: The maximum value for the shape parameter in the variational - posteriors. This is equivalent to the maximum precision (inverse variance) on a - logarithmic scale. Default: None, treated as 1000. :param float rescaling_intervals: For time rescaling, the number of time intervals within which to estimate a rescaling parameter. Default None, treated as 1000. :param \\**kwargs: Other keyword arguments as described in the :func:`date` wrapper - function, notably ``mutation_rate``, and ``population_size`` or ``priors``. - Further arguments include ``time_units``, ``progress``, and - ``record_provenance``. The additional arguments ``return_posteriors`` and - ``return_likelihood`` can be used to return additional information (see below). + function, including ``time_units``, ``progress``, and ``record_provenance``. + The arguments ``return_posteriors`` and ``return_likelihood`` can be + used to return additional information (see below). :return: - **ts** (:class:`~tskit.TreeSequence`) -- a copy of the input tree sequence with updated node times based on the posterior mean, corrected where necessary to @@ -1552,6 +1629,8 @@ def variational_gamma( if max_iterations is None: max_iterations = 10 if max_shape is None: + # The maximum value for the shape parameter in the variational posteriors. + # Equivalent to the maximum precision (inverse variance) on a logarithmic scale. max_shape = 1000 if rescaling_intervals is None: rescaling_intervals = 1000 @@ -1562,7 +1641,9 @@ def variational_gamma( if regularise_roots is None: regularise_roots = True - dating_method = VariationalGammaMethod(tree_sequence, **kwargs) + dating_method = VariationalGammaMethod( + tree_sequence, mutation_rate=mutation_rate, **kwargs + ) result = dating_method.run( eps=eps, max_iterations=max_iterations, @@ -1576,38 +1657,34 @@ def variational_gamma( estimation_methods = { + "variational_gamma": variational_gamma, "inside_outside": inside_outside, "maximization": maximization, - "variational_gamma": variational_gamma, } """ The names of available estimation methods, each mapped to a function to carry out the appropriate method. Names can be passed as strings to the :func:`~tsdate.date` function, or each named function can be called directly: +* :func:`tsdate.variational_gamma`: variational approximation, empirically most accurate. * :func:`tsdate.inside_outside`: empirically better, theoretically problematic. * :func:`tsdate.maximization`: worse empirically, especially with gamma approximated priors, but theoretically robust -* :func:`tsdate.variational_gamma`: variational approximation, empirically most accurate. """ def date( tree_sequence, + *, mutation_rate, - population_size=None, recombination_rate=None, time_units=None, - priors=None, method=None, - *, constr_iterations=None, return_posteriors=None, return_likelihood=None, progress=None, record_provenance=True, - # Deprecated params - Ne=None, # Other kwargs documented in the functions for each specific estimation-method **kwargs, ): @@ -1634,20 +1711,8 @@ def date( :param ~tskit.TreeSequence tree_sequence: The input tree sequence to be dated (for example one with :data:`uncalibrated` node times). - :param float or ~demography.PopulationSizeHistory population_size: The estimated - (diploid) effective population size used to construct the (default) conditional - coalescent prior. For a population with constant size, this can be given as a - single value (for example, as commonly estimated by the observed genetic - diversity of the sample divided by four-times the expected mutation rate). - Alternatively, for a population with time-varying size, this can be given - directly as a :class:`~demography.PopulationSizeHistory` object or a parameter - dictionary passed to initialise a :class:`~demography.PopulationSizeHistory` - object. The ``population_size`` parameter is only used when ``priors`` is - ``None``. Conversely, if ``priors`` is not ``None``, no ``population_size`` - value should be specified. :param float mutation_rate: The estimated mutation rate per unit of genome per - unit time. If provided, the dating algorithm will use a mutation rate clock to - help estimate node dates. Default: ``None`` + unit time (see individual methods) :param float recombination_rate: The estimated recombination rate per unit of genome per unit time. If provided, the dating algorithm will use a recombination rate clock to help estimate node dates. Default: ``None`` (not currently implemented) @@ -1660,10 +1725,6 @@ def date( and are using the conditional coalescent prior, the ``population_size`` value which you provide must be scaled by multiplying by the number of years per generation. If ``None`` (default), assume ``"generations"``. - :param tsdate.base.NodeGridValues priors: NodeGridValues object containing the prior - parameters for each node-to-be-dated. Note that different estimation methods may - require different types of prior, as described in the documentation for each - estimation method. :param string method: What estimation method to use. See :data:`~tsdate.core.estimation_methods` for possible values. If ``None`` (default) the "inside_outside" method is currently chosen. @@ -1692,13 +1753,6 @@ def date( marginal likelihood given the mutations on the tree sequence. """ # Only the .date() wrapper needs to consider the deprecated "Ne" param - if Ne is not None: - if population_size is not None: - raise ValueError( - "Only one of Ne (deprecated) or population_size may be specified" - ) - else: - population_size = Ne if method is None: method = "inside_outside" # may change later if method not in estimation_methods: @@ -1706,11 +1760,9 @@ def date( return estimation_methods[method]( tree_sequence, - population_size=population_size, mutation_rate=mutation_rate, recombination_rate=recombination_rate, time_units=time_units, - priors=priors, progress=progress, constr_iterations=constr_iterations, return_posteriors=return_posteriors,