From 9923f7ca558cc5f1098e43f7ca566ce5edfb9fbd Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Fri, 3 Nov 2023 17:37:16 +0000 Subject: [PATCH 1/5] Restructure all docs into JupyterBook format And fix some internal references --- .mergify.yml | 44 +++++ CHANGELOG.md | 115 +++++++++++++ CHANGELOG.rst | 136 --------------- docs/CHANGELOG.md | 1 + docs/Makefile | 31 ++-- docs/_config.yml | 87 ++++++++++ docs/_toc.yml | 17 ++ docs/api.md | 28 ++++ docs/bibliography.md | 20 +++ docs/build.sh | 20 +++ docs/cli.md | 50 ++++++ docs/cli.rst | 32 ---- docs/conf.py | 100 ----------- docs/data/Makefile | 10 ++ docs/data/basic_example.trees | Bin 0 -> 10132 bytes docs/data/make_examples.py | 13 ++ docs/historical_samples.md | 112 +++++++++++++ docs/index.md | 53 ++++++ docs/index.rst | 26 --- docs/installation.md | 37 +++++ docs/installation.rst | 25 --- docs/introduction.md | 94 +++++++++++ docs/introduction.rst | 18 -- docs/make.bat | 35 ---- docs/methods.md | 133 +++++++++++++++ docs/priors.md | 107 ++++++++++++ docs/python-api.md | 55 ++++++ docs/python-api.rst | 37 ----- docs/references.bib | 39 +++++ docs/requirements.txt | 17 +- docs/tutorial.rst | 218 ------------------------ docs/usage.md | 304 ++++++++++++++++++++++++++++++++++ docs/variable_popsize.md | 103 ++++++++++++ tsdate/base.py | 10 +- tsdate/demography.py | 3 +- tsdate/prior.py | 25 +-- tsdate/util.py | 6 +- 37 files changed, 1494 insertions(+), 667 deletions(-) create mode 100644 .mergify.yml create mode 100644 CHANGELOG.md delete mode 100644 CHANGELOG.rst create mode 120000 docs/CHANGELOG.md create mode 100644 docs/_config.yml create mode 100644 docs/_toc.yml create mode 100644 docs/api.md create mode 100644 docs/bibliography.md create mode 100755 docs/build.sh create mode 100644 docs/cli.md delete mode 100644 docs/cli.rst delete mode 100644 docs/conf.py create mode 100644 docs/data/Makefile create mode 100644 docs/data/basic_example.trees create mode 100644 docs/data/make_examples.py create mode 100644 docs/historical_samples.md create mode 100644 docs/index.md delete mode 100644 docs/index.rst create mode 100644 docs/installation.md delete mode 100644 docs/installation.rst create mode 100644 docs/introduction.md delete mode 100644 docs/introduction.rst delete mode 100644 docs/make.bat create mode 100644 docs/methods.md create mode 100644 docs/priors.md create mode 100644 docs/python-api.md delete mode 100644 docs/python-api.rst create mode 100644 docs/references.bib delete mode 100644 docs/tutorial.rst create mode 100644 docs/usage.md create mode 100644 docs/variable_popsize.md diff --git a/.mergify.yml b/.mergify.yml new file mode 100644 index 00000000..1a632e16 --- /dev/null +++ b/.mergify.yml @@ -0,0 +1,44 @@ +queue_rules: + - name: default + conditions: + - "#approved-reviews-by>=1" + - "#changes-requested-reviews-by=0" + - status-success=Lint + - status-success=Python (3.8, macos-latest) + - status-success=Python (3.10, macos-latest) + - status-success=Python (3.8, ubuntu-latest) + - status-success=Python (3.10, ubuntu-latest) + - status-success=Python (3.8, windows-latest) + - status-success=Python (3.10, windows-latest) + - "status-success=ci/circleci: build" + +pull_request_rules: + - name: Automatic rebase, CI and merge + conditions: + - "-merged" + - "#approved-reviews-by>=1" + - "#changes-requested-reviews-by=0" + - base=main + - label=AUTOMERGE-REQUESTED + - status-success=Lint + - status-success=Python (3.8, macos-latest) + - status-success=Python (3.10, macos-latest) + - status-success=Python (3.8, ubuntu-latest) + - status-success=Python (3.10, ubuntu-latest) + - status-success=Python (3.8, windows-latest) + - status-success=Python (3.10, windows-latest) + - "status-success=ci/circleci: build" + actions: + queue: + name: default + method: rebase + update_method: rebase + + - name: Remove label after merge + conditions: + - merged + - label=AUTOMERGE-REQUESTED + actions: + label: + remove: + - AUTOMERGE-REQUESTED \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..2fa6ef66 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,115 @@ +# Changelog + +## [0.1.6] - ****-**-** + +**Breaking changes** + +- The standalone `preprocess_ts` function now defaults to not removing unreferenced + individuals, populations, or sites, aiming to change the tree sequence tables as + little as possible. + +- Not strictly breaking, as not in the published API, but the "normalize" flag + in `get_dates` and the internal `normalize` terminology is changed to + `standardize` to better reflect the fact that the maximum (not sum) is one. + +- The `Ne` argument to `date` has been deprecated (although it is + still in the API for backward compatibility). The equivalent argument + `population_size` should be used instead. + +- The CLI `-verbosity` flag no longer takes a number, but uses + `action="count"`, so `-v` turns verbosity to INFO level, + whereas `-vv` turns verbosity to DEBUG level. + +**Features** + +- Priors may be calculated using a piecewise-constant effective population trajectory, + which is implemented in the `demography.PopulationSizeHistory` class. The + `population_size` argument to `date` accepts either a single scalar effective + population size, or a `PopulationSizeHistory` instance. + +**Bugfixes** + +- The returned posteriors when `return_posteriors=True` now return actual + probabilities (scaled so that they sum to one) rather than standardized + "probabilities" whose maximum value is one. + +- The population size is saved in provenance metadata when possible (e.g. if it is + a single number) + +- `preprocess_ts` always records provenance as being from the `preprocess_ts` + command, even if no gaps are removed. The command now has a (rarely used) + `delete_intervals` parameter, which is normally filled out and saved in provenance + (as it was before). If no gap deletion is done, the param is saved as `[]` + + +## [0.1.5] - 2022-06-07 + +**Features** + +- Added the `time_units` parameter to `tsdate.date`, allowing users to specify + the time units of the dated tree sequence. Default is `"generations"`. +- Added the `return_posteriors` parameter to `tsdate.date`. If True, the function + returns a tuple of `(dated_ts, posteriors)`. +- `mutation_rate` is now a required argument in `tsdate.date` and `tsdate.get_dates` +- tsdate returns an error if users attempt to date an unsimplified tree sequence. +- Updated tsdate citation information to cite the recent Science paper +- Built wheel on Python 3.10 + + +## [0.1.4] - 2021-06-30 + +**Features** + +- The algorithm now operates completely in unscaled time (in units of generations) under + the hood, which means that `tsdate.build_prior_grid` now requires the parameter + `Ne`. +- Users now have access to the marginal posterior distributions on node age by running + `tsdate.get_dates`, though this is undocumented for now. + +**Bugfixes** + +- A fix to the way likelihoods are added should eliminate numerical errors that are + sometimes encountered when dating very large tree sequences. + + +## [0.1.3] - 2021-02-15 + +**Features** + +- Two new methods, `tsdate.sites_time_from_ts` and `tsdate.add_sampledata_times`, + support inference of tree sequences from non-contemporaneous samples. +- New tutorial on inferring tree sequences from modern and historic/ancient samples + explains how to use these functions in conjunction with `tsinfer`. +- `tsdate.preprocess_ts` supports dating inferred tree sequences which include large, + uninformative stretches (i.e. centromeres and telomeres). Simply run this function + on the tree sequence before dating it. +- `ignore_outside` is a new parameter in the outside pass which tells `tsdate` to + ignore edges from oldest root (these edges are often of low quality in `tsinfer` + inferred tree sequences) +- Development environment is now equivalent to other `tskit-dev` projects + + +## [0.1.2] - 2020-02-28 + +- Improve user experience with more progress bars and logging. +- Slightly change traversal method in outside and outside maximization algorithms, + this should only affect inference on inferred tree sequences with large numbers + of nodes at the same frequency. +- Improve reporting of current project version +- Use appdirs for default caching location +- Prevent dating tree sequences with dangling nodes + + + +## [0.1.1] - 2020-02-25 + +- Bugfix release: resolve issue with precalculating prior values. + + +€# [0.1.0] - 2020-02-24 + +Early Alpha release made available via PyPI for community testing and evaluation. + +Please don't use this version in published works. + + diff --git a/CHANGELOG.rst b/CHANGELOG.rst deleted file mode 100644 index a4cf88e3..00000000 --- a/CHANGELOG.rst +++ /dev/null @@ -1,136 +0,0 @@ --------------------- -[0.1.6] - ****-**-** --------------------- - -**Breaking changes** - -- The standalone ``preprocess_ts`` function now defaults to not removing unreferenced - individuals, populations, or sites, aiming to change the tree sequence tables as - little as possible. :user:`hyanwong` - -- Not strictly breaking, as not in the published API, but the "normalize" flag - in ``get_dates`` and the internal ``normalize`` terminology is changed to - ``standardize`` to better reflect the fact that the maximum (not sum) is one. - :user:`hyanwong` - -- The ``Ne`` argument to ``date`` has been deprecated (although it is - still in the API for backward compatibility). The equivalent argument - ``population_size`` should be used instead. - -- The CLI ``-verbosity`` flag no longer takes a number, but uses - ``action="count"``, so ``-v`` turns verbosity to INFO level, - whereas ``-vv`` turns verbosity to DEBUG level. :user:`hyanwong` - -**Features** - -- Priors may be calculated using a piecewise-constant effective population trajectory, - which is implemented in the ``demography.PopulationSizeHistory`` class. The - ``population_size`` argument to ``date`` accepts either a single scalar effective - population size, or a ``PopulationSizeHistory`` instance. -- A new fast method, ``variational_gamma``, approximates the posterior distribution of - times on each node by a gamma distribution, rather than using discrete time bins. - When combined with the iterative expectation propagation algorithm (see next feature) - this gives more accurate results, especially for older timepoints. :user:`nspope` -- An expectation propagation (message passing) algorithm, currently implemented only - for the ``variational_gamma`` method, properly accounts for the problem of loopy - belief propagation: running the algorithm iteratively results in convergence to - a more accurate estimate of the true posterior. Lack of convergence or other - problems when dating are likely to indicate a pathological combination of tree - topologies and mutations. :user:`nspope` - -**Bugfixes** - -- The returned posteriors when ``return_posteriors=True`` now return actual - probabilities (scaled so that they sum to one) rather than standardized - "probabilities" whose maximum value is one. - -- The population size is saved in provenance metadata when possible (e.g. if it is - a single number). :user:`hyanwong` - -- ``preprocess_ts`` always records provenance as being from the ``preprocess_ts`` - command, even if no gaps are removed. The command now has a (rarely used) - `delete_intervals` parameter, which is normally filled out and saved in provenance - (as it was before). If no gap deletion is done, the param is saved as ``[]`` - :user:`hyanwong` - --------------------- -[0.1.5] - 2022-06-07 --------------------- - -**Features** - -- Added the ``time_units`` parameter to ``tsdate.date``, allowing users to specify - the time units of the dated tree sequence. Default is ``"generations"``. -- Added the ``return_posteriors`` parameter to ``tsdate.date``. If True, the function - returns a tuple of ``(dated_ts, posteriors)``. -- ``mutation_rate`` is now a required argument in ``tsdate.date`` and ``tsdate.get_dates`` -- tsdate returns an error if users attempt to date an unsimplified tree sequence. -- Updated tsdate citation information to cite the recent Science paper -- Built wheel on Python 3.10 - - --------------------- -[0.1.4] - 2021-06-30 --------------------- - -**Features** - -- The algorithm now operates completely in unscaled time (in units of generations) under - the hood, which means that ``tsdate.build_prior_grid`` now requires the parameter - ``Ne``. -- Users now have access to the marginal posterior distributions on node age by running - ``tsdate.get_dates``, though this is undocumented for now. - -**Bugfixes** - -- A fix to the way likelihoods are added should eliminate numerical errors that are - sometimes encountered when dating very large tree sequences. - --------------------- -[0.1.3] - 2021-02-15 --------------------- - -**Features** - -- Two new methods, ``tsdate.sites_time_from_ts`` and ``tsdate.add_sampledata_times``, - support inference of tree sequences from non-contemporaneous samples. -- New tutorial on inferring tree sequences from modern and historic/ancient samples - explains how to use these functions in conjunction with ``tsinfer``. -- ``tsdate.preprocess_ts`` supports dating inferred tree sequences which include large, - uninformative stretches (i.e. centromeres and telomeres). Simply run this function - on the tree sequence before dating it. -- ``ignore_outside`` is a new parameter in the outside pass which tells ``tsdate`` to - ignore edges from oldest root (these edges are often of low quality in ``tsinfer`` - inferred tree sequences) -- Development environment is now equivalent to other ``tskit-dev`` projects - - --------------------- -[0.1.2] - 2020-02-28 --------------------- - -- Improve user experience with more progress bars and logging. -- Slightly change traversal method in outside and outside maximization algorithms, - this should only affect inference on inferred tree sequences with large numbers - of nodes at the same frequency. -- Improve reporting of current project version -- Use appdirs for default caching location -- Prevent dating tree sequences with dangling nodes - - --------------------- -[0.1.1] - 2020-02-25 --------------------- - -Bugfix release: resolve issue with precalculating prior values. - - --------------------- -[0.1.0] - 2020-02-24 --------------------- - -Early Alpha release made available via PyPI for community testing and evaluation. - -Please don't use this version in published works. - - diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md new file mode 120000 index 00000000..04c99a55 --- /dev/null +++ b/docs/CHANGELOG.md @@ -0,0 +1 @@ +../CHANGELOG.md \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index c01001ff..61dc187d 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,24 +1,19 @@ -# Minimal makefile for Sphinx documentation -# +# Need to set PYTHONPATH so that we pick up the local tsdate +PYPATH=$(shell pwd)/../ +TSDATE_VERSION:=$(shell PYTHONPATH=${PYPATH} \ + python3 -c 'import tsdate; print(tsdate.__version__.split("+")[0])') -# You can set these variables from the command line. -SPHINXOPTS = #-W -SPHINXBUILD = python3 -m sphinx -SOURCEDIR = . BUILDDIR = _build -all: - @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +all: dev -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +dev: + PYTHONPATH=${PYPATH} ./build.sh -clean: - rm -rf $(BUILDDIR) - -.PHONY: help Makefile +dist: + @echo Building distribution for tsdate version ${TSDATE_VERSION} + sed -i -e s/__TSDATE_VERSION__/${TSDATE_VERSION}/g _config.yml + PYTHONPATH=${PYPATH} ./build.sh -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file +clean: + rm -fR $(BUILDDIR) \ No newline at end of file diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 00000000..1e0181ac --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,87 @@ +# Book settings +# Learn more at https://jupyterbook.org/customize/config.html + +title: Tsdate manual +author: Tskit Developers +copyright: "2023" +only_build_toc_files: true +logo: tsdate_logo.svg + +execute: + execute_notebooks: cache + +launch_buttons: + binderhub_url: "" + +repository: + url: https://github.com/tskit-dev/tsdate + branch: main + path_to_book: docs + +bibtex_bibfiles: + - references.bib + +html: + use_issues_button: true + use_repository_button: true + use_edit_page_button: true + # Do not edit this - the version placeholder is replaced by the + # current version during a distribution build in the Makefile + extra_navbar: tsdate __TSDATE_VERSION__ + extra_footer: tsdate __TSDATE_VERSION__ + + +sphinx: + extra_extensions: + - sphinx.ext.autodoc + - sphinx.ext.autosummary + - sphinx.ext.todo + - sphinx.ext.viewcode + - sphinx.ext.intersphinx + - sphinx_issues + - sphinxarg.ext + - IPython.sphinxext.ipython_console_highlighting + + config: + html_theme: sphinx_book_theme + html_theme_options: + pygment_dark_style: monokai + pygments_style: monokai + bibtex_reference_style: author_year + myst_enable_extensions: + - colon_fence + - deflist + - dollarmath + issues_github_path: tskit-dev/tsdate + todo_include_todos: true + intersphinx_mapping: + python: ["https://docs.python.org/3/", null] + tskit: ["https://tskit.dev/tskit/docs/stable", null] + tsinfer: ["https://tskit.dev/tsinfer/docs/stable", null] + msprime: ["https://tskit.dev/msprime/docs/stable", null] + tutorials: ["https://tskit.dev/tutorials/", null] + numpy: ["https://numpy.org/doc/stable/", null] + nitpicky: true + + autodoc_member_order: bysource + + # Without this option, autodoc tries to put links for all return types + # in terms of the fully-qualified classnames + # (e.g. msprime.demography.Demography) which we don't want, and also + # leads to broken links and nitpick failures. So, until we tackle + # typehints fully, this is the simplest approach. + autodoc_typehints: none + + # Note we have to use the regex version here because of + # https://github.com/sphinx-doc/sphinx/issues/9748 + nitpick_ignore_regex: [ + [ "py:class", "arraylike" ], + [ "py:class", "array_like" ], + [ "py:class", "array" ], + [ "py:class", "dtype=float64" ], + [ "py:class", "dtype=np.float64"], + [ "py:class", "dtype=uint32" ], + [ "py:class", "dtype=int8" ], + [ "py:class", "iter" ], + [ "py:class", "string" ], + ] \ No newline at end of file diff --git a/docs/_toc.yml b/docs/_toc.yml new file mode 100644 index 00000000..6363c312 --- /dev/null +++ b/docs/_toc.yml @@ -0,0 +1,17 @@ +format: jb-book +root: index +chapters: +- file: introduction +- file: installation +- file: usage + sections: + - file: methods + - file: priors + - file: historical_samples + - file: variable_popsize +- file: api + sections: + - file: python-api + - file: cli +- file: bibliography +- file: CHANGELOG \ No newline at end of file diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 00000000..badfe332 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,28 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_api)= + +# APIs + +`Tsdate` provides two APIs + +* A [Python API](sec_python_api) and +* A [command line interface](sec_cli). + +The Python API is more flexible, but the CLI can be useful for dating +large tree sequences outside of a interactive python session, for running `tsdate` on a +remote server, etc. Both are formally documented in the following pages. diff --git a/docs/bibliography.md b/docs/bibliography.md new file mode 100644 index 00000000..41104e67 --- /dev/null +++ b/docs/bibliography.md @@ -0,0 +1,20 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +(sec_bibliography)= + +# References + +```{bibliography} +:style: plain +``` \ No newline at end of file diff --git a/docs/build.sh b/docs/build.sh new file mode 100755 index 00000000..c403f240 --- /dev/null +++ b/docs/build.sh @@ -0,0 +1,20 @@ +#/bin/bash + +# Jupyter-build doesn't have an option to automatically show the +# saved reports, which makes it difficult to debug the reasons for +# build failures in CI. This is a simple wrapper to handle that. + +REPORTDIR=_build/html/reports + +jupyter-book build -nW --keep-going . +RETVAL=$? +if [ $RETVAL -ne 0 ]; then + if [ -e $REPORTDIR ]; then + echo "Error occured; showing saved reports" + cat $REPORTDIR/* + fi +else + # Clear out any old reports + rm -f $REPORTDIR/* +fi +exit $RETVAL \ No newline at end of file diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 00000000..749c735a --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,50 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_cli)= + +# Command line interface + +This page provides formal documentation for the command-line interface to `tsdate`. + +## Example + +For a mutation rate of 1e-8 and an effective diploid population size of 20,000, the following command-line +command will create dated tree sequence named `output.trees`. + +```{code} bash +tsdate date -m 1e-8 input.trees output.trees 20000 +``` + +Alternatively, the following is more reliable, especially if you have multiple versions of Python installed or +if the {command}`tsdate` executable is not installed on your path: + +```{code} bash +python3 -m tsdate date -m 1e-8 input.trees output.trees 20000 +``` + +For long-running dating processes, the `--progress` option may also be useful. +Additional `tsdate` commands, notably {func}`preprocess_ts` are also available, see below: + +## Argument details + +```{eval-rst} +.. argparse:: + :module: tsdate.cli + :func: tsdate_cli_parser + :prog: tsdate + :nodefault: +``` \ No newline at end of file diff --git a/docs/cli.rst b/docs/cli.rst deleted file mode 100644 index ff7e2a29..00000000 --- a/docs/cli.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. _sec_cli: - -====================== -Command line interface -====================== - -``tsdate`` provides a Command Line Interface to access the basic functionality of the -:ref:`Python API `. - - -.. code-block:: bash - - $ tsdate - -or - -.. code-block:: bash - - $ python3 -m tsdate - -The second command is useful when multiple versions of Python are -installed or if the :command:`tsdate` executable is not installed on your path. - -++++++++++++++++ -Argument details -++++++++++++++++ - -.. argparse:: - :module: tsdate.cli - :func: tsdate_cli_parser - :prog: tsdate - :nodefault: diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 37dd3273..00000000 --- a/docs/conf.py +++ /dev/null @@ -1,100 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# http://www.sphinx-doc.org/en/master/config -import os -import sys - -import pkg_resources - - -autodoc_mock_imports = [ - "numpy", - "tskit", - "tqdm", - "appdirs", - "numba", - "scipy", - "scipy.stats", - "scipy.special", -] - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# - -sys.path.insert(0, os.path.abspath("..")) - -# The master document -master_doc = "index" - -# -- Project information ----------------------------------------------------- - -project = "tsdate" -copyright = "2020, University of Oxford" # NOQA: A001 -author = "Anthony Wilder Wohns and Yan Wong" - -# The full version, including alpha/beta/rc tags -try: - from setuptools_scm import get_version - - release = get_version(root="..", relative_to=__file__) - version = release[:3] -except pkg_resources.DistributionNotFound: - release = "0.0.0" - version = "0.0.0" - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.intersphinx", - "sphinxarg.ext", - "sphinx_issues", - "recommonmark", -] - -# Github repo -issues_github_path = "tskit-dev/tsdate" - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = [] - -html_logo = "tsdate_logo.svg" - -# -- Extension configuration ------------------------------------------------- - -# -- Options for intersphinx extension --------------------------------------- - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = { - "https://docs.python.org/3": None, - "https://docs.scipy.org/doc/numpy/": None, - "https://tskit.dev/tskit/docs/stable": None, -} diff --git a/docs/data/Makefile b/docs/data/Makefile new file mode 100644 index 00000000..da502be8 --- /dev/null +++ b/docs/data/Makefile @@ -0,0 +1,10 @@ +TREE_SEQS=\ + basic_example.trees \ + +all: ${TREE_SEQS} + +%.trees: + python make_examples.py + +clean: + rm -fR *.trees \ No newline at end of file diff --git a/docs/data/basic_example.trees b/docs/data/basic_example.trees new file mode 100644 index 0000000000000000000000000000000000000000..bb5b6473252affdbb815a795f95d45fbbb4c1672 GIT binary patch literal 10132 zcmeHMZ;TaJ6@QgKOHtajEiGV?VG4-NzPE4Q?ml3tGEbm>KyXtwmDb=eymxnZ%6l`f zGxL^(?G~a$tHJWYY8t)}F=dKhSpS{h%sKns z4*Qq~v8HL5?B%|D?)}|!?zz8v=APlJ+e$kZEnK$nrc5UDNh(i$jQ+WXXD*L@;nw)a z2f1zD5&u}j?RM48-0?Ew7jpYThj`eH=U0rsC2nRG+{r&us(IZ1d&VzL;M3e8`rl6A zPj#qY{I{q(R{SB>FZfRVx0LbI{=X}pC=Ji*GOH5!`jWUu=96*BTKw0`c=7*X#(y9Q zNVH}BpJ4pm-2VjQ)9e@gLyTX>cx`F?BMmS12aNwPiDW=ZU0^(iflQ}(S%2v|@xs5M>%SS;V*Q=s#eY9wJkv1KDPHvdl<{13j2HdquhV|<-ya#z zVSie`@VT8EW&i%k_>c0y!|Z>tuN_|WFS{)s!Byr|hx;e`dl)ETQ}CVY-dKh~~@4P2SB4DsJm#>@RV{g+CLF?@z&t{$q?6+Mel9zwG~a887#T z7Z@-0wKGHdf0VHQ=ZsH_A7cM23Hx7Vyx5nf{|&}V{Cu17?QkL&{g)H=3%*nRw=nT# z|FpD&!v@Iw_b^`U7krxkWE}?M<^1kp{Sr@QT)TQB<0YQd8NaF>faR((s)F9~m{YC9 zKdWjajDOWsXjkmep4qazk&!?phFC#)Ox1W~-S(Bs1AJ$6EFAItnjQAKc1`imapeb& z=Q?ghO^|wMupBo~en?Ww^DD}q(dVn0H*Qy_dz^8n(y*&R??}}i4QS}Jpz4)vOfs8{ zO>gazHO?lx%5Yw9D=J>~3|m;l^N1>`d5)Sh>PHp_y%iONEluo1OH**Oh(tqkDj9sG z64$G!mMNkQYH0^v!!NfE2%Va$HNt5VeP>)%tbhdV#A3}fNQ`MtBa*g5#qQ}jBD1Gk zv1gm}1x~9qFd5^S9`VoSUd3%OVP=aBN(f_i$HKZ-Z&V|1#BC^R%7R{*cQzT%K5JL3 zqwag-%C+4x8SkqyorEhTGE2))7(rW7tGAAx7BV1Ek2Vy|Y*m#z8jitdL2slg^yzFx zTRN*y@DV`P7mt+ItV4JchXeB#0T9-$9$81HW6FSt76J&G$3?jMN3$V+gg!21q&l$BI;DtKRN zRPLdISc%ujYAQFL_m>C25fpF-1)M?wUr@j$6mSRy96|wCP{1eQ6nF(b38%m-jSF}s z+ycMAE8!OS1#Y3>dLvhy!$)!@YG!WK>3#@g7c2pfjENL zg1CZy!~nzyi6IhO5PJ{<5PJ|)5Kj;Tge$@k;f8QRxF8(R2qXJ0`z?EoeTNKuz!;4R zI0G!Wgt@>m%nj~A7dQ#s;3{l^O|T6%!dBP}+YukJ7Q{uY4Qs?&v1Y6tK7cRa6Zi%` zdOtjyH*YR&E|p4H3)XbCeZQ9V!q0d#Dwq?#TuBA~M*W|ZhlX6$o;|0#{o-%N%-Ul| zGnWpnGEbfSOJnuL?dAiQf3aohaLJs%y>RfY<3;nSOLOvF&);t@`TF_4UwH08^JicA z(kr*0*kNL=|H}e<-;GCXgb!tJID)0EMW`D4B&hpzRz*i*IEnsfiO zsQBcud(9;ePi)_HqSyTX@!IQ;A2ZA^=l=YyH5Ug=_w3ZF#lPHWj?R7UwcU$0ncG$# z8JKh5r_2iSz)(3UHM@K$PzbG59^7>SZ8xdzFb-PlDF3fgy$oJ|9wHlQ!ar18JGfLrbtLR9yutaF9i{-?A2FySr2N*S zoVq@CfFxKfESQ=5L__mvF(DtTN&Yma%y#?j>}Kv3)rsPk#~3tIl5E_M@{+Gwid2(Bp&B0*H)?hDZ0`NLfpF zJ~xmXh`L7u&}6bzSs}MhpQ7jgeL|B@UecrbndetWwQH=2>VNF%iR#RmxX%5xr-Zi*e;)=kW8??& hsxM#68vO&stg5UlXA7#lH(Rc(x7X!2^cfYk{=aPKrz-#e literal 0 HcmV?d00001 diff --git a/docs/data/make_examples.py b/docs/data/make_examples.py new file mode 100644 index 00000000..a785eb3d --- /dev/null +++ b/docs/data/make_examples.py @@ -0,0 +1,13 @@ +import msprime + + +def basic_example(): + ts = msprime.sim_ancestry( + 10, population_size=100, sequence_length=1e6, random_seed=123 + ) + ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=4321) + ts.dump("basic_example.trees") + + +if __name__ == "__main__": + basic_example() diff --git a/docs/historical_samples.md b/docs/historical_samples.md new file mode 100644 index 00000000..e0e7cb1f --- /dev/null +++ b/docs/historical_samples.md @@ -0,0 +1,112 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + + +(sec_historical_samples)= + +# Historical (Ancient) Samples + +Sometimes we wich to infer and date a genetic genealogy from +data which includes *historical samples*, +whose time is older that the current generation (i.e. sample with +node times > 0). + +The output of {ref}`tsinfer:sec_introduction` is valid regardless +of the inclusion of historical samples, but *dating* such a tree sequence +is more complicated. This is because the time scale of a tsinferred +tree sequence is uncalibrated, so it is unclear where in time to +place any historical samples. + +## The 2 step approach + +Currently, the best way to date such tree sequences is +to carry out a two step process, in which inference and dating is first +performed on the modern samples in order to establish a timescale, followed by +adding the historical samples and re-inferring using the dated timescale as a +basis for site ages in the `tsinfer` algorithm. The re-inferred tree sequence +can then be redated. The following recipe shows how this is accomplished +with a few lines of Python. The only requirement is a tsinfer.SampleData file with +modern and historical samples (the latter are specified using the +`individuals_time` array in a tsinfer.SampleData file). + +:::{note} +While `tsinfer` does not make assumptions about the +time of sample nodes, `tsdate` requires a [prior](sec_priors) +which is calculated from the conditional coalescent on the assumption that all +samples are at time 0. Although this is not the case if there are historical samples, +testing shows that it is still a close enough approximation for the method to +work well. +::: + +```{code-cell} ipython3 +import msprime +import tsinfer +import tsdate + +Ne = 1e4 +mu = 1e-7 # mutation rate + +def make_historical_data(num_modern, num_historical, random_seed, historical_time = 10000): + # Returns two SampleData files, one with only moderns and one with everyone + samples = [ + msprime.SampleSet(num_modern), + msprime.SampleSet(num_historical, time=historical_time) + ] + + ts = msprime.sim_ancestry( + samples=samples, + population_size=Ne, + recombination_rate=1e-7, + sequence_length=1e5, + random_seed=random_seed + ) + ts = msprime.sim_mutations(ts, rate=mu, random_seed=random_seed) + ## Make the SampleData files directly from the tree sequence rather than going via VCF + all_samples = tsinfer.SampleData.from_tree_sequence( + ts, use_sites_time=True, use_individuals_time=True) + modern_samples = tsinfer.SampleData.from_tree_sequence( + ts.simplify(ts.samples(time=0), filter_sites=False)) + + # NB: if importing from a VCF you would create the modern_samples by subsetting all_samples: + # modern_samples = all_samples.subset(np.where(all_samples.individuals_time[:] == 0)[0]) + return modern_samples, all_samples + +def infer_and_date_modern_and_ancient(modern_samples, all_samples): + # Date the moderns + inferred_ts = tsinfer.infer(modern_samples) + inferred_ts = tsdate.preprocess_ts(inferred_ts, filter_sites=False) + dated_modern_ts = tsdate.date(inferred_ts, population_size=Ne, mutation_rate=mu) + + # Find the dates for each site and use those rather than frequency + sites_time = tsdate.sites_time_from_ts(dated_modern_ts) # Get tsdate site age estimates + dated_samples = tsdate.add_sampledata_times(all_samples, sites_time) + ancestors = tsinfer.generate_ancestors(dated_samples) + ancestors_w_proxy = ancestors.insert_proxy_samples( + dated_samples, allow_mutation=True) + ancestors_ts = tsinfer.match_ancestors(dated_samples, ancestors_w_proxy) + return tsinfer.match_samples(dated_samples, ancestors_ts, force_sample_times=True) + +dated_ts = infer_and_date_modern_and_ancient(*make_historical_data(2, 1, random_seed=1234)) +``` + +In the code above, we simulate a tree sequence with six sample chromosomes, four modern and +two historical. We then infer and date a tree sequence using only the modern +samples. Next, we find derived alleles which are carried by the historical samples and +use the age of the historical samples to constrain the ages of these alleles. Finally, +we reinfer the tree sequence, using the date estimates from tsdate and the historical +constraints rather than the frequency of the alleles to order mutations in `tsinfer`. +Historical samples are added to the ancestors tree sequence as +{meth}`proxy nodes, in addition to being used as samples`. \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..927b3ea8 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,53 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_welcome)= + +# Welcome to tsdate + +This is the documentation for `tsdate`, a method for efficiently inferring the +age of ancestors in a genetic genealogy or "[ARG](https://tskit.dev/tutorials/args.html)". + +Basic usage is as simple as running the following python command + +```{code-cell} ipython3 +:tags: [remove-cell] +import tskit +input_ts = tskit.load("data/basic_example.trees") +``` + +```{code-cell} ipython3 +import tsdate +output_ts = tsdate.date(input_ts, population_size=100, mutation_rate=1e-8) +``` + +The rest of this documentation is organised into the following sections: + +```{tableofcontents} +``` + +## Citing + +The algorithm is described in [this Science paper](https://www.science.org/doi/10.1126/science.abi8264) +(preprint [here](https://www.biorxiv.org/content/10.1101/2021.02.16.431497v2)). We also provide +evaluations of the accuracy and computational requirements of the method using both simulated and real +data; the code to reproduce these results can be found in +[another repository](https://github.com/awohns/unified_genealogy_paper). + +Please cite the following paper if you use `tsdate` in published work: + +> Anthony Wilder Wohns, Yan Wong, Ben Jeffery, Ali Akbari, Swapan Mallick, Ron Pinhasi, Nick Patterson, David Reich, Jerome Kelleher, and Gil McVean (2022) *A unified genealogy of modern and ancient genomes*. Science **375**: eabi8264; doi: https://doi.org/10.1126/science.abi8264 + diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 482c31e7..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. currentmodule:: tsdate -.. tsdate documentation master file, created by Wilder Wohns - -Welcome to tsdate's documentation -================================== - -This is the documentation for ``tsdate``, a method for efficiently inferring the -age of ancestors in a tree sequence. - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - introduction - installation - tutorial - python-api - cli - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`search` diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..fac8de3f --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,37 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_installation)= + +# Installation + +To install `tsdate` simply run: + + $ python3 -m pip install tsdate --user + +Python 3.7, or a more recent version, is required. The software has been tested +on MacOSX and Linux. + +Once installed, `tsdate`'s {ref}`Command Line Interface (CLI) ` can be accessed via: + + $ python3 -m tsdate + +or + + $ tsdate + +Alternatively, the {ref}`Python API ` allows more fine-grained control +of the inference process. diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index e475821b..00000000 --- a/docs/installation.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. _sec_installation: - -############ -Installation -############ - - -To install ``tsdate`` simply run:: - - $ python3 -m pip install tsdate --user - -Python 3.5, or a more recent version, is required. The software has been tested -on MacOSX and Linux. - -Once installed, ``tsdate``'s :ref:`Command Line Interface (CLI) ` can be accessed via: -:: - - $ python3 -m tsdate - -or :: - - $ tsdate - -Alternatively, the :ref:`Python API ` allows more fine-grained control -of the inference process. diff --git a/docs/introduction.md b/docs/introduction.md new file mode 100644 index 00000000..729b49ab --- /dev/null +++ b/docs/introduction.md @@ -0,0 +1,94 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_introduction)= + +# Introduction + +`Tsdate` {cite}`wohns2022unified` infers dates for nodes in a genetic genealogy. More precisely, it takes a +[tree sequence](https://tskit.dev/tutorials/what_is.html) as input +and returns a copy of that tree sequence with altered node times. These +times have been estimated on the basis of the number of mutations +along the edges connecting genomes in the genealogy (i.e. using the "molecular clock"). + +## Technical details + +Methodologically, the genealogy is treated as a interconnected graph, and a +Bayesian network approach is used to update the probability distribution +of times for each node, given the time distribution on connected nodes and +the mutations on connected edges. This results in a posterior distribution of +times (which can be {ref}`output separately`). This +{ref}`scales well` to large genetic genealogies. +The input tree sequence can come from any source: e.g. from simulation or from +a variety of inference programs, such as [tsinfer](https://tskit.dev/). + +As the approach is Bayesian, it requires a +{ref}`prior distribution` to be defined +for each of the nodes to date. By default, `tsdate` calculates priors from the +[conditional coalescent](http://dx.doi.org/10.1006/tpbi.1998.1411), although +alternative prior distributions can also be specified. + +`Tsdate` provides several methods for assigning probabilities to different times, +and updating information through the genealogy. These include discrete-time and +continuous-time methods, see {ref}`sec_methods` for more details. + +The output of tsdate is a new tree sequence with altered +{attr}`node times`, extra node {ref}`sec_metadata`, and +(optionally) a posterior distribution of node times +(see {ref}`sec_usage_posterior`). + +## Sources of genealogies + +The input genealogies to date can come from any source, +but `tsdate` is often coupled with [tsinfer](https://tskit.dev/software/tsinfer.html) +{cite}`kelleher2019inferring`, which estimates the tree sequence topology but +not the tree sequence node times. + +```{code-cell} ipython3 +:tags: [remove-input] +import msprime +import tsinfer +import tsdate + +from IPython.display import HTML +ts = msprime.sim_ancestry(5, population_size=1000, recombination_rate=1e-7, sequence_length=1000, random_seed=321) +ts = msprime.sim_mutations(ts, rate=4e-6, random_seed=4321) +inferred = tsinfer.infer(tsinfer.SampleData.from_tree_sequence(ts)).simplify() +svg1 = inferred.draw_svg( + size=(260, 300), mutation_labels={}, node_labels={}, y_axis=True, y_ticks=[0, 1], + style=".x-axis .ticks .lab {font-size: 0.7em} .y-axis .ticks .lab {text-anchor: middle; transform: rotate(-90deg) translateY(-10px)}", + symbol_size=4, + ) +svg2 = tsdate.date(inferred, population_size=1000, mutation_rate=4e-6).draw_svg( + size=(260, 300), mutation_labels={}, node_labels={}, y_axis=True, + y_ticks=[0, 1000, 2000, 3000], + style=".x-axis .ticks .lab {font-size: 0.7em} .y-axis .ticks .lab {text-anchor: middle; transform: rotate(-90deg) translateY(-10px)}", + symbol_size=4, +) +haps = "
".join(ts.haplotypes()) +cen = 'style="text-align: center; padding: 0.5em 0"' +HTML(f""" + + + +
An example of using `tsinfer` followed by `tsdate` on some DNA sequence data. + You can see that tsdate sets a timescale and changes node times so that mutations (red crosses) + are more evenly distributed over edges of the genealogy. This results in more realistic local trees + (with coaleascences clustered, as expected from theory, at recent times)
{haps}
{svg1}{svg2}
DNA sequencetsinfertsdate
""" +) +``` + +Together, `tsdate` and `tsinfer` scale to analyses of millions of genomes, the largest genomic datasets currently available. diff --git a/docs/introduction.rst b/docs/introduction.rst deleted file mode 100644 index c349eb21..00000000 --- a/docs/introduction.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. _sec_introduction: - -============ -Introduction -============ - -``tsdate`` is a scalable method for estimating the age of ancestral nodes in a -`tree sequence `_. The method uses a coalescent prior and updates node times on the basis of the number of mutations along each edge of the tree sequence (i.e. using the "molecular clock"). - -The method is designed to operate on the output of `tsinfer `_, which efficiently infers tree sequence *topologies* from large genetic datasets. ``tsdate`` and ``tsinfer`` are scalable to the largest genomic datasets currently available. - -The algorithm is described `in this Science paper `_ -(preprint `here `_). We also provide evaluations of the accuracy and computational requirements of the method using both simulated and real data; the code to reproduce these results can be found in `another repository `_. - -Please cite this paper if you use ``tsdate`` in published work: - -> Anthony Wilder Wohns, Yan Wong, Ben Jeffery, Ali Akbari, Swapan Mallick, Ron Pinhasi, Nick Patterson, David Reich, Jerome Kelleher, and Gil McVean (2022) *A unified genealogy of modern and ancient genomes*. Science **375**: eabi8264; doi: https://doi.org/10.1126/science.abi8264 - diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 27f573b8..00000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd diff --git a/docs/methods.md b/docs/methods.md new file mode 100644 index 00000000..29c0e465 --- /dev/null +++ b/docs/methods.md @@ -0,0 +1,133 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_methods)= + +# Methods + +The methods available for `tsdate` inference can be divided into _discrete-time_ +and _continuous-time_ approaches. Discrete-time approaches define time grid based on +discrete timepoints, and assign a probability to each node being at +each timepoint. Continuous-time +approaches approximate the probability distribution of times by a continuous +mathematical function (e.g. a gamma distribution). + +In tests, we find that the continuous-time `variational_gamma` approach is +the most accurate (but can suffer from numerical stability). The discrete-time +`inside_outside` approach is slightly less accurate, especially for older times, +but is more numerically robust, and the discrete-time `maximization` approach is +always stable but is the least accurate. + +Changing the method is very simple: + + +```{code-cell} ipython3 +import tskit +import tsdate + +input_ts = tskit.load("data/basic_example.trees") +ts = tsdate.date(input_ts, method="variational_gamma", population_size=100, mutation_rate=1e-8) +``` + +Currently the default is `inside_outside`, but this may change in future releases. + + +(sec_methods_discrete_time)= + +## Discrete-time + +The `inside_outside` and `maximization` methods both implement discrete-time +algorithms. These have the following advantages and disadvantages: + +Pros +: allows any shape for the distribution of times +: Currently require just a single upwards and downward pass through the edges + +Cons +: Choice of grid timpoints is somewhat arbitrary (but reasonable defaults picked + based on the conditional coalescent) +: Inferred times are imprecise due to discretization: a denser timegrid can increase + precision, but also increases computational cost (quadratic with number of timepoints) +: In particular, the oldest nodes can suffer from poor dating, as time into the past + is an unbounded value, but a single oldest timepoint must be chosen. + +### Inside Outside vs Maximization + +The `inside_outside` approach has been shown to perform better empirically, but +suffers from the theoretical problem of "loopy belief propagation". Occasionally +it also has issues with numerical stability, although this is commonly indicative +of pathological combinations of tree sequence topology and mutation patterns. +Issues like this are often caused by long regions of the genome that +have no mapped mutations (e.g. in the centromere), which can be removed by +{ref}`preprocessing`. + +The `maximization` approach is slightly less accurate empirically, +and will not return true posteriors, but is theoretically robust and +additionally is always numerically stable. + +(sec_methods_continuous_time)= + +## Continuous-time + +The only continuous-time algorithm currently implemented is the `variational_gamma` +method. + +Pros +: Time estimates are precise, and not affected by choice of timepoints. +: No need to define (possibly arbitrary) timegrid, and no quadratic scaling + with number of timepoints +: Old nodes do not suffer from time-discretisation issues caused by forcing + bounds on the oldest times + +Cons +: Assumes posterior times can be reasonably modelled by a gamma distribution + (e.g. they are not bimodal) +: The "Expectation propagation" approach requires multiple rounds of iteration + until convergence. +: Numerical stability issues are more common (but often indicate pathological + of otherwise problematic tree sequences) + +### The variational gamma method + +The `variational_gamma` method approximates times by fitting a separate gamma +distribution for each node. Iteration is required to converge +to a stable solution. + +We are in the process of writing a formal description of the algorithm, but in +summary, this approach uses an expectation propagation ("message passing") +approach to update the gamma distribution for each node based on the times of connected +nodes and the mutations on connected edges. Updates take the form of moment matching +against an iteratively refined approximation to the posterior, which makes this method +very fast. + +The iterative expectation propagation should converge to a fixed +point that approximately minimizes KL divergence between the true posterior +distribution and the approximation {cite}`minka2001expectation`. +At the moment, when `method="variational_gamma"`, +a relatively large number of iterations is used (which testing indicates is +more than enough) but convergence is not formally checked. +A stopping criterion will be implemented in future releases. + +Progress through iterations can be output using the progress bar: + +```{code-cell} ipython3 +ts = tsdate.date( + input_ts, + method="variational_gamma", + progress=True, + population_size=100, + mutation_rate=1e-8) +``` \ No newline at end of file diff --git a/docs/priors.md b/docs/priors.md new file mode 100644 index 00000000..45ca6dc1 --- /dev/null +++ b/docs/priors.md @@ -0,0 +1,107 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_priors)= + +# Specifying a Prior + +## Basic usage + +The {func}`build_prior_grid` function allows you to create a bespoke prior. +This can be passed in to {func}`date` using the `priors` argument. It provides +a tuneable alternative to passing the {ref}`population size` +directly to the {func}`date` function. + +Along with adjusting the {ref}`method`, +this is the recommended way to carry out more sophisticated +analyses, tweaking the `tsdate` algorithm to alter its runtime and accuracy. + +```{code-cell} ipython3 +import tskit +import tsdate +ts = tskit.load("data/basic_example.trees") +mu = 1e-8 # mutation rate +N = 100 # effective population size + +prior1 = tsdate.build_prior_grid(ts, population_size=N) +prior2 = tsdate.build_prior_grid(ts, population_size=N, timepoints=40) +prior3 = tsdate.build_prior_grid(ts, population_size=N, prior_distribution="gamma") + +ts1 = tsdate.date(ts, mu, priors=prior1) # Identical to tsdate.date(ts, mu, population_size=N) +ts2 = tsdate.date(ts, mu, priors=prior2) # Uses a denser timegrid +ts3 = tsdate.date(ts, mu, priors=prior3) # Approximates the discrete-time priors with a gamma +``` + +See below for more explanation of the interpretation of the parameters passed to +{func}`build_prior_grid`. + +## Prior shape + +For {ref}`sec_methods_discrete_time` methods, it is possible to switch from the (default) +lognormal approximation to a gamma distribution, used when building a mixture prior +for nodes that have variable numbers of children in different genomic regions. Tests have shown +that the lognormal is usually a better fit to the true prior in most cases. + +For {ref}`sec_methods_continuous_time` methods, only the gamma distribution is available. + +(sec_priors_timegrid)= + +## Setting the timegrid + +For {ref}`sec_methods_discrete_time` methods, a grid of timepoints is created. An explicit +timgrid can be passed via the `timepoints` parameter. Alternatively, if a single integer is +passed, a nonuniform time grid will be chosen based on the expected quantiles of the +coalescent approximation. + +Note that if an integer is given this is *not* the number of timepoints, but the number of +quantiles used as a basis for generating timepoints. The actual number of timepoints will +be larger than this number. For instance + +```{code-cell} ipython3 +timepoints = 10 +prior = tsdate.build_prior_grid(ts, population_size=N, timepoints=timepoints) +dated_ts = tsdate.date(ts, mu, priors=prior) + +print( + f"`timepoints`={timepoints}, producing a total of {len(prior.timepoints)}", + "timepoints in the timegrid, at these times" +) +print(prior.timepoints) +``` + +(sec_priors_conditional_coalescent)= + +## The conditional coalescent + +Currently, priors are based on the [conditional coalescent](http://dx.doi.org/10.1006/tpbi.1998.1411). +Specifically, in a tree sequence of `s` samples, the distribution of times for a node that always +has `n` descendant samples is taken from the theoretical distribution of times +for a node with `n` descendant tips averaged over all coalescent trees of `s` total +tips (note that this assumes that all the samples are at time 0) + +In most tree sequences, a node will not always have the same number of +descendant samples in all regions of the genome. For such nodes, an approximate prior +is constructed by averaging the mean and variance of the times, and then constructing +a mixture prior based on a lognormal (default) or gamma distribution with the same mean +and variance, weighted by the span of the node in each local tree. + +It is unclear how well this approximation works in practice, as there are clear +correlations between the span of a node and the number of children it has. Testing +indicates that using a single prior for all nodes may provide better accuracy; this +may also be a result of too strongly constraining each node prior to a fixed topology +before dating. Currently the `variational_gamma` method using a single ("global") prior +for all nodes. The best prior to use for different methods is a current topic of +investigation, and may be subject to change in future versions of `tsdate`. \ No newline at end of file diff --git a/docs/python-api.md b/docs/python-api.md new file mode 100644 index 00000000..ae170ef8 --- /dev/null +++ b/docs/python-api.md @@ -0,0 +1,55 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_python_api)= + +# Python API + +This page provides formal documentation for the `tsdate` Python API. + + +## Running tsdate + +```{eval-rst} +.. autofunction:: tsdate.date +``` + +## Prior and Time Discretisation Options + +```{eval-rst} +.. autofunction:: tsdate.build_prior_grid + +.. autoclass:: tsdate.base.NodeGridValues +``` + +## Variable population sizes + +```{eval-rst} +.. autoclass:: tsdate.demography.PopulationSizeHistory +``` + +## Preprocessing Tree Sequences + +```{eval-rst} +.. autofunction:: tsdate.preprocess_ts +``` + +# Functions for Inferring Tree Sequences with Historical Samples + +```{eval-rst} +.. autofunction:: tsdate.sites_time_from_ts +.. autofunction:: tsdate.add_sampledata_times +``` diff --git a/docs/python-api.rst b/docs/python-api.rst deleted file mode 100644 index 08b417d7..00000000 --- a/docs/python-api.rst +++ /dev/null @@ -1,37 +0,0 @@ -.. currentmodule:: tsdate -.. _sec_python_api: - -========== -Python API -========== - -This page provides documentation for the ``tsdate`` Python API. - - -************** -Running tsdate -************** - -.. autofunction:: tsdate.date - - -++++++++++++++++++++++++++++++++++++++++++++++++ -Specifying Prior and Time Discretisation Options -++++++++++++++++++++++++++++++++++++++++++++++++ - -.. autofunction:: tsdate.build_prior_grid - - -**************************** -Preprocessing Tree Sequences -**************************** - -.. autofunction:: tsdate.preprocess_ts - -************************************************************** -Functions for Inferring Tree Sequences with Historical Samples -************************************************************** - -.. autofunction:: tsdate.sites_time_from_ts -.. autofunction:: tsdate.add_sampledata_times - diff --git a/docs/references.bib b/docs/references.bib new file mode 100644 index 00000000..ecdab98d --- /dev/null +++ b/docs/references.bib @@ -0,0 +1,39 @@ +@article{kelleher2019inferring, + author = {Kelleher, Jerome and Wong, Yan and Wohns, Anthony W. + and Fadil, Chaimaa and Albers, Patrick K. and McVean, Gil}, + journal = {Nature Genetics}, + number = 9, + pages = {1330--1338}, + title = {Inferring whole-genome histories in large population datasets}, + volume = 51, + year = 2019, + doi = {10.1038/s41588-019-0483-y}, +} + +@article{wohns2022unified, + author = {Anthony Wilder Wohns and Yan Wong and Ben Jeffery + and Ali Akbari and Swapan Mallick and Ron Pinhasi + and Nick Patterson and David Reich + and Jerome Kelleher and Gil McVean }, + title = {A unified genealogy of modern and ancient genomes}, + journal = {Science}, + volume = {375}, + number = {6583}, + pages = {eabi8264}, + year = {2022}, + doi = {10.1126/science.abi8264}, +} + +@inproceedings{minka2001expectation, + author = {Minka, Thomas P.}, + title = {Expectation Propagation for Approximate Bayesian Inference}, + year = {2001}, + isbn = {1558608001}, + publisher = {Morgan Kaufmann Publishers Inc.}, + address = {San Francisco, CA, USA}, + booktitle = {Proceedings of the Seventeenth Conference on Uncertainty in Artificial Intelligence}, + pages = {362–369}, + numpages = {8}, + location = {Seattle, Washington}, + series = {UAI'01} +} diff --git a/docs/requirements.txt b/docs/requirements.txt index 3b284ec2..b6742bcc 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,12 +1,19 @@ attrs -recommonmark six -setuptools_scm -sphinx-argparse -sphinx_rtd_theme -sphinx_issues numba tskit scipy tqdm appdirs +pandas +jupyter-book==0.15.1 +sphinx-issues==3.0.1 +sphinx-argparse==0.4.0 +sphinxcontrib-bibtex +humanize==4.7.0 +lmdb==1.4.1 +tqdm==4.65.0 +daiquiri==3.2.1 +msprime==1.2.0 +ipywidgets==8.1.0 +sphinx-book-theme #Unpinned to allow easy updating. diff --git a/docs/tutorial.rst b/docs/tutorial.rst deleted file mode 100644 index 0f3cceda..00000000 --- a/docs/tutorial.rst +++ /dev/null @@ -1,218 +0,0 @@ -.. currentmodule:: tskit -.. _sec_tutorial: - -======== -Tutorial -======== - -.. _sec_tutorial_tsdate: - -********************* -Dating Tree Sequences -********************* - -To illustrate the typical use case of ``tsdate``'s Python API, we will first create -sample data with `msprime `_ and infer a tree -sequence from this data using `tsinfer `_. -We will then run ``tsdate`` on the inferred tree sequence. - -Let's start by creating some sample data with ``msprime`` using human-like parameters. - -.. code-block:: python - - import msprime - - sample_ts = msprime.simulate(sample_size=10, Ne=10000, - length=1e4, - mutation_rate=1e-8, - recombination_rate=1e-8, - random_seed=2) - print(sample_ts.num_trees, - sample_ts.num_nodes) - -The output of this code is: - -.. code-block:: python - - 12 29 - -We take this simulated tree sequence and turn it into a `tsinfer` SampleData object as -documented `here `_, and then infer a tree sequence from -the data - -.. code-block:: python - - import tsinfer - - sample_data = tsinfer.SampleData.from_tree_sequence(sample_ts) - inferred_ts = tsinfer.infer(sample_data) - -.. note:: ``tsdate`` works best with `simplified tree sequences - `_ (``tsinfer``'s documentation provides) - details on how to simplify an inferred tree sequence. This should not be an issue - when working with tree sequences simulated using ``msprime``. - -Next, we run `tsdate` to estimate the ages of nodes and mutations in the inferred tree -sequence: - -.. code-block:: python - - import tsdate - dated_ts = tsdate.date(inferred_ts, Ne=10000, mutation_rate=1e-8) - -All we need to run ``tsdate`` (with its default parameters) is the inferred tree sequence -object, the *estimated* effective population size, and *estimated* mutation rate. Here we -have provided a human mutation rate per base pair per generation, so the nodes dates in the -resulting tree sequence should be interpreted as generations. - -.. _sec_tutorial_specify_prior: - -+++++++++++++++++++ - Specifying a Prior -+++++++++++++++++++ - -The above example shows the basic use of ``tsdate``, using default parameters. The -software has parameters the user can access through the :meth:`tsdate.build_prior_grid()` -function which may affect the runtime and accuracy of the algorithm. - -.. _sec_tutorial_returned_dates: - -++++++++++++++ -Returned dates -++++++++++++++ - -Several different node time estimates are returned by :func:`tsdate.date()`. In particular, -to conform to the requirements of a valid tree sequence, child nodes must always be strictly -younger than their parents. In the unusual case that a child has an estimated time which is -older than its parent, ``tsdate`` increases the node time of the parent in the tree -sequence so that it is constrained to be fractionally older than its oldest child. - -In the case of the inside-outside method (see below), the estimated time is based on the -posterior mean node time. The true posterior mean time (unconstrained by the tree sequence -requirements above) is stored in the node metadata. This can be consulted if you want to -obtain the raw mean times from the posterior. - -For even greater detail about the posterior time estimates for each node, you can use the -``return_posteriors`` option, which will return the full distribution of posterior -probabilities in each time slice. - -.. _sec_tutorial_inside_outside_v_maximization: - ------------------------------- -Inside Outside vs Maximization ------------------------------- - -One of the most important parameters to consider is whether ``tsdate`` should use the -inside-outside or the maximization algorithms to perform inference. A detailed -description of the algorithms will be presented in our preprint, but from the users -perspective, the inside-outside approach performs better empirically, and returns -a full posterior distribution, but occasionally has issues with numerical stability. In -contrast, the maximization approach is slightly less accurate empirically, and will -not return true posteriors, but has the advantage of always being numerically stable. - -.. _command_line_interface: - -++++++++++++++++++++++++++++++ -Command Line Interface Example -++++++++++++++++++++++++++++++ - -``tsdate`` also offers a convenient :ref:`command line interface (CLI) ` for -accessing the core functionality of the algorithm. - -For a simple example of CLI, we'll first save the inferred tree sequence we created in -:ref:`the section above ` as a file. - -.. code-block:: python - - import tskit - - inferred_ts.dump("inferred_ts.trees") - -Now we use the CLI to again date the inferred tree sequence and output the resulting -dated tree sequence to ``dated_ts.trees`` file: - -:: - - $ tsdate date inferred_ts.trees dated_ts.trees 10000 1e-8 --progress - -The first two arguments are the input and output tree sequence file names, the third is the estimated effective population size, and the fourth is the estimated mutation rate. We also add the ``--progress`` option to keep track of ``tsdate``'s progress. - -.. _troubleshooting: - -++++++++++++++++++++++ -Troubleshooting tsdate -++++++++++++++++++++++ - -If numerical stability issues are encountered when attempting to date -tree sequences using the Inside-Outside algorithm, it may be necessary to remove -large sections of the tree which do not have any variable sites using -:meth:`tsdate.preprocess_ts()` method. - -.. _historical_samples: - -********************************************************************* -Inferring and Dating Tree Sequences with Historical (Ancient) Samples -********************************************************************* - -``tsdate`` and ``tsinfer`` can be used together to infer tree sequences from both -modern and historical samples. The following recipe shows how this is accomplished -with a few lines of Python. The only requirement is a tsinfer.SampleData file with -modern and historical samples (the latter are specified using the -`individuals_time` array in a tsinfer.SampleData file). - -.. code-block:: python - - import msprime - import tsdate - import tsinfer - import tskit - - import numpy as np - - - def make_historical_samples(): - samples = [ - msprime.Sample(population=0, time=0), - msprime.Sample(0, 0), - msprime.Sample(0, 0), - msprime.Sample(0, 0), - msprime.Sample(0, 1.0), - msprime.Sample(0, 1.0) - ] - sim = msprime.simulate(samples=samples, mutation_rate=1, length=100) - # Get the SampleData file from the simulated tree sequence - # Retain the individuals times and ignore the sites times. - samples = tsinfer.SampleData.from_tree_sequence( - sim, use_sites_time=False, use_individuals_time=True) - return samples - - def infer_historical_ts(samples, Ne=1, mutation_rate=1): - """ - Input is tsinfer.SampleData file with modern and historical samples. - """ - modern_samples = samples.subset(np.where(samples.individuals_time[:] == 0)[0]) - inferred_ts = tsinfer.infer(modern_samples) # Infer tree seq from modern samples - # Removes unary nodes (currently required in tsdate), keeps historical-only sites - inferred_ts = tsdate.preprocess_ts(inferred_ts, filter_sites=False) - dated_ts = tsdate.date(inferred_ts, Ne=Ne, mutation_rate=mutation_rate) # Date tree seq - sites_time = tsdate.sites_time_from_ts(dated_ts) # Get tsdate site age estimates - dated_samples = tsdate.add_sampledata_times( - samples, sites_time) # Get SampleData file with time estimates assigned to sites - ancestors = tsinfer.generate_ancestors(dated_samples) - ancestors_w_proxy = ancestors.insert_proxy_samples( - dated_samples, allow_mutation=True) - ancestors_ts = tsinfer.match_ancestors(dated_samples, ancestors_w_proxy) - return tsinfer.match_samples( - dated_samples, ancestors_ts, force_sample_times=True) - - samples = make_historical_samples() - inferred_ts = infer_historical_ts(samples) - -We simulate a tree sequence with six sample chromosomes, four modern and -two historical. We then infer and date a tree sequence using only the modern -samples. Next, we find derived alleles which are carried by the historical samples and -use the age of the historical samples to constrain the ages of these alleles. Finally, -we reinfer the tree sequence, using the date estimates from tsdate and the historical -constraints rather than the frequency of the alleles to order mutations in ``tsinfer``. -Historical samples are added to the ancestors tree sequence as `proxy nodes, in addition -to being used as samples `_. diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 00000000..beeb8570 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,304 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_usage)= + +# Usage + +We'll first generate a few "undated" tree sequences for later use: + +```{code-cell} ipython3 +:tags: [hide-input] +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) # don't display FutureWarnings for stdpopsim +import msprime +import numpy as np +import stdpopsim +import tsinfer +import tskit + +# Use msprime to create a simulated tree sequence with mutations, for demonstration +n = 10 +Ne = 100 +mu = 1e-6 +ts = msprime.sim_ancestry(n, population_size=Ne, sequence_length=1e6, random_seed=123) +ts = msprime.sim_mutations(ts, rate=mu, random_seed=123, discrete_genome=False) +# Remove time information +tables = ts.dump_tables() +tables.nodes.time = np.where(tables.nodes.flags & tskit.NODE_IS_SAMPLE, 0, np.arange(ts.num_nodes, dtype=float)) +tables.mutations.time = np.full(ts.num_mutations, tskit.UNKNOWN_TIME) +tables.time_units = tskit.TIME_UNITS_UNCALIBRATED +sim_ts = tables.tree_sequence() + +# Use stdpopsim to create simulated genomes for inference +species = stdpopsim.get_species("HomSap") +model = species.get_demographic_model("AmericanAdmixture_4B11") +contig = species.get_contig("chr1", left=1e7, right=1.1e7, mutation_rate=model.mutation_rate) +samples = {"AFR": 5, "EUR": 5, "ASIA": 5, "ADMIX": 5} +# Create DNA sequences, stored in the tsinfer SampleData format +stdpopsim_ts = stdpopsim.get_engine("msprime").simulate(model, contig, samples, seed=123) +sample_data = tsinfer.SampleData.from_tree_sequence(stdpopsim_ts) +inf_ts = tsinfer.infer(sample_data) + +print(f"* Simulated `sim_ts` ({2*n} genomes from a popn of {Ne} diploids, mut_rate={mu} /bp/gen)") +print(f"* Inferred `inf_ts` using tsinfer ({stdpopsim_ts.num_samples} samples of human {contig.origin})") + +``` +(sec_usage_basic_example)= + +## Quickstart + +Given a known genetic genealogy in the form of a tree sequence, `tsdate` simply +re-estimates the node times based on the mutations on each edge. Usage is as +simple as calling the {func}`date` function with an *estimated* effective population +size, and *estimated* per base pair per generation mutation rate. + +```{code-cell} ipython3 +import tsdate +# Running `tsdate` is usually a single function call, as follows: +redated_ts = tsdate.date(sim_ts, population_size=100, mutation_rate=1e-6) +``` + +This simple example has no recombination, infinite sites mutation, +a high mutation rate, and a known genealogy, so we would expect that the node times +as estimated by tsdate from the mutations would be very close to the actual node times: + +```{code-cell} ipython3 +:tags: [hide-input] +from matplotlib import pyplot as plt +import numpy as np +def plot_real_vs_tsdate_times(x, y, ts_x=None, ts_y=None, delta = 0.1, **kwargs): + plt.scatter(x + delta, y + delta, **kwargs) + plt.xscale('log') + plt.xlabel(f'Real time' + ('' if ts_x is None else f' ({ts_x.time_units})')) + plt.yscale('log') + plt.ylabel(f'Estimated time from tsdate' + ('' if ts_y is None else f' ({ts_y.time_units})')) + line_pts = np.logspace(np.log10(delta), np.log10(x.max()), 10) + plt.plot(line_pts, line_pts, linestyle=":") + +plot_real_vs_tsdate_times(ts.nodes_time, redated_ts.nodes_time, ts, redated_ts) +``` + +:::{note} +See the [Timescale adjustment](sec_usage_popsize_timescale) section if you wish to +use anything other than the default `time_units="generations"`. +::: + +(sec_usage_inferred_example)= + +## Inferred topologies + +A more typical use-case is where the genealogy has been inferred from DNA sequence data, +for example by {ref}`tsinfer` or +[Relate](https://myersgroup.github.io/relate/). Below we will demonstrate with `tsinfer` +output based on DNA sequences generated by a more realistic simulation. + +With real data, especially from `tsinfer` you may want to {func}`preprocess` +the tree sequence before dating. This removes regions with no variable sites, and +also simplifies to remove locally unary portions of nodes (see the +{ref}`sec_usage_real_data_simplify` section below for more details) + +```{code-cell} ipython3 +import tsdate +simplified_ts = tsdate.preprocess_ts(inf_ts) +dated_ts = tsdate.date(simplified_ts, population_size=10000, mutation_rate=model.mutation_rate) +print( + f"Dated `inf_ts` (inferred from {inf_ts.num_sites} variants under the {model.id}", + f"stdpopsim model, mutation rate = {model.mutation_rate} /bp/gen)" +) +``` +:::{note} +There was not a fixed population size in the simulation used to generate the data, +so we have used a rough commonly-used +estimate of an human effective population size of 20,000 (see the +[Variable population sizes]`sec_variable_popsize` section for more +sophisticated approaches). +::: + +The inference in this case is much more noisy (as illustrated using the original +and inferred times of the node under each mutation): + +```{code-cell} ipython3 +:tags: [hide-input] +# If there are multiple mutations at a site, we simply pick the first one +plot_real_vs_tsdate_times( + stdpopsim_ts.nodes_time[[s.mutations[0].node for s in stdpopsim_ts.sites()]], + dated_ts.nodes_time[[s.mutations[0].node for s in dated_ts.sites()]], + delta=100, + alpha=0.1, +) +``` + +(sec_usage_posterior)= + +## Posterior time distributions + +The default output of `tsdate` is a new, dated tree sequence, +created with node times changed to the mean time for each node. This is based +on the means of the posterior time distributions for each node. + +In the unusual occurrence +of the mean time of a child node being older than the mean time of on of its parents, +a small value, $\epsilon$, is added to the parent time to ensure a valid tree sequence. +The original mean times and their variances are stored in the node {ref}`tskit:sec_metadata`. + +The full posterior distributions of times for each node can be accessed by +specifying `return_posteriors=True` +when calling {func}`tsdate.date`, which then returns both the dated tree sequence +and a dictionary specifying the posterior distributions. + +The returned posterior is a dictionary keyed by integer node ID, with values representing the +probability distribution of times. This can be read in to a [pandas](https://pandas.pydata.org +dataframe: + +```{code-cell} ipython3 +import pandas as pd +redated_ts, posteriors = tsdate.date( + sim_ts, population_size=100, mutation_rate=1e-6, method="inside_outside", return_posteriors=True) +posteriors_df = pd.DataFrame(posteriors) +posteriors_df.head() # Show the dataframe +``` + +Since we are using a {ref}`sec_methods_discrete_time` method, each row of this +posterior dataframe corresponds to a set of probabilities for a given timeslice +(as specified in the `start_time` and `end_time` columns). Each column corresponds to +a node (the column header is the node ID in the returned tree sequence). + +(sec_usage_popsize)= + +## Population sizes + +The `population_size` can either be a single number, specifying the "effective population size", +or a piecewise constant function of time, specifying a set of fixed population sizes +over a number of contiguous time intervals. Functions of this sort are captured by the +{class}`~demography.PopulationSizeHistory` class: see the {ref}`sec_variable_popsize` page +for its use and interpretation. + +(sec_usage_popsize_timescale)= + +### Timescale adjustment + +There is one gotcha involving the population size and the assumed time units used for dating, +because theoretically, the effective population size determines the coalescent timescale. +If you are using a time scale other than "generations", as well as setting the "time_scale" +parameter (e.g. to "years"), and quoting parameters such as `mutation_rate` in terms of a +per-bp per-year rate, you will also need to modify the effective population size passed to +`tsdate` by multiplying it by the generation time. For example: + +```{code-cell} ipython3 +import numpy as np +popsize = 100 # Diploid population size +mutation_rate_per_gen = 1e-8 +# By default, dates are in generations +ts_generations = tsdate.date(ts, mutation_rate_per_gen, popsize) + +# To infer dates in years, adjust both the rates and the population size: +generation_time = 30 # Years +mutation_rate_per_year = mutation_rate_per_gen / generation_time +ts_years = tsdate.date( + ts, mutation_rate_per_year, popsize * generation_time, time_units="years") + +# Check that the inferred node times are identical, just on different scales +assert np.allclose(ts_generations.nodes_time, ts_years.nodes_time / generation_time, 5) +``` + +(sec_usage_real_data)= + +## Real data + +Real world data is likely to consist of larger datasets than in the example, and +may exhibit issues that are not present in simulated data and can e.g. cause numerical +instability and other problems. Here we detail some common issues found in real data. + +(sec_usage_real_data_scaling)= + +### Memory and run time + +`Tsdate` is not particularly memory intensive: whole genome tree sequences with + +Running the dating algorithm is linear in the number of edges in the tree sequence. +This makes `tsdate` usable even for large tree sequences (e.g. millions of samples). +Nevertheless, dating large tree sequences with millions of edges is likely to take +some time (e.g. an hour or more for a tree sequence of 11 million edges covering +150Mb of 7500 human chromosome 2, e.g. from {cite}`wohns2022unified`). +If you are running `tsdate` interactively, it can be useful to +specify the `progress` option to display a progress bar telling you how long +different stages of dating will take. + +If the {ref}`method` used for dating involves discrete time slices, `tsdate` scales +quadratically in the number of time slices chosen. For greater temporal resolution, +you are thus advised to use the `variational_gamma` method, which does not discretize time. + +#### Optimisations + +Before the dating algorithm is run, the conditional coalescent prior distribution must +be calculated for each node. Although this is roughly linear in the number of nodes, +it can still take some time if there are millions of nodes. +To speed up this process an approximation to the conditional coalescent +is used for nodes which have a large number of descendants +(the exact number can be modified by {ref}`making a prior` +using the `approx_prior_size` parameter). +The results of this approximation are cached, so that `tsdate` will run slightly faster +the second time it is run. + +The time taken to date a tree sequence using `tsdate` is only a fraction of that +requires to infer the initial tree sequence, therefore the core `tsdate` algorithm +has not been parallelised to allow running on many CPU cores. However, there is some +precalculation of regularly reused mutational likelihoods which *can* be parallelised +easily: this step can be sped up by specifying the `num_threads` parameter to +{func}`date` (however, this behaviour is subject to change in future versions). + +### CLI use + +Computationally-intensive uses of `tsdate` are likely to involve +running the program non-interactively, e.g. as part of an +automated pipeline. In this case, it may be useful to use the +command-line interface. See {ref}`sec_cli` for more details. + +(sec_usage_real_data_stability)= + +### Numerical stability and preprocessing + +Numerical stability issues are usually caused by "bad" tree sequences (i.e. +pathological combinations of topologies and mutations). These can be caused, +for example, by long deep branches with very few mutations, such as samples attaching directly +to a local root. These can often be fixed by removing bad regions of the tree sequence, +e.g. regions that have no variation because they are unmappable or have been +removed by a QC filter, such as at the centromere. +Numerical instability will manifest itself by raising an error +when dating. + +The {func}`tsdate.preprocess_ts()` function can help remove topology from these +regions. See the documentation for that function for details on how to increase +or decrease its stringency. + +The `variational_gamma` method is more prone to instability, and switching to +another method may help. Note, however, that this is usually a sign that +you should re-inspect the original tree sequence, which is likely to +have poorly inferred topologies. + +(sec_usage_real_data_simplify)= + +### Simplification and unary nodes + +Above we have simplified the tree sequence to remove "locally unary" nodes +(i.e. nodes that, in some trees, have only one child). The `tsinfer` +algorithm produces these by default, but often overestimates the span of genome +that they cover, which causes numerical instability and inaccuracy. + +However, we expect *accurately* estimated unary regions to improve dating. +It is therefore possible to date a tree sequence containing locally unary nodes +using the `allow_unary` option when {ref}`building a prior`. \ No newline at end of file diff --git a/docs/variable_popsize.md b/docs/variable_popsize.md new file mode 100644 index 00000000..694f3239 --- /dev/null +++ b/docs/variable_popsize.md @@ -0,0 +1,103 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tsdate +::: + +(sec_variable_popsize)= + +# Variable population sizes + +:::{note} +Functionality described on this page is preliminary and subject to change in the future. For this reason, +classes and methods may not form part of the publicly available API and may not be fully documented yet. +::: + +The population size determines the prior probability of coalescence events over time. +When setting the `tsdate` prior, a reasonable first order approximation is to assume a constant +population size. However, better results will be gained if the prior more accurately reflects +the true history. To illustrate this, we will generate data from a population that was large +in recent history, but had a small bottleneck of only 200 individuals +50,000 generations ago, with a medium-sized population before that: + +```{code-cell} ipython3 +:tags: [hide-input] +from matplotlib import pyplot as plt +import numpy as np +def plot_real_vs_tsdate_times(x, y, ts_x=None, ts_y=None, delta = 0.1, **kwargs): + plt.scatter(x + delta, y + delta, **kwargs) + plt.xscale('log') + plt.xlabel(f'Real time' + ('' if ts_x is None else f' ({ts_x.time_units})')) + plt.yscale('log') + plt.ylabel(f'Estimated time from tsdate'+ ('' if ts_y is None else f' ({ts_y.time_units})')) + line_pts = np.logspace(np.log10(delta), np.log10(x.max()), 10) + plt.plot(line_pts, line_pts, linestyle=":") +``` + +```{code-cell} ipython3 +import msprime + +demography = msprime.Demography() +demography.add_population(name="A", initial_size=1e6) +demography.add_population_parameters_change(time=50000, initial_size=2e2) +demography.add_population_parameters_change(time=50010, initial_size=1e4) + +mutation_rate = 1e-8 +# Simulate a tree sequence with a population size history. +ts = msprime.sim_ancestry( + 10, sequence_length=1e5, recombination_rate=1e-8, demography=demography, random_seed=123) +ts = msprime.sim_mutations(ts, rate=mutation_rate, random_seed=123) +``` + +If we redate this tree sequence with a constant population size of (say) the ancestral size of 10,000, we +get a poor fit to the true times: + +```{code-cell} ipython3 +import tsdate +redated_ts = tsdate.date(ts, mutation_rate, population_size=1e4) +plot_real_vs_tsdate_times(ts.nodes_time, redated_ts.nodes_time, ts, redated_ts, delta=1000, alpha=0.1) +``` + +## Setting variable population sizes + +The {class}`demography.PopulationSizeHistory` class can be used to define a population size that +changes in a piecewise-constant manner over time (that is, the population size is constant between +specified time intervals). This can them be used to create a prior, via the {func}`build_prior_grid` +function (see {ref}`sec_priors`). + +For example, the following code defines a population that is of effective size +1 million in the last 200 generations, only two hundred for a period of 10 generations 50,000 generations ago, then +of size 50,000 for all generations before that, exactly matching the simulated bottleneck + +```{code-cell} ipython3 +popsize = tsdate.demography.PopulationSizeHistory(population_size=[1e6, 2e2, 1e4], time_breaks=[50_000, 50_010]) +``` + +We can then use this to create a prior for dating, rather than specifying a constant population size. This +gives a much better fit to the true times: + +```{code-cell} ipython3 +prior = tsdate.build_prior_grid(ts, popsize) +redated_ts = tsdate.date(ts, mutation_rate, priors=prior) +plot_real_vs_tsdate_times(ts.nodes_time, redated_ts.nodes_time, ts, redated_ts, delta=1000, alpha=0.1) +``` + +## Estimating population size + +If you don't know the population size, it is possible to use `tsdate`` to *estimate* changes in +population size over time, by extimating the number of coalescence points in different time +intervals, and re-estimating the dates. However, this +approach has not been fully tested or documented. + +If you are interested in doing this, see https://github.com/tskit-dev/tsdate/issues/237#issuecomment-1785655708 +for an example. \ No newline at end of file diff --git a/tsdate/base.py b/tsdate/base.py index 337afe59..e1716ecc 100644 --- a/tsdate/base.py +++ b/tsdate/base.py @@ -25,7 +25,6 @@ """ import numpy as np - FLOAT_DTYPE = np.float64 LIN = "linear" LOG = "logarithmic" @@ -42,6 +41,13 @@ class NodeGridValues: with fixed times, only a single time value needs to be stored. For non-fixed nodes, an array of len(timepoints) probabilies is required. + .. note:: + + This class is not intended to be used directly by users and may be subject + to change of name or internal structure in future versions. For details on + how to create a ``NodeGridValues`` object to be used as a prior, see + :ref:`sec_priors`. + :ivar num_nodes: The number of nodes that will be stored in this object :vartype num_nodes: int :ivar nonfixed_nodes: a (possibly empty) numpy array of unique positive node ids each @@ -52,7 +58,7 @@ class NodeGridValues: :ivar timepoints: Array of time points :vartype timepoints: numpy.ndarray :ivar fill_value: What should we fill the data arrays with to start with - :vartype fill_value: numpy.scalar + :vartype fill_value: float """ def __init__( diff --git a/tsdate/demography.py b/tsdate/demography.py index 47324d50..589bebdc 100644 --- a/tsdate/demography.py +++ b/tsdate/demography.py @@ -30,7 +30,8 @@ class PopulationSizeHistory: """ Stores a piecewise constant population size history and tranforms time from - a natural (generational) scale to a coalescent one + a natural (generational) scale to a coalescent one. See :ref:`sec_variable_popsize` + for details. """ @staticmethod diff --git a/tsdate/prior.py b/tsdate/prior.py index f0ad2f9d..21371378 100644 --- a/tsdate/prior.py +++ b/tsdate/prior.py @@ -1180,14 +1180,17 @@ def build_grid( each node, given the number of contemporaneous samples below it, and the discretised time slices at which to evaluate node age. - :param TreeSequence tree_sequence: The input :class:`tskit.TreeSequence`, treated as - undated. - :param float population_size: The estimated (diploid) effective population - size: must be specified. May be a single value, or a two-column array with - epoch breakpoints and effective population sizes. Using standard (unscaled) - values for ``population_size`` results in a prior where times are measured - in generations. - :param int_or_array_like timepoints: The number of quantiles used to create the + :param tskit.TreeSequence tree_sequence: The input :class:`tskit.TreeSequence`, + treated as undated. + :param float or demography.PopulationSizeHistory population_size: The estimated + (diploid) effective population size used to construct the prior. + For a population with constant size, this can be given as a single + value. For a population with time-varying size, this can be given directly as + a :class:`~demography.PopulationSizeHistory` object or a parameter dictionary + passed to initialise a :class:`~demography.PopulationSizeHistory` object. + Using standard (unscaled) values for ``population_size`` results in a prior where + times are measured in generations. + :param int or array_like timepoints: The number of quantiles used to create the time slices, or manually-specified time slices as a numpy array. Default: 20 :param bool approximate_priors: Whether to use a precalculated approximate prior or exactly calculate prior. If approximate prior has not been precalculated, tsdate @@ -1202,7 +1205,7 @@ def build_grid( "lognorm" :return: A prior object to pass to tsdate.date() containing prior values for inference and a discretised time grid - :rtype: base.NodeGridValues Object + :rtype: base.NodeGridValues """ mixture_prior = MixturePrior( @@ -1231,7 +1234,7 @@ def parameter_grid( each node, given the number of contemporaneous samples below it, and return parameters (shape and rate of gamma) in a grid - :param TreeSequence tree_sequence: The input :class:`tskit.TreeSequence`, treated as + :param tskit.TreeSequence tree_sequence: The input tree sequence, treated as undated. :param float population_size: The estimated (diploid) effective population size: must be specified. May be a single value, or a two-column array with @@ -1244,7 +1247,7 @@ def parameter_grid( :param int approx_prior_size: Number of samples from which to precalculate prior. Should only enter value if approximate_priors=True. If approximate_priors=True and no value specified, defaults to 1000. Default: None - :rtype: base.NodeGridValues Object + :rtype: base.NodeGridValues """ mixture_prior = MixturePrior( diff --git a/tsdate/util.py b/tsdate/util.py index 8ab6bbb5..c6bde307 100644 --- a/tsdate/util.py +++ b/tsdate/util.py @@ -69,7 +69,7 @@ def preprocess_ts( generally. Removed regions are recorded in the provenance of the resulting tree sequence. - :param TreeSequence tree_sequence: The input :class`tskit.TreeSequence` + :param tskit.TreeSequence tree_sequence: The input tree sequence to be preprocessed. :param float minimum_gap: The minimum gap between sites to remove from the tree sequence. Default: ``None`` treated as ``1000000`` @@ -221,7 +221,7 @@ def sites_time_from_ts( the same as tskit.UNKNOWN_TIME, which marks sites that could have a meaningful time but whose time estimate is unknown). - :param TreeSequence tree_sequence: The input :class`tskit.TreeSequence`. + :param tskit.TreeSequence tree_sequence: The input tree sequence. :param bool unconstrained: Use estimated node times which have not been constrained by tree topology. If ``True`` (default), this requires a tree sequence which has been dated using the ``tsdate`` inside-outside algorithm. If this is not the @@ -252,7 +252,7 @@ def sites_time_from_ts( fine to set this to a small epsilon value. :return: Array of length tree_sequence.num_sites with estimated time of each site - :rtype: numpy.array + :rtype: numpy.ndarray(dtype=np.float64) """ if tree_sequence.num_sites < 1: raise ValueError("Invalid tree sequence: no sites present") From 5fd0db32fb2b755ca5de470a675ae21ac8b9b70e Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 6 Nov 2023 16:47:06 +0000 Subject: [PATCH 2/5] Create docs.yml --- .github/workflows/docs.yml | 48 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 .github/workflows/docs.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000..f42e95e1 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,48 @@ +name: Build Docs + +on: + pull_request: + push: + branches: [main] + tags: + - '*' + +env: + COMMIT_EMAIL: ben.jeffery.well+adminbot@gmail.com + OWNER: tskit-dev + REPO: tsdate + REQUIREMENTS: docs/requirements.txt + +jobs: + build-deploy-docs: + name: Docs + runs-on: ubuntu-latest + steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@0.11.0 + with: + access_token: ${{ github.token }} + + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + cache: 'pip' + + - name: Create venv and install deps + run: | + pip install --upgrade pip wheel + pip install -r ${{env.REQUIREMENTS}} + + - name: Build Docs + run: | + make -C docs + + - name: Trigger docs site rebuild + if: github.ref == 'refs/heads/main' + run: | + curl -X POST https://api.github.com/repos/tskit-dev/tskit-site/dispatches \ + -H 'Accept: application/vnd.github.everest-preview+json' \ + -u AdminBot-tskit:${{ secrets.ADMINBOT_TOKEN }} \ + --data '{"event_type":"build-docs"}' \ No newline at end of file From 9a77687cde1166403a00708165c88c395eeff92a Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 6 Nov 2023 16:47:22 +0000 Subject: [PATCH 3/5] Nicer index doc wording --- docs/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 927b3ea8..05adc08f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -41,8 +41,8 @@ The rest of this documentation is organised into the following sections: ## Citing -The algorithm is described in [this Science paper](https://www.science.org/doi/10.1126/science.abi8264) -(preprint [here](https://www.biorxiv.org/content/10.1101/2021.02.16.431497v2)). We also provide +The algorithm is described in [our Science paper](https://www.science.org/doi/10.1126/science.abi8264) +(citation below, preprint [here](https://www.biorxiv.org/content/10.1101/2021.02.16.431497v2)). We also provide evaluations of the accuracy and computational requirements of the method using both simulated and real data; the code to reproduce these results can be found in [another repository](https://github.com/awohns/unified_genealogy_paper). From 7bddce1d7001103c10a6405b614f2d2748b3a27e Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 6 Nov 2023 16:51:10 +0000 Subject: [PATCH 4/5] Add matplotlib to docs --- docs/requirements.txt | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index b6742bcc..bbeddccf 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,11 +1,15 @@ +appdirs attrs -six +matplotlib +mpmath numba -tskit +pandas scipy +six +stdpopsim==0.2.0 +tsinfer +tskit tqdm -appdirs -pandas jupyter-book==0.15.1 sphinx-issues==3.0.1 sphinx-argparse==0.4.0 From a7e8d0b17948bad0d082438b24c6c45dca02e81e Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Tue, 7 Nov 2023 12:25:38 +0000 Subject: [PATCH 5/5] Modify mergify rules As described by @benjeffery --- .mergify.yml | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/.mergify.yml b/.mergify.yml index 1a632e16..08d94c20 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -3,15 +3,8 @@ queue_rules: conditions: - "#approved-reviews-by>=1" - "#changes-requested-reviews-by=0" - - status-success=Lint - - status-success=Python (3.8, macos-latest) - - status-success=Python (3.10, macos-latest) - - status-success=Python (3.8, ubuntu-latest) - - status-success=Python (3.10, ubuntu-latest) - - status-success=Python (3.8, windows-latest) - - status-success=Python (3.10, windows-latest) - "status-success=ci/circleci: build" - + - status-success=Docs pull_request_rules: - name: Automatic rebase, CI and merge conditions: @@ -20,14 +13,8 @@ pull_request_rules: - "#changes-requested-reviews-by=0" - base=main - label=AUTOMERGE-REQUESTED - - status-success=Lint - - status-success=Python (3.8, macos-latest) - - status-success=Python (3.10, macos-latest) - - status-success=Python (3.8, ubuntu-latest) - - status-success=Python (3.10, ubuntu-latest) - - status-success=Python (3.8, windows-latest) - - status-success=Python (3.10, windows-latest) - "status-success=ci/circleci: build" + - status-success=Docs actions: queue: name: default