From c3e76d4d1bc0c4e42810a94b99ffcf2fc73d899a Mon Sep 17 00:00:00 2001 From: Tommaso Leonardi Date: Wed, 10 Apr 2019 18:47:43 +0200 Subject: [PATCH] Release v1.0.0rc2 (#80) * Added empty changelog * Small doc changes * Typo and reformat doc * Fixed issue #68 * Added travis file * Added branches safelist * Removed python 3.2 * Added devel branch to travis safelist * Updated pytest version * Replcated tmpdir_factory with tmp_path_factory * Fixed python version compatibility * Fixed syntax error * Casting pathlib objects to string for compatibility with python3.5 * Fixed python 3.5 compatibility error * Fixed issue #68 * Removed trailing whitespace * Added travis badge * Skip many integration tests if running inside Travis * Added conditional workflow for 3.5 version * Lowered p-value tolerance for python 3.5 * Fixed typos in checking python version * nbconvert is now installed bebore mknotebooks * Further tolerance adjustment for python3.5 * Updated Changelog * Clean up of travis yml file * Added Slack notifications and automatic gh-pages deployment (#78) * Updated release data and bumped version number * Add Changelog to doc (#81) * Workaround for issue #82 --- .travis.yml | 38 ++++++ CHANGELOG.md | 1 + README.md | 1 + docs/CNAME | 1 + docs/changelog.md | 10 ++ docs/demo/SampCompDB_usage.ipynb | 197 ++++++++++++++++++------------- docs/demo/SampComp_usage.ipynb | 61 ++++------ docs/usage.md | 6 +- mkdocs.yml | 8 +- nanocompore/TxComp.py | 28 +++-- nanocompore/__init__.py | 4 +- tests/test_Integration.py | 18 ++- tests/test_SampCompDB.py | 8 +- tests/test_TxComp.py | 32 ++++- tests/test_Whitelist.py | 10 +- 15 files changed, 259 insertions(+), 164 deletions(-) create mode 100644 .travis.yml create mode 120000 CHANGELOG.md create mode 100644 docs/CNAME create mode 100644 docs/changelog.md diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..bc3f983 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,38 @@ +dist: xenial +language: python +stage: test +python: + - 3.5 + - 3.6 + - 3.7 +branches: + only: + - master + - devel +install: + - pip install 'pytest>=4.4.0' --upgrade + - pip install . +script: + - pytest +notifications: + slack: + secure: Hz3J3AluB4lN+YxRxsV93IjL0j5pevTDXflw9YN6VWH99BL7OEJxhsOGtqHLqbUDa4+XPVGgFsAXiHZPO19d4jng5uIRbj7VZNFK7gwn58S3GfyMeGOjdYh0fB/xbgYn2jYM1cWGpanwtXq5aDVEHPL2M803UMyyAM2F/HmysRdr50VFYQJQ4It77H+vajtyZGL7ztuAuY2AD10v5PctGk9JYt/EQWbYwdtCSdUNHublwiLRzvyvT/LT//MzsELuMdKzaVPQn01PMjZNerOsBszgscfBEOCSJtLaCjlQwvONWFCx9xP0kVBfMQo55XQaD9cSyswVnigzTihGaozU+149ZTWpB57Xd0tAsQ05GJIatbB+8FGKmFkFjvkxBSGXkL3AYN0engOpZyR4/KHrmKlSu2oCLbJMqvdnjlm/V41IV8UiMCgRCF/kMvHcHtgnBezV36+0eRMEWW7Vz7GAh7Eeqo3kcSTFG6OHD5kkiVoDW+S64cunHcb6k8sM+N/WxadpFV+jzzK/ovoDOA7G7gxhzgONJ8BYC2azZmRf+JfhosbRTh7PgMEi1srOlmJSZAzbE4IVJYcqOWbIdHO+v+1scYepPvzOuCXlD98UNhOOtAaMKXy9zGz2IYgbcswHOdUFnD2ei53pGz5QLJ0VLh7ZpBYfEEVB7uKbKsGuWB8= +jobs: + include: + - stage: deploy + python: 3.7 + install: + - pip install nbconvert + - pip install mkdocs pymdown-extensions mknotebooks mkdocs-material + script: + - mkdocs build + deploy: &gh-pages + provider: pages + local_dir: site + skip_cleanup: true + keep_history: true + fqdn: nanocompore.rna.rocks + on: + branch: master + github_token: + secure: "v/yqJd0NF/4vgOWFOYuaxD8bUd9XESLc/K1H0Uer/Fwc/gQqzSJZ2CsqYeoI1dxOM9dNcQDVGi4bLTx0mXLI5fwmNbI3aLbBEBD5s3KCZ3PT0woFwnorPnpNHsDPm9aDAwg4Rc7S9ljdLByKBBJTItNBi0qD8nVaiVi0pGwM2pokjefkuM/tr0yUFMBxkjGOrNnboSd9JSSF+42uBN1ajb229jCiqMJWw0Q1K+B/+4KHBKaUgM0Bwt36kuCwQaZdoybxmMkdAK5hWaM8WBTQ7QIGqgHwkXQ1RkXK4dd1wE3SxfX4UNJb9SuiTslQn3rZGOJ9EA9XEGvePhI+G8w8HmxH3ps+y4QQLGf39mFLGJZgmHMzNoBfbpL8AKhk8g+T+bF4eVaJArXLy58dLKs97gBc6XJjxnqMn/we4roTHYHa1jSlTRlXXunmM8BNLXb8zck2nGUbBrtGPUvL7ZUs3LrlPN9AKLas4swWTJoPxFym0OG/0JXMu8nc/t7bc4jn7usZxThXvsjILUCocvjnvJ98NAxrCHDjFFJ0VJzNvhSx0PY2/VtIYe9wf6/Geb0NXrDHPImj/qlGewHN9s30YmOXf8TiEOKgBR3u3M1gy2jYtzXxlD5N6xWDFFteSZbc2uCiaiPvf9F+6Bo4q0jr5gcuEik9QJ6YffkrQ5NOO18=" diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 120000 index 0000000..1bed66b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1 @@ +docs/changelog.md \ No newline at end of file diff --git a/README.md b/README.md index 6f18c52..3975be4 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![MIT license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/) [![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) +[![Build Status](https://travis-ci.com/tleonardi/nanocompore.svg?token=2uTrW9fP9RypfMALjksc&branch=master)](https://travis-ci.com/tleonardi/nanocompore) --- diff --git a/docs/CNAME b/docs/CNAME new file mode 100644 index 0000000..6ad1a20 --- /dev/null +++ b/docs/CNAME @@ -0,0 +1 @@ +nanocompore.rna.rocks diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 0000000..eacad74 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,10 @@ +# Changelog + +## [Unreleased] + +### Added +- Continuous testing with Travis CI +- Automatic deployment of docs to gh-pages + +### Fixed +- Fixed "Not enough p-values" error. Issue #68 diff --git a/docs/demo/SampCompDB_usage.ipynb b/docs/demo/SampCompDB_usage.ipynb index a81bc05..8a957af 100644 --- a/docs/demo/SampCompDB_usage.ipynb +++ b/docs/demo/SampCompDB_usage.ipynb @@ -11,31 +11,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Whether ran from the CLI or API `SampComp` creates a python object database (shelve DBM) containing the statistical analysis results.\n", - "\n", - "The API directly returns a `SampCompDB` object wrapping the shelve. It is also possible to reload the `SampCompDB` latter using the db file path prefix. `SampCompDB` also need a FASTA file to get the corresponding reference id sequence and accept an optional BED file containing genomic annotations. SampCompDB provide a large selection of simple high level function to plot and export the results.\n", + "`SampComp` creates a python object database (shelve DBM) containing the statistical analysis results. The API directly returns a `SampCompDB` object wrapping the shelve DB. It is also possible to reload the `SampCompDB` latter using the db file path prefix. `SampCompDB` also need a FASTA file to get the corresponding reference id sequence and accept an optional BED file containing genomic annotations. SampCompDB provide a large selection of simple high level function to plot and export the results.\n", "\n", "At the moment `SampCompDB` is only accessible through the python API." ] }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "## Import the package" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:14:45.955872Z", - "start_time": "2019-04-04T13:14:45.950515Z" + "end_time": "2019-04-09T08:29:14.953835Z", + "start_time": "2019-04-09T08:29:13.777126Z" }, - "hidden": true, "init_cell": true }, "outputs": [], @@ -45,22 +40,19 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "## Load the database with SampCompDB" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:14:45.971158Z", - "start_time": "2019-04-04T13:14:45.961679Z" + "end_time": "2019-04-09T08:29:14.963282Z", + "start_time": "2019-04-09T08:29:14.956360Z" }, - "hidden": true, "init_cell": true }, "outputs": [ @@ -110,22 +102,19 @@ }, { "cell_type": "markdown", - "metadata": { - "hidden": true - }, + "metadata": {}, "source": [ "### Basic initialisation" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T12:30:23.034275Z", - "start_time": "2019-04-04T12:30:22.627694Z" + "end_time": "2019-04-09T08:31:18.357396Z", + "start_time": "2019-04-09T08:31:18.001414Z" }, - "hidden": true, "scrolled": true }, "outputs": [ @@ -190,22 +179,19 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "### save_report" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:14:45.995943Z", - "start_time": "2019-04-04T13:14:45.978331Z" + "end_time": "2019-04-09T08:29:14.973358Z", + "start_time": "2019-04-09T08:29:14.965326Z" }, - "hidden": true, "init_cell": true }, "outputs": [ @@ -244,8 +230,7 @@ "ExecuteTime": { "end_time": "2019-04-04T11:20:37.189459Z", "start_time": "2019-04-04T11:20:36.447276Z" - }, - "hidden": true + } }, "outputs": [ { @@ -278,22 +263,19 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "### save_shift_stats" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:14:46.023214Z", - "start_time": "2019-04-04T13:14:46.001448Z" + "end_time": "2019-04-09T08:29:14.984020Z", + "start_time": "2019-04-09T08:29:14.976504Z" }, - "hidden": true, "init_cell": true }, "outputs": [ @@ -332,8 +314,7 @@ "ExecuteTime": { "end_time": "2019-04-04T11:21:26.155275Z", "start_time": "2019-04-04T11:21:25.428657Z" - }, - "hidden": true + } }, "outputs": [ { @@ -373,11 +354,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:14:46.051574Z", - "start_time": "2019-04-04T13:14:46.029914Z" + "end_time": "2019-04-09T08:29:14.992413Z", + "start_time": "2019-04-09T08:29:14.985560Z" }, "init_cell": true, "scrolled": true @@ -467,7 +448,7 @@ } }, "source": [ - "`SampCompDB` comes with a range of methods to visualise the data and explore candidate.\n", + "`SampCompDB` comes with a range of methods to visualise the data and explore candidates.\n", "\n", "* **`plot_pvalue`**: Plot the `-log(10)` of the pvalues obtained for all the statistical methods at reference level\n", "* **`plot_signal`**: Generate comparative plots of both median intensity and dwell time for each condition at read level \n", @@ -480,7 +461,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Extra import for the plotting library" + "### Extra imports for the plotting library" ] }, { @@ -492,11 +473,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:14:46.061973Z", - "start_time": "2019-04-04T13:14:46.055803Z" + "end_time": "2019-04-09T08:29:15.003057Z", + "start_time": "2019-04-09T08:29:14.994224Z" }, "init_cell": true }, @@ -508,22 +489,19 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "### plot_pvalue" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:14:46.084314Z", - "start_time": "2019-04-04T13:14:46.066396Z" + "end_time": "2019-04-09T08:29:15.013829Z", + "start_time": "2019-04-09T08:29:15.004501Z" }, - "hidden": true, "init_cell": true }, "outputs": [ @@ -556,7 +534,7 @@ "\n", "* **figsize** *: tuple of 2 int (default = (30, 10))*\n", "\n", - "length and heigh of the output plot\n", + "Length and heigh of the output plot\n", "\n", "* **palette** *: str (default = Set2)*\n", "\n", @@ -583,6 +561,15 @@ "jhelp(SampCompDB.plot_pvalue)" ] }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "#### Examples from simulated dataset" + ] + }, { "cell_type": "code", "execution_count": 25, @@ -641,6 +628,15 @@ "fig, ax = db.plot_pvalue (\"ref_0001\", palette=\"Set1\")" ] }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true + }, + "source": [ + "#### Example from real yeast dataset with extended sequence context" + ] + }, { "cell_type": "code", "execution_count": 35, @@ -681,14 +677,15 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:14:46.095696Z", - "start_time": "2019-04-04T13:14:46.088759Z" + "end_time": "2019-04-09T08:29:15.023983Z", + "start_time": "2019-04-09T08:29:15.016069Z" }, "hidden": true, - "init_cell": true + "init_cell": true, + "scrolled": false }, "outputs": [ { @@ -724,7 +721,7 @@ "\n", "* **figsize** *: tuple of 2 int (default = (30, 10))*\n", "\n", - "length and heigh of the output plot\n", + "Length and heigh of the output plot\n", "\n", "* **palette** *: str (default = Set2)*\n", "\n", @@ -747,6 +744,16 @@ "jhelp(SampCompDB.plot_signal)" ] }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true + }, + "source": [ + "#### Examples from simulated dataset" + ] + }, { "cell_type": "code", "execution_count": 28, @@ -784,7 +791,8 @@ "end_time": "2019-04-04T13:21:12.726838Z", "start_time": "2019-04-04T13:21:07.617704Z" }, - "hidden": true + "hidden": true, + "scrolled": true }, "outputs": [ { @@ -805,6 +813,16 @@ "fig, ax = db.plot_signal (\"ref_0001\", start=100, end=125, kind=\"swarmplot\")" ] }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true + }, + "source": [ + "#### Example from real yeast dataset" + ] + }, { "cell_type": "code", "execution_count": 36, @@ -813,7 +831,8 @@ "end_time": "2019-04-04T13:28:53.825945Z", "start_time": "2019-04-04T13:28:47.621354Z" }, - "hidden": true + "hidden": true, + "scrolled": true }, "outputs": [ { @@ -836,9 +855,7 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "### plot_coverage" ] @@ -851,7 +868,7 @@ "end_time": "2019-04-04T13:10:03.748490Z", "start_time": "2019-04-04T13:10:03.714829Z" }, - "hidden": true + "scrolled": false }, "outputs": [ { @@ -902,6 +919,13 @@ "jhelp(SampCompDB.plot_coverage)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example from real yeast dataset" + ] + }, { "cell_type": "code", "execution_count": 39, @@ -909,8 +933,7 @@ "ExecuteTime": { "end_time": "2019-04-04T13:30:22.958425Z", "start_time": "2019-04-04T13:30:18.554490Z" - }, - "hidden": true + } }, "outputs": [ { @@ -933,9 +956,7 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "### plot_kmers_stats" ] @@ -947,8 +968,7 @@ "ExecuteTime": { "end_time": "2019-04-04T13:11:08.631415Z", "start_time": "2019-04-04T13:11:08.622147Z" - }, - "hidden": true + } }, "outputs": [ { @@ -999,6 +1019,13 @@ "jhelp(SampCompDB.plot_kmers_stats)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example from real yeast dataset" + ] + }, { "cell_type": "code", "execution_count": 40, @@ -1006,8 +1033,7 @@ "ExecuteTime": { "end_time": "2019-04-04T13:31:33.675275Z", "start_time": "2019-04-04T13:31:29.375927Z" - }, - "hidden": true + } }, "outputs": [ { @@ -1030,9 +1056,7 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "### plot_position " ] @@ -1045,7 +1069,6 @@ "end_time": "2019-04-04T13:11:21.723642Z", "start_time": "2019-04-04T13:11:21.714040Z" }, - "hidden": true, "scrolled": true }, "outputs": [ @@ -1125,6 +1148,13 @@ "jhelp(SampCompDB.plot_position)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example from simulated dataset" + ] + }, { "cell_type": "code", "execution_count": 41, @@ -1132,8 +1162,7 @@ "ExecuteTime": { "end_time": "2019-04-04T13:32:29.812361Z", "start_time": "2019-04-04T13:32:27.203940Z" - }, - "hidden": true + } }, "outputs": [ { diff --git a/docs/demo/SampComp_usage.ipynb b/docs/demo/SampComp_usage.ipynb index 291a662..05d7081 100644 --- a/docs/demo/SampComp_usage.ipynb +++ b/docs/demo/SampComp_usage.ipynb @@ -23,7 +23,7 @@ } }, "source": [ - "`SampComp` first parses the sample eventalign collapse files and pileups the observed results per reference at position level. Then, positions are compared using various statistical methods and the statistics are stored in a shelve DBM database containing the results for all positions with sufficient coverage. The API returns a `SampCompDB` database wrapper object that can be subsequently interrogated to extract data and plots." + "First, `SampComp` parses the sample eventalign collapse files and then the observed results are piled-up per reference at position level. In a second time, positions are compared using various statistical methods and the statistics are stored in a shelve DBM database containing the results for all positions with sufficient coverage. The API returns a `SampCompDB` database wrapper object that can be subsequently interrogated to extract data and plots." ] }, { @@ -70,18 +70,14 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "## Description of main options" ] }, { "cell_type": "markdown", - "metadata": { - "hidden": true - }, + "metadata": {}, "source": [ "`SampComp` provides a very flexible analysis framework with a few mandatory options and many optional parameters. The full CLI and API documentations are provided at the bottom of this page." ] @@ -89,15 +85,15 @@ { "cell_type": "markdown", "metadata": { - "hidden": true + "heading_collapsed": true }, "source": [ "#### Sample files\n", "\n", - "`SampComp` requires sample files obtained with `NanopolishComp EventalignCollapse` as explained before (see [data preparation](data_preparation.md)) for both the control and the experimental conditions. 2 conditions are expected, and at least 2 sample replicates per conditions are highly recommended. If `SampComp` is called through the CLI the files can be provides using either relevant command options or a YAML file. If using the Python API, one can pass either a python dictionary or a YAML file.\n", + "`SampComp` requires sample files obtained with `NanopolishComp EventalignCollapse` as explained before (see [data preparation](data_preparation.md)) for both the control and the experimental conditions. 2 conditions are expected and at least 2 replicates per conditions are highly recommended. If `SampComp` is called through the CLI the files can be provided using either relevant command options or a YAML file. With the Python API you can pass either a python dictionary or a YAML file.\n", "\n", "!!! info \"YAML file option (CLI or API)\"\n", - " This option allows to pass a YAML formatted file indicating the sample condition labels and paths to data files with the option `--sample_yaml` for the CLI or directly to `eventalign_fn_dict` for the API. The file should be formatted as follow:\n", + " This option allows to pass a YAML formatted file indicating the sample condition labels and paths to data files with the option `--sample_yaml` for the CLI or directly with `eventalign_fn_dict` for the API. The file should be formatted as follow:\n", "\n", " ```yaml\n", " WT:\n", @@ -125,9 +121,7 @@ }, { "cell_type": "markdown", - "metadata": { - "hidden": true - }, + "metadata": {}, "source": [ "#### Transcriptome reference FASTA file\n", "\n", @@ -136,31 +130,25 @@ }, { "cell_type": "markdown", - "metadata": { - "hidden": true - }, + "metadata": {}, "source": [ "#### Output folder\n", "\n", - "Although it is not mandatory, it is recommended to provide a path to a directory where the program will output the result files (CLI: `--outpath`, API: `outpath`). In addition, users can also specify a prefix for the files to be generated (CLI: `--outprefix`, API: `outprefix`). Finally, if the outpath directory already exists, the program will raise an error to prevent erasing result files accidentally. To ignore the error one as to specify to overwrite previous results (CLI: `--outprefix`, API: `outprefix`)." + "Although it is not mandatory, it is recommended to provide a path to a directory where the program will output the result files (CLI: `--outpath`, API: `outpath`). In addition, users can also specify a prefix for the files to be generated (CLI: `--outprefix`, API: `outprefix`). Finally, if the outpath directory already exists, the program will raise an error to avoid erasing result files. To ignore the error you can to specify to overwrite previous results (CLI: `--overwrite`, API: `overwrite`)." ] }, { "cell_type": "markdown", - "metadata": { - "hidden": true - }, + "metadata": {}, "source": [ "#### Genome annotation BED file\n", "\n", - "Optionally, a BED file containing the genome annotations corresponding to the transcriptome fasta file can be provided. If this file is given, Nanocompore will also convert the transcript coordinates into the genome space (CLI: `--bed`, API: `bed_fn`)" + "Optionally, a BED file containing the genome annotations corresponding to the transcriptome fasta file can be provided. In that case Nanocompore will also convert the transcript coordinates into the genome space (CLI: `--bed`, API: `bed_fn`)" ] }, { "cell_type": "markdown", - "metadata": { - "hidden": true - }, + "metadata": {}, "source": [ "#### Statistical options\n", "\n", @@ -176,25 +164,21 @@ }, { "cell_type": "markdown", - "metadata": { - "hidden": true - }, + "metadata": {}, "source": [ "#### Coverage options\n", "\n", - "The default coverage threshold for `SampComp` to perform a statistical test is 50 reads in each replicates. This is quite conservative and can be modified if needed (`min_coverage`). In addition, to reduce the computational burden it is possible to randomly down-sample the number of reads for high coverage references (`downsample_high_coverage`)." + "The default coverage threshold for `SampComp` to perform a statistical test is 30 reads in each replicates. This is quite conservative and can be modified if needed (`min_coverage`). In addition, to reduce the computational burden it is possible to randomly down-sample the number of reads for high coverage references (`downsample_high_coverage`)." ] }, { "cell_type": "markdown", - "metadata": { - "hidden": true - }, + "metadata": {}, "source": [ "#### Manually exclude or include references (API only)\n", "\n", "The API allows to specify references to be included or excluded from the analysis (`select_ref_id` and\n", - "`exclude_ref_id`). This can be useful to analyse a specific transcripts set only or to run a small test before analysing the entire dataset." + "`exclude_ref_id`). This can be useful to analyse a specific set of transcripts or to run a small test before analysing the whole dataset." ] }, { @@ -213,11 +197,11 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2019-04-04T13:22:31.706568Z", - "start_time": "2019-04-04T13:22:31.703468Z" + "end_time": "2019-04-09T08:09:31.313892Z", + "start_time": "2019-04-09T08:09:29.989120Z" }, "init_cell": true }, @@ -619,9 +603,7 @@ }, { "cell_type": "markdown", - "metadata": { - "heading_collapsed": true - }, + "metadata": {}, "source": [ "### CLI documentation " ] @@ -632,8 +614,7 @@ "ExecuteTime": { "end_time": "2019-04-03T11:45:31.904837Z", "start_time": "2019-04-03T11:45:31.878004Z" - }, - "hidden": true + } }, "source": [ "```bash\n", diff --git a/docs/usage.md b/docs/usage.md index 9e75473..c04882c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,17 +6,17 @@ Nanocompore was designed to be used either through a python API or a command lin This is the main module which compares the signal of 2 experimental conditions. It takes Nanopolished datasets as input and generates a database containing all the results as output. `SampComp` has to be run first either through the API (`nanocompore.SampComp.SampComp`) or the CLI (`nanocompore sampcomp`). -* [SampComp Usage](https://github.com/tleonardi/nanocompore/blob/master/docs/demo/SampComp_usage.ipynb) +* [SampComp Usage](https://nanocompore.rna.rocks/demo/SampComp_usage/) ### SampCompDB `SampCompDB` is a wrapper around the DBM object database generated by `SampComp`. This module performs secondary statistical analyses and provide simple high level functions to plot, explore and export the results. At the moment `SampCompDB` is only accessible through the interactive python API (`nanocompore.SampCompDB.SampCompDB`). We strongly recommend to use [jupyter notebook](https://jupyter.org/). -* [SampCompDB Usage](https://github.com/tleonardi/nanocompore/blob/master/docs/demo/SampComp_usage.ipynb) +* [SampCompDB Usage](https://nanocompore.rna.rocks/demo/SampCompDB_usage/) ### SimReads This module can be used to generate artificial datasets based on a model file obtained from IVT generated RNA sequenced by direct RNA sequencing ([Datasets](https://github.com/nanopore-wgs-consortium/NA12878/blob/master/nanopore-human-transcriptome/fastq_fast5_bulk.md), from the Nanopore RNA consortium). In addition, one can also simulate the presence of modifications by allowing to deviate from the model for selected positions. -* [Simulate_reads Usage](https://github.com/tleonardi/nanocompore/blob/master/docs/demo/SimReads_usage.ipynb) +* [Simulate_reads Usage](https://nanocompore.rna.rocks/demo/SimReads_usage) diff --git a/mkdocs.yml b/mkdocs.yml index e98f17c..253a543 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -29,10 +29,10 @@ nav: - Generate plots and tables with SampCompDB: demo/SampCompDB_usage.ipynb - Generate simulated reads: demo/SimReads_usage.ipynb - Alternative and complementary packages: alternative.md - - Getting involved and finding help: - - Contributing: contributing.md - - Code of conduct: code_of_conduct.md - - Licence: licence.md + - Contributing: contributing.md + - Code of conduct: code_of_conduct.md + - Licence: licence.md + - Changelog: changelog.md # Theme customization theme: diff --git a/nanocompore/TxComp.py b/nanocompore/TxComp.py index 1f11c41..57cf96c 100644 --- a/nanocompore/TxComp.py +++ b/nanocompore/TxComp.py @@ -104,32 +104,37 @@ def txCompare( for pos_dict in ref_pos_list: if 'txComp' in pos_dict: for test in tests: - if test in pos_dict['txComp']: - pval_list_dict[test].append(pos_dict['txComp'][test]) + pval_list_dict[test].append(pos_dict['txComp'][test]) + elif pos_dict["lowCov"]: + for test in tests: + pval_list_dict[test].append(np.nan) # Compute cross correlation matrix per test corr_matrix_dict = OrderedDict() for test in tests: corr_matrix_dict[test] = cross_corr_matrix(pval_list_dict[test], sequence_context) logger.debug("Combine adjacent position pvalues with Hou's method position per position") - # Iterate over each positions in previously generated result dictionnary + # Iterate over each positions in previously generated result dictionary for mid_pos in range(len(ref_pos_list)): # Perform test only if middle pos is valid if not ref_pos_list[mid_pos]["lowCov"]: pval_list_dict = defaultdict(list) for pos in range(mid_pos-sequence_context, mid_pos+sequence_context+1): - # If any the positions is missing or any of the pvalues in the context is lowCov or NaN, consider it 1 - if pos < 0 or pos >= len(ref_pos_list) or ref_pos_list[pos]["lowCov"]: - for test in tests: + for test in tests: + # If any of the positions is missing or any of the pvalues in the context is lowCov or NaN, consider it 1 + if pos < 0 or pos >= len(ref_pos_list) or ref_pos_list[pos]["lowCov"] or np.isnan(ref_pos_list[pos]["txComp"][test]): pval_list_dict[test].append(1) - # else just extract the corresponding pvalue - else: - for test in tests: + # else just extract the corresponding pvalue + else: pval_list_dict[test].append(ref_pos_list[pos]["txComp"][test]) - # Combine collected pvalues add add to dict + # Combine collected pvalues and add to dict for test in tests: test_label = "{}_context_{}".format(test, sequence_context) - ref_pos_list[mid_pos]['txComp'][test_label] = combine_pvalues_hou(pval_list_dict[test], weights, corr_matrix_dict[test]) + # If the mid p-value is.nan, force to nan also the context p-value + if np.isnan(ref_pos_list[mid_pos]["txComp"][test]): + ref_pos_list[mid_pos]['txComp'][test_label] = np.nan + else: + ref_pos_list[mid_pos]['txComp'][test_label] = combine_pvalues_hou(pval_list_dict[test], weights, corr_matrix_dict[test]) return ref_pos_list @@ -337,6 +342,7 @@ def cross_corr_matrix(pvalues_vector, context=2): def combine_pvalues_hou(pvalues, weights, cor_mat): """ Hou's method for the approximation for the distribution of the weighted combination of non-independent or independent probabilities. + If any pvalue is nan, returns nan. https://doi.org/10.1016/j.spl.2004.11.028 pvalues: list of pvalues to be combined weights: the weights of the pvalues diff --git a/nanocompore/__init__.py b/nanocompore/__init__.py index a09818a..9c99dcc 100755 --- a/nanocompore/__init__.py +++ b/nanocompore/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Define self package variable -__version__ = "1.0.0rc1" +__version__ = "1.0.0rc2" description = 'Software package that identifies raw signal changes between two conditions from https://github.com/jts/nanopolish resquiggled dRNA-Seq data.' # Collect info in a dictionary for setup.py @@ -13,7 +13,7 @@ "author": 'Tommaso Leonardi and Adrien Leger', "author_email": 'tom {at} tleo.io / aleg {at} ebi.ac.uk', "license": "MIT", - "python_requires":'>=3.3', + "python_requires":'>=3.5', "classifiers": [ 'Development Status :: 3 - Alpha', 'Intended Audience :: Science/Research', diff --git a/tests/test_Integration.py b/tests/test_Integration.py index 901c354..c76deb3 100644 --- a/tests/test_Integration.py +++ b/tests/test_Integration.py @@ -8,21 +8,25 @@ import hashlib import sys import random +from os import environ + +# Check if the tests are running inside Travis +travis = True if 'TRAVIS' in os.environ else False @pytest.fixture(scope="module") -def fasta_file(tmpdir_factory): - fasta_file = tmpdir_factory.mktemp("fasta").join("reference.fa") +def fasta_file(tmp_path_factory): + fasta_file = tmp_path_factory.mktemp("fasta") / "reference.fa" random.seed(42) - with open(fasta_file, 'w') as f: + with open(str(fasta_file), 'w') as f: for n in range(0,1): f.write('>Ref_00{}\n'.format(n)) f.write("".join([random.choice("ACGT") for _ in range(0,random.randint(100, 2000))])+"\n") return(str(fasta_file)) @pytest.fixture(scope="module") -def nanopolishcomp_test_files(tmpdir_factory, fasta_file): +def nanopolishcomp_test_files(tmp_path_factory, fasta_file): """ Generate simulated data with SimReads() """ - tmp_path=tmpdir_factory.mktemp("generated_data") + tmp_path=tmp_path_factory.mktemp("generated_data") data_rand_seed=869 fn_dict={'S1':{}, 'S2':{}} for rep in [1,2,3,4]: @@ -44,7 +48,7 @@ def nanopolishcomp_test_files(tmpdir_factory, fasta_file): not_bound=True, log_level="debug", overwrite=True) - + SimReads ( fasta_fn=fasta_file, outpath=str(tmp_path), @@ -70,6 +74,8 @@ def nanopolishcomp_test_files(tmpdir_factory, fasta_file): @pytest.mark.parametrize("context", [2,3]) @pytest.mark.parametrize("context_weight", ["uniform", "harmonic"]) def test_sig_sites(nanopolishcomp_test_files, method, context, context_weight): + if travis and (method != "GMM" or context != 2 or context_weight != "uniform"): + pytest.skip() fasta_file, fn_dict, tmp_path = nanopolishcomp_test_files s = SampComp(eventalign_fn_dict=fn_dict, outpath=tmp_path, diff --git a/tests/test_SampCompDB.py b/tests/test_SampCompDB.py index 122ece9..30c6879 100644 --- a/tests/test_SampCompDB.py +++ b/tests/test_SampCompDB.py @@ -5,16 +5,16 @@ tol=10e-6 @pytest.mark.parametrize("pvalues", [ - ( - [0.1,0.2,0.3,0.5], + ( + [0.1,0.2,0.3,0.5], [0.4, 0.4, 0.4, 0.5] ), ( - [0.1, 0.01, np.nan, 0.01, 0.5, 0.4, 0.01, 0.001, np.nan, np.nan, 0.01, np.nan], + [0.1, 0.01, np.nan, 0.01, 0.5, 0.4, 0.01, 0.001, np.nan, np.nan, 0.01, np.nan], [0.13333333, 0.016, np.nan, 0.016, 0.5, 0.45714286, 0.016, 0.008, np.nan, np.nan, 0.016, np.nan] ), ( - [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan] ), ( diff --git a/tests/test_TxComp.py b/tests/test_TxComp.py index 3b2a70e..169828e 100644 --- a/tests/test_TxComp.py +++ b/tests/test_TxComp.py @@ -3,6 +3,7 @@ from scipy.stats import combine_pvalues import numpy as np from unittest import mock +import sys @pytest.mark.parametrize("pvalues", [ @@ -32,6 +33,7 @@ def test_combine_pvalues_raises_exception_with_invalid_pvalues(pvalues): with pytest.raises(NanocomporeError): combine_pvalues_hou(pvalues, weights, cor_mat) + @pytest.mark.parametrize("v1, v2, expected", [ ( (1,4,3,2,9), (1,7,8,8,5), (0.4619, 0.3277, 0.3291)) ]) @@ -86,17 +88,20 @@ def test_ref_pos_list(): } # These expected values have been checked against the R implementations of aov() and glm(family="binomial") expected = {'GMM_anova': [0.0008574768473501677, 0.0036329291397528157, 0.007312047981252302, 0.0010906844025473576, np.nan, 0.004197519576768562, 0.004730678586860965, 0.0028228474915020945, 0.0023262178697710987, 0.00020764199465021126], - 'GMM_logit': [1.274245328765287e-39, 3.3968653938213694e-40, 1.9321679678623975e-36, 8.482777798353687e-40, np.nan, 7.06503867181238e-40, 1.839272092115274e-40, 9.162002495725215e-32, 5.922884891638699e-34, 3.1972432623454785e-40] + 'GMM_logit': [1.274245328765287e-39, 3.3968653938213694e-40, 1.9321679678623975e-36, 8.482777798353687e-40, np.nan, 7.06503867181238e-40, 1.839272092115274e-40, 9.162002495725215e-32, 5.922884891638699e-34, 3.1972432623454785e-40] } return((test_ref_pos_list, expected)) def test_txComp_GMM_anova(test_ref_pos_list): ml = mock.Mock() - tol=0.00000001 + if sys.version_info < (3, 6): + tol = 0.001 + else: + tol=0.00000001 res = txCompare(test_ref_pos_list[0], methods=['GMM'], logit=False, sequence_context=2, min_coverage=3, logger=ml, allow_warnings=False, random_state=np.random.RandomState(seed=42)) GMM_pvalues = [pos['txComp']['GMM_anova_pvalue'] for pos in res ] - assert GMM_pvalues == [pytest.approx(i, abs=tol, nan_ok=True) for i in test_ref_pos_list[1]['GMM_anova']] + assert GMM_pvalues == [pytest.approx(i, abs=tol, nan_ok=True) for i in test_ref_pos_list[1]['GMM_anova']] def test_txComp_GMM_logit(test_ref_pos_list): ml = mock.Mock() @@ -107,7 +112,7 @@ def test_txComp_GMM_logit(test_ref_pos_list): print(test_ref_pos_list[1]['GMM_logit']) print([pos['txComp']['GMM_model']['cluster_counts'] for pos in res ]) - assert GMM_logit == [pytest.approx(i, abs=tol, nan_ok=True) for i in test_ref_pos_list[1]['GMM_logit']] + assert GMM_logit == [pytest.approx(i, abs=tol, nan_ok=True) for i in test_ref_pos_list[1]['GMM_logit']] @pytest.fixture def test_ref_pos_list_0_var(): @@ -149,7 +154,6 @@ def test_ref_pos_list_0_var(): def test_txComp_GMM_anova_0_var(test_ref_pos_list_0_var): ml = mock.Mock() - tol=0.000000001 with pytest.raises(NanocomporeError): txCompare(test_ref_pos_list_0_var, methods=['GMM'], logit=False, sequence_context=2, min_coverage=3, logger=ml, allow_warnings=False, random_state=np.random.RandomState(seed=42)) @@ -189,3 +193,21 @@ def test_txComp_GMM_dup_lab(test_ref_pos_list_dup_lab): ml = mock.Mock() with pytest.raises(NanocomporeError): txCompare(test_ref_pos_list_dup_lab, methods=['GMM'], logit=False, sequence_context=2, min_coverage=3, logger=ml, allow_warnings=False, random_state=np.random.RandomState(seed=42)) + +def test_txComp_lowCov(test_ref_pos_list): + """ This test ensures that txCompare runs also when the number of covered positions + in a reference is below the threshold + """ + test_ref_pos_list = test_ref_pos_list[0] + low_cov_positions = [0,1,5] + for pos in low_cov_positions: + test_ref_pos_list[pos]['data']['WT']['WT1']['coverage'] = 1 + ml = mock.Mock() + results = txCompare(test_ref_pos_list, methods=['GMM'], logit=False, sequence_context=2, min_coverage=30, logger=ml, allow_warnings=False, random_state=np.random.RandomState(seed=42)) + for pos in results: + if 'txComp' in pos: + # If the original p-value was nan, the context p-value also has to be nan + if np.isnan(pos['txComp']['GMM_anova_pvalue']): + assert np.isnan(pos['txComp']['GMM_anova_pvalue_context_2']) + else: + assert not np.isnan(pos['txComp']['GMM_anova_pvalue_context_2']) diff --git a/tests/test_Whitelist.py b/tests/test_Whitelist.py index 1661344..bc5a0f2 100644 --- a/tests/test_Whitelist.py +++ b/tests/test_Whitelist.py @@ -8,9 +8,9 @@ @pytest.fixture(scope="module") -def fasta_file(tmpdir_factory): - fasta_file = tmpdir_factory.mktemp("fasta").join("reference.fa") - with open(fasta_file, 'w') as f: +def fasta_file(tmp_path_factory): + fasta_file = tmp_path_factory.mktemp("fasta") / "reference.fa" + with open(str(fasta_file), 'w') as f: f.write('>Ref_001\n') f.write('A'*1000+'\n') f.write('>Ref_002\n') @@ -18,8 +18,8 @@ def fasta_file(tmpdir_factory): return(str(fasta_file)) @pytest.fixture(scope="module") -def nanopolishcomp_test_files(tmpdir_factory, fasta_file): - tmp_path=tmpdir_factory.mktemp("generated_data") +def nanopolishcomp_test_files(tmp_path_factory, fasta_file): + tmp_path=tmp_path_factory.mktemp("generated_data") SimReads(fasta_fn=fasta_file, outpath=str(tmp_path), outprefix="reads", overwrite=True) fn_dict={"S1": { "R1": str(tmp_path / "reads.tsv"),