From edecd98ad1698b72014976fb36e1d6ab5efe01b4 Mon Sep 17 00:00:00 2001 From: Malte Benedikt Kuehl Date: Wed, 27 Nov 2024 16:59:42 +0100 Subject: [PATCH] Improve test coverage, fix abundance recomputing bug after biotype filtering and fix oarfish import bug when colunns have the same name --- pyproject.toml | 2 +- pytximport/_cli.py | 6 +-- pytximport/core/_tximport.py | 8 +++- pytximport/importers/_read_tsv.py | 2 +- pytximport/utils/_filter_by_biotype.py | 12 ++--- test/data/piscem/res_oarfish_naming.quant | 15 +++++++ test/test_biotype_filter.py | 54 ++++++++++++++++------- test/test_correctness.py | 2 +- test/test_piscem.py | 21 +++++++-- test/test_transcript_level.py | 27 +++++++++--- 10 files changed, 109 insertions(+), 40 deletions(-) create mode 100644 test/data/piscem/res_oarfish_naming.quant diff --git a/pyproject.toml b/pyproject.toml index e958c87..8600bc0 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,7 +114,7 @@ markers = [ [tool.coverage.run] source = ["pytximport"] -omit = ["*/test/*"] +omit = ["*/test/*", "**/*/_cli.py"] [tool.coverage.report] exclude_lines = [ diff --git a/pytximport/_cli.py b/pytximport/_cli.py index f4a3f73..2d670a6 100644 --- a/pytximport/_cli.py +++ b/pytximport/_cli.py @@ -18,7 +18,7 @@ help="Welcome to the pytximport command-line interface for importing transcript-level quantification files.", ) @click.pass_context -def cli( # type: ignore +def cli( # type: ignore # pragma: no cover ctx: click.Context, ): """Welcome to the pytximport command-line interface for importing transcript-level quantification files.""" @@ -160,7 +160,7 @@ def cli( # type: ignore is_flag=True, help="Whether the existence of the files is optional.", ) -def run( # type: ignore +def run( # type: ignore # pragma: no cover **kwargs, ) -> None: """Call the tximport function via the command line.""" @@ -221,7 +221,7 @@ def run( # type: ignore is_flag=True, help="Provide this flag to keep the gene_biotype column as an additional column in the mapping file.", ) -def create_map( # type: ignore +def create_map( # type: ignore # pragma: no cover **kwargs, ) -> None: """Create a transcript-to-gene mapping file via the command line.""" diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py index e731b29..6d76c4b 100644 --- a/pytximport/core/_tximport.py +++ b/pytximport/core/_tximport.py @@ -114,7 +114,9 @@ def tximport( biotype_filter (List[str], optional): Filter the transcripts by biotype, including only those provided. Enables post-hoc filtering of the data based on the biotype of the transcripts. Assumes that the biotype is present in the transcript_id of the data, bar-separated. If this is not the case, please use the `filter_by_biotype` - function from the `pytximport.utils` module instead. Defaults to None. + function from the `pytximport.utils` module instead. Please note that the abundance will NOT be recalculated + after filtering to avoid introducing bias. If you wish to recalculate the abundance, please use the + `filter_by_biotype` function from the `pytximport.utils` module instead. Defaults to None. Returns: Union[xr.Dataset, ad.AnnData, SummarizedExperiment, None]: The estimated gene-level or transcript-level @@ -409,7 +411,9 @@ def tximport( if biotype_filter is not None: transcript_data = filter_by_biotype( - transcript_data, biotype_filter=biotype_filter, id_column=("gene_id" if gene_level else "transcript_id") + transcript_data, + biotype_filter=biotype_filter, + id_column=("gene_id" if gene_level else "transcript_id"), ) # Remove appended gene names after underscore for RSEM data for both transcript and gene ids diff --git a/pytximport/importers/_read_tsv.py b/pytximport/importers/_read_tsv.py index 044bcdd..67e2c61 100644 --- a/pytximport/importers/_read_tsv.py +++ b/pytximport/importers/_read_tsv.py @@ -108,7 +108,7 @@ def read_tsv( usecols.append(counts_column) dtype[counts_column] = np.float64 - if abundance_column is not None: + if abundance_column is not None and abundance_column not in usecols: usecols.append(abundance_column) dtype[abundance_column] = np.float64 diff --git a/pytximport/utils/_filter_by_biotype.py b/pytximport/utils/_filter_by_biotype.py index 44f637c..a2c418b 100644 --- a/pytximport/utils/_filter_by_biotype.py +++ b/pytximport/utils/_filter_by_biotype.py @@ -57,9 +57,11 @@ def filter_by_biotype( # quantification tools include the biotype in the transcript_id # This only works if the transcript_id contains the biotype as one of the bar-separated fields and is not the # best way to filter the transcripts by biotype - assert any( - "|" in transcript_id for transcript_id in transcript_ids - ), "The transcript_id does not contain the biotype." + assert any("|" in transcript_id for transcript_id in transcript_ids), ( + "The transcript_id column does not contain the biotype. Please use the `pytximport.utils.filter_by_biotype`" + " function with the `transcript_gene_map` argument instead. This function can be called after the" + " `tximport` function using the resulting AnnData object or xarray Dataset." + ) transcript_id_fields = [transcript_id.split("|") for transcript_id in transcript_ids] @@ -99,7 +101,7 @@ def filter_by_biotype( elif isinstance(transcript_data, ad.AnnData): # Calculate the total abundance before filtering - total_abundance = transcript_data.obsm["abundance"].sum(axis=0) + total_abundance = transcript_data.obsm["abundance"].sum(axis=1) transcript_data = transcript_data[:, transcript_keep_boolean] log( @@ -112,7 +114,7 @@ def filter_by_biotype( log(25, "Recalculating the abundance after filtering by biotype.") if isinstance(transcript_data, ad.AnnData): - new_abundance = transcript_data.obsm["abundance"].sum(axis=0) + new_abundance = transcript_data.obsm["abundance"].sum(axis=1) ratio = total_abundance / new_abundance transcript_data.obsm["abundance"] = (transcript_data.obsm["abundance"].T * ratio).T elif isinstance(transcript_data, xr.Dataset): diff --git a/test/data/piscem/res_oarfish_naming.quant b/test/data/piscem/res_oarfish_naming.quant new file mode 100644 index 0000000..724b3fc --- /dev/null +++ b/test/data/piscem/res_oarfish_naming.quant @@ -0,0 +1,15 @@ +tname len num_reads +ENST00000513300.5 1924 102.328 +ENST00000282507.7 2355 1592.02 +ENST00000504685.5 1476 68.6528 +ENST00000243108.4 1733 343.499 +ENST00000303450.4 1516 664 +ENST00000243082.4 2039 55 +ENST00000303406.4 1524 304.189 +ENST00000303460.4 1936 47 +ENST00000243056.4 2423 42 +ENST00000312492.2 1805 228 +ENST00000040584.5 1889 4295 +ENST00000430889.2 1666 623.628 +ENST00000394331.3 2943 85.6842 +ENST00000243103.3 3335 962 diff --git a/test/test_biotype_filter.py b/test/test_biotype_filter.py index 6cf932a..53e24fc 100644 --- a/test/test_biotype_filter.py +++ b/test/test_biotype_filter.py @@ -4,6 +4,7 @@ from typing import List import anndata as ad +import numpy as np import xarray as xr from pytximport import tximport @@ -31,22 +32,41 @@ def test_biotype_filter( return_transcript_data=transcript_level, ) - id_column = "transcript_id" if transcript_level else "gene_id" + for recalculate_abundance in [True, False]: + id_column = "transcript_id" if transcript_level else "gene_id" - result_filtered = filter_by_biotype( - result, - transcript_gene_map_mouse, - biotype_filter=["protein_coding"], - id_column=id_column, - ) + result_filtered = filter_by_biotype( + result, + transcript_gene_map_mouse, + biotype_filter=["protein_coding"], + id_column=id_column, + recalculate_abundance=recalculate_abundance, + ) + + if output_type == "anndata": + assert isinstance(result_filtered, ad.AnnData) + assert len(result_filtered.var_names) > 0, "No genes were retained." + assert len(result_filtered.var_names) < len(result.var_names), "All genes were retained." + + if recalculate_abundance: + np.testing.assert_allclose( + result_filtered.obsm["abundance"].sum(axis=1), + result.obsm["abundance"].sum(axis=1), + rtol=1e-4, + atol=1, + ) + + else: + assert isinstance(result_filtered, xr.Dataset) + assert len(result_filtered.coords[id_column]) > 0, "No genes were retained." + assert len(result_filtered.coords[id_column]) < len( + result.coords[id_column] + ), "All genes were retained." - if output_type == "anndata": - assert isinstance(result_filtered, ad.AnnData) - assert len(result_filtered.var_names) > 0, "No genes were retained." - assert len(result_filtered.var_names) < len(result.var_names), "All genes were retained." - else: - assert isinstance(result_filtered, xr.Dataset) - assert len(result_filtered.coords[id_column]) > 0, "No genes were retained." - assert len(result_filtered.coords[id_column]) < len( - result.coords[id_column] - ), "All genes were retained." + if recalculate_abundance: + np.testing.assert_allclose( + result_filtered["abundance"].sum(axis=0), + result["abundance"].sum(axis=0), + rtol=1e-4, + atol=1, + ) diff --git a/test/test_correctness.py b/test/test_correctness.py index 7809fa4..2f2cf3c 100644 --- a/test/test_correctness.py +++ b/test/test_correctness.py @@ -188,7 +188,7 @@ def test_correctness_inferential_replicates( ignore_transcript_version=True, ignore_after_bar=True, output_type="xarray", - counts_from_abundance=None, # type: ignore + counts_from_abundance=None, ) assert isinstance(result, xr.Dataset), "The result is not an xarray Dataset." diff --git a/test/test_piscem.py b/test/test_piscem.py index 228a3b2..69de374 100644 --- a/test/test_piscem.py +++ b/test/test_piscem.py @@ -1,14 +1,11 @@ """Test importing salmon quantification files.""" from pathlib import Path -from typing import List import anndata as ad import pandas as pd -import xarray as xr from pytximport import tximport -from pytximport.utils import biotype_filters def test_piscem( @@ -28,7 +25,23 @@ def test_piscem( output_type="anndata", ignore_transcript_version=True, ignore_after_bar=True, - counts_from_abundance=None, # type: ignore + counts_from_abundance=None, + ) + + # Check that the result is an AnnData object + assert isinstance(result, ad.AnnData) + + # Check that the counts.data are all positive + assert (result.X >= 0).all() + + result = tximport( + [piscem_file.parent / "res_oarfish_naming.quant"], + "oarfish", + transcript_gene_mapping_human, + output_type="anndata", + ignore_transcript_version=True, + ignore_after_bar=True, + counts_from_abundance=None, ) # Check that the result is an AnnData object diff --git a/test/test_transcript_level.py b/test/test_transcript_level.py index 8b347b1..80c561b 100644 --- a/test/test_transcript_level.py +++ b/test/test_transcript_level.py @@ -5,6 +5,7 @@ import anndata as ad import pandas as pd +import xarray as xr from pytximport import tximport from pytximport.utils import biotype_filters, replace_transcript_ids_with_names @@ -22,7 +23,7 @@ def test_salmon_transcript_level( transcript_name_mapping_human (pd.DataFrame): The mapping of transcript ids to transcript names. transcript_name_mapping_human_path (Path): The path to the transcript name mapping. """ - result = tximport( + result_ad = tximport( [salmon_file], "salmon", return_transcript_data=True, @@ -32,14 +33,14 @@ def test_salmon_transcript_level( counts_from_abundance="scaled_tpm", ) - assert isinstance(result, ad.AnnData) - counts = result.X + assert isinstance(result_ad, ad.AnnData) + counts = result_ad.X # Check that the counts.data are all positive assert (counts >= 0).all() # replace the transcript ids with the transcript names - result_df = replace_transcript_ids_with_names(result, transcript_name_mapping_human) + result_df = replace_transcript_ids_with_names(result_ad, transcript_name_mapping_human) # Check that the result is an AnnData object assert isinstance(result_df, ad.AnnData) @@ -55,10 +56,24 @@ def test_salmon_transcript_level( ) # Check that it also works with a path to the transcript name mapping - result_path = replace_transcript_ids_with_names(result, transcript_name_mapping_human_path) + result_replaced_ad = replace_transcript_ids_with_names(result_ad, transcript_name_mapping_human_path) # Check that the results are the same - pd.testing.assert_frame_equal(result_df.var_names.to_frame(), result_path.var_names.to_frame()) + pd.testing.assert_frame_equal(result_df.var_names.to_frame(), result_replaced_ad.var_names.to_frame()) + + # Check with an xarray Dataset + result_xr = tximport( + [salmon_file], + "salmon", + return_transcript_data=True, + output_type="xarray", + ignore_transcript_version=True, + ignore_after_bar=True, + counts_from_abundance="scaled_tpm", + ) + + result_replaced_xr = replace_transcript_ids_with_names(result_xr, transcript_name_mapping_human) + assert isinstance(result_replaced_xr, xr.Dataset) def test_dtu_scaled_tpm(