From edecd98ad1698b72014976fb36e1d6ab5efe01b4 Mon Sep 17 00:00:00 2001
From: Malte Benedikt Kuehl <malte.kuehl@clin.au.dk>
Date: Wed, 27 Nov 2024 16:59:42 +0100
Subject: [PATCH] Improve test coverage, fix abundance recomputing bug after
 biotype filtering and fix oarfish import bug when colunns have the same name

---
 pyproject.toml                            |  2 +-
 pytximport/_cli.py                        |  6 +--
 pytximport/core/_tximport.py              |  8 +++-
 pytximport/importers/_read_tsv.py         |  2 +-
 pytximport/utils/_filter_by_biotype.py    | 12 ++---
 test/data/piscem/res_oarfish_naming.quant | 15 +++++++
 test/test_biotype_filter.py               | 54 ++++++++++++++++-------
 test/test_correctness.py                  |  2 +-
 test/test_piscem.py                       | 21 +++++++--
 test/test_transcript_level.py             | 27 +++++++++---
 10 files changed, 109 insertions(+), 40 deletions(-)
 create mode 100644 test/data/piscem/res_oarfish_naming.quant

diff --git a/pyproject.toml b/pyproject.toml
index e958c87..8600bc0 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -114,7 +114,7 @@ markers = [
 
 [tool.coverage.run]
 source = ["pytximport"]
-omit = ["*/test/*"]
+omit = ["*/test/*", "**/*/_cli.py"]
 
 [tool.coverage.report]
 exclude_lines = [
diff --git a/pytximport/_cli.py b/pytximport/_cli.py
index f4a3f73..2d670a6 100644
--- a/pytximport/_cli.py
+++ b/pytximport/_cli.py
@@ -18,7 +18,7 @@
     help="Welcome to the pytximport command-line interface for importing transcript-level quantification files.",
 )
 @click.pass_context
-def cli(  # type: ignore
+def cli(  # type: ignore  # pragma: no cover
     ctx: click.Context,
 ):
     """Welcome to the pytximport command-line interface for importing transcript-level quantification files."""
@@ -160,7 +160,7 @@ def cli(  # type: ignore
     is_flag=True,
     help="Whether the existence of the files is optional.",
 )
-def run(  # type: ignore
+def run(  # type: ignore  # pragma: no cover
     **kwargs,
 ) -> None:
     """Call the tximport function via the command line."""
@@ -221,7 +221,7 @@ def run(  # type: ignore
     is_flag=True,
     help="Provide this flag to keep the gene_biotype column as an additional column in the mapping file.",
 )
-def create_map(  # type: ignore
+def create_map(  # type: ignore  # pragma: no cover
     **kwargs,
 ) -> None:
     """Create a transcript-to-gene mapping file via the command line."""
diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py
index e731b29..6d76c4b 100644
--- a/pytximport/core/_tximport.py
+++ b/pytximport/core/_tximport.py
@@ -114,7 +114,9 @@ def tximport(
         biotype_filter (List[str], optional): Filter the transcripts by biotype, including only those provided. Enables
             post-hoc filtering of the data based on the biotype of the transcripts. Assumes that the biotype is present
             in the transcript_id of the data, bar-separated. If this is not the case, please use the `filter_by_biotype`
-            function from the `pytximport.utils` module instead. Defaults to None.
+            function from the `pytximport.utils` module instead. Please note that the abundance will NOT be recalculated
+            after filtering to avoid introducing bias. If you wish to recalculate the abundance, please use the
+            `filter_by_biotype` function from the `pytximport.utils` module instead. Defaults to None.
 
     Returns:
         Union[xr.Dataset, ad.AnnData, SummarizedExperiment, None]: The estimated gene-level or transcript-level
@@ -409,7 +411,9 @@ def tximport(
 
     if biotype_filter is not None:
         transcript_data = filter_by_biotype(
-            transcript_data, biotype_filter=biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
+            transcript_data,
+            biotype_filter=biotype_filter,
+            id_column=("gene_id" if gene_level else "transcript_id"),
         )
 
     # Remove appended gene names after underscore for RSEM data for both transcript and gene ids
diff --git a/pytximport/importers/_read_tsv.py b/pytximport/importers/_read_tsv.py
index 044bcdd..67e2c61 100644
--- a/pytximport/importers/_read_tsv.py
+++ b/pytximport/importers/_read_tsv.py
@@ -108,7 +108,7 @@ def read_tsv(
         usecols.append(counts_column)
         dtype[counts_column] = np.float64
 
-        if abundance_column is not None:
+        if abundance_column is not None and abundance_column not in usecols:
             usecols.append(abundance_column)
             dtype[abundance_column] = np.float64
 
diff --git a/pytximport/utils/_filter_by_biotype.py b/pytximport/utils/_filter_by_biotype.py
index 44f637c..a2c418b 100644
--- a/pytximport/utils/_filter_by_biotype.py
+++ b/pytximport/utils/_filter_by_biotype.py
@@ -57,9 +57,11 @@ def filter_by_biotype(
         # quantification tools include the biotype in the transcript_id
         # This only works if the transcript_id contains the biotype as one of the bar-separated fields and is not the
         # best way to filter the transcripts by biotype
-        assert any(
-            "|" in transcript_id for transcript_id in transcript_ids
-        ), "The transcript_id does not contain the biotype."
+        assert any("|" in transcript_id for transcript_id in transcript_ids), (
+            "The transcript_id column does not contain the biotype. Please use the `pytximport.utils.filter_by_biotype`"
+            " function with the `transcript_gene_map` argument instead. This function can be called after the"
+            " `tximport` function using the resulting AnnData object or xarray Dataset."
+        )
 
         transcript_id_fields = [transcript_id.split("|") for transcript_id in transcript_ids]
 
@@ -99,7 +101,7 @@ def filter_by_biotype(
 
     elif isinstance(transcript_data, ad.AnnData):
         # Calculate the total abundance before filtering
-        total_abundance = transcript_data.obsm["abundance"].sum(axis=0)
+        total_abundance = transcript_data.obsm["abundance"].sum(axis=1)
         transcript_data = transcript_data[:, transcript_keep_boolean]
 
     log(
@@ -112,7 +114,7 @@ def filter_by_biotype(
         log(25, "Recalculating the abundance after filtering by biotype.")
 
         if isinstance(transcript_data, ad.AnnData):
-            new_abundance = transcript_data.obsm["abundance"].sum(axis=0)
+            new_abundance = transcript_data.obsm["abundance"].sum(axis=1)
             ratio = total_abundance / new_abundance
             transcript_data.obsm["abundance"] = (transcript_data.obsm["abundance"].T * ratio).T
         elif isinstance(transcript_data, xr.Dataset):
diff --git a/test/data/piscem/res_oarfish_naming.quant b/test/data/piscem/res_oarfish_naming.quant
new file mode 100644
index 0000000..724b3fc
--- /dev/null
+++ b/test/data/piscem/res_oarfish_naming.quant
@@ -0,0 +1,15 @@
+tname	len	num_reads
+ENST00000513300.5	1924	102.328
+ENST00000282507.7	2355	1592.02
+ENST00000504685.5	1476	68.6528
+ENST00000243108.4	1733	343.499
+ENST00000303450.4	1516	664
+ENST00000243082.4	2039	55
+ENST00000303406.4	1524	304.189
+ENST00000303460.4	1936	47
+ENST00000243056.4	2423	42
+ENST00000312492.2	1805	228
+ENST00000040584.5	1889	4295
+ENST00000430889.2	1666	623.628
+ENST00000394331.3	2943	85.6842
+ENST00000243103.3	3335	962
diff --git a/test/test_biotype_filter.py b/test/test_biotype_filter.py
index 6cf932a..53e24fc 100644
--- a/test/test_biotype_filter.py
+++ b/test/test_biotype_filter.py
@@ -4,6 +4,7 @@
 from typing import List
 
 import anndata as ad
+import numpy as np
 import xarray as xr
 
 from pytximport import tximport
@@ -31,22 +32,41 @@ def test_biotype_filter(
                 return_transcript_data=transcript_level,
             )
 
-            id_column = "transcript_id" if transcript_level else "gene_id"
+            for recalculate_abundance in [True, False]:
+                id_column = "transcript_id" if transcript_level else "gene_id"
 
-            result_filtered = filter_by_biotype(
-                result,
-                transcript_gene_map_mouse,
-                biotype_filter=["protein_coding"],
-                id_column=id_column,
-            )
+                result_filtered = filter_by_biotype(
+                    result,
+                    transcript_gene_map_mouse,
+                    biotype_filter=["protein_coding"],
+                    id_column=id_column,
+                    recalculate_abundance=recalculate_abundance,
+                )
+
+                if output_type == "anndata":
+                    assert isinstance(result_filtered, ad.AnnData)
+                    assert len(result_filtered.var_names) > 0, "No genes were retained."
+                    assert len(result_filtered.var_names) < len(result.var_names), "All genes were retained."
+
+                    if recalculate_abundance:
+                        np.testing.assert_allclose(
+                            result_filtered.obsm["abundance"].sum(axis=1),
+                            result.obsm["abundance"].sum(axis=1),
+                            rtol=1e-4,
+                            atol=1,
+                        )
+
+                else:
+                    assert isinstance(result_filtered, xr.Dataset)
+                    assert len(result_filtered.coords[id_column]) > 0, "No genes were retained."
+                    assert len(result_filtered.coords[id_column]) < len(
+                        result.coords[id_column]
+                    ), "All genes were retained."
 
-            if output_type == "anndata":
-                assert isinstance(result_filtered, ad.AnnData)
-                assert len(result_filtered.var_names) > 0, "No genes were retained."
-                assert len(result_filtered.var_names) < len(result.var_names), "All genes were retained."
-            else:
-                assert isinstance(result_filtered, xr.Dataset)
-                assert len(result_filtered.coords[id_column]) > 0, "No genes were retained."
-                assert len(result_filtered.coords[id_column]) < len(
-                    result.coords[id_column]
-                ), "All genes were retained."
+                    if recalculate_abundance:
+                        np.testing.assert_allclose(
+                            result_filtered["abundance"].sum(axis=0),
+                            result["abundance"].sum(axis=0),
+                            rtol=1e-4,
+                            atol=1,
+                        )
diff --git a/test/test_correctness.py b/test/test_correctness.py
index 7809fa4..2f2cf3c 100644
--- a/test/test_correctness.py
+++ b/test/test_correctness.py
@@ -188,7 +188,7 @@ def test_correctness_inferential_replicates(
                 ignore_transcript_version=True,
                 ignore_after_bar=True,
                 output_type="xarray",
-                counts_from_abundance=None,  # type: ignore
+                counts_from_abundance=None,
             )
 
             assert isinstance(result, xr.Dataset), "The result is not an xarray Dataset."
diff --git a/test/test_piscem.py b/test/test_piscem.py
index 228a3b2..69de374 100644
--- a/test/test_piscem.py
+++ b/test/test_piscem.py
@@ -1,14 +1,11 @@
 """Test importing salmon quantification files."""
 
 from pathlib import Path
-from typing import List
 
 import anndata as ad
 import pandas as pd
-import xarray as xr
 
 from pytximport import tximport
-from pytximport.utils import biotype_filters
 
 
 def test_piscem(
@@ -28,7 +25,23 @@ def test_piscem(
         output_type="anndata",
         ignore_transcript_version=True,
         ignore_after_bar=True,
-        counts_from_abundance=None,  # type: ignore
+        counts_from_abundance=None,
+    )
+
+    # Check that the result is an AnnData object
+    assert isinstance(result, ad.AnnData)
+
+    # Check that the counts.data are all positive
+    assert (result.X >= 0).all()
+
+    result = tximport(
+        [piscem_file.parent / "res_oarfish_naming.quant"],
+        "oarfish",
+        transcript_gene_mapping_human,
+        output_type="anndata",
+        ignore_transcript_version=True,
+        ignore_after_bar=True,
+        counts_from_abundance=None,
     )
 
     # Check that the result is an AnnData object
diff --git a/test/test_transcript_level.py b/test/test_transcript_level.py
index 8b347b1..80c561b 100644
--- a/test/test_transcript_level.py
+++ b/test/test_transcript_level.py
@@ -5,6 +5,7 @@
 
 import anndata as ad
 import pandas as pd
+import xarray as xr
 
 from pytximport import tximport
 from pytximport.utils import biotype_filters, replace_transcript_ids_with_names
@@ -22,7 +23,7 @@ def test_salmon_transcript_level(
         transcript_name_mapping_human (pd.DataFrame): The mapping of transcript ids to transcript names.
         transcript_name_mapping_human_path (Path): The path to the transcript name mapping.
     """
-    result = tximport(
+    result_ad = tximport(
         [salmon_file],
         "salmon",
         return_transcript_data=True,
@@ -32,14 +33,14 @@ def test_salmon_transcript_level(
         counts_from_abundance="scaled_tpm",
     )
 
-    assert isinstance(result, ad.AnnData)
-    counts = result.X
+    assert isinstance(result_ad, ad.AnnData)
+    counts = result_ad.X
 
     # Check that the counts.data are all positive
     assert (counts >= 0).all()
 
     # replace the transcript ids with the transcript names
-    result_df = replace_transcript_ids_with_names(result, transcript_name_mapping_human)
+    result_df = replace_transcript_ids_with_names(result_ad, transcript_name_mapping_human)
 
     # Check that the result is an AnnData object
     assert isinstance(result_df, ad.AnnData)
@@ -55,10 +56,24 @@ def test_salmon_transcript_level(
     )
 
     # Check that it also works with a path to the transcript name mapping
-    result_path = replace_transcript_ids_with_names(result, transcript_name_mapping_human_path)
+    result_replaced_ad = replace_transcript_ids_with_names(result_ad, transcript_name_mapping_human_path)
 
     # Check that the results are the same
-    pd.testing.assert_frame_equal(result_df.var_names.to_frame(), result_path.var_names.to_frame())
+    pd.testing.assert_frame_equal(result_df.var_names.to_frame(), result_replaced_ad.var_names.to_frame())
+
+    # Check with an xarray Dataset
+    result_xr = tximport(
+        [salmon_file],
+        "salmon",
+        return_transcript_data=True,
+        output_type="xarray",
+        ignore_transcript_version=True,
+        ignore_after_bar=True,
+        counts_from_abundance="scaled_tpm",
+    )
+
+    result_replaced_xr = replace_transcript_ids_with_names(result_xr, transcript_name_mapping_human)
+    assert isinstance(result_replaced_xr, xr.Dataset)
 
 
 def test_dtu_scaled_tpm(