Merge version 0.3.0 (#1)

* Start work on 0.3.0, making ignore transcript version and ignore bar configurable from CLI * Change default return type to anndata, rescale abundances when using biotype filters, slightly faster missing transcript length replacements, honor return type when returnin transcript-level counts, run ci on dev branch
complextissue · Jun 4, 2024 · d9b128b · d9b128b
1 parent 6a29a7e
commit d9b128b
Show file tree

Hide file tree

Showing 16 changed files with 145 additions and 101 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@ name: Test & lint package & deploy documentation
 
 on:
   push:
-    branches: [main]
+    branches: [main, dev]
   pull_request:
     branches: [main]
 

diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -1,5 +1,5 @@
 {
-    "version": "0.2.0",
+    "version": "0.3.0",
     "configurations": [
         {
             "name": "Python: Current File",

diff --git a/CITATION.cff b/CITATION.cff
@@ -6,6 +6,6 @@ authors:
   - family-names: "Puelles"
     given-names: "Victor"
 title: "pytximport: Fast gene count estimation from transcript quantification files in Python"
-version: 0.2.0
+version: 0.3.0
 date-released: 2024-06-01
 url: "https://github.com/complextissue/pytximport"
diff --git a/README.md b/README.md
@@ -65,7 +65,7 @@ results = tximport(
 
 Please cite both the original publication as well as this Python implementation:
 - Charlotte Soneson, Michael I. Love, Mark D. Robinson. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences, F1000Research, 4:1521, December 2015. doi: 10.12688/f1000research.7563.1
-- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.2.0) [Computer software]. https://github.com/complextissue/pytximport
+- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.3.0) [Computer software]. https://github.com/complextissue/pytximport
 
 ## License
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -18,7 +18,7 @@
 author = "Malte Kuehl"
 
 # The full version, including alpha/beta/rc tags
-release = "0.2.0"
+release = "0.3.0"
 
 # -- General configuration ---------------------------------------------------
 
@@ -106,7 +106,7 @@
 html_theme = "furo"
 
 html_theme_options = {
-    "announcement": "<a href='https://pypi.org/project/pytximport/'><strong>pytximport</strong></a> 0.2.0 has been released!",
+    "announcement": "<a href='https://pypi.org/project/pytximport/'><strong>pytximport</strong></a> has been released!",
 }
 
 html_title = "pytximport"

diff --git a/docs/source/example.ipynb b/docs/source/example.ipynb
diff --git a/docs/source/start.md b/docs/source/start.md
@@ -61,7 +61,7 @@ The `tximport` package has become a main stay in the bulk RNA sequencing communi
 
 Please cite both the original publication as well as this Python implementation:
 - Charlotte Soneson, Michael I. Love, Mark D. Robinson. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences, F1000Research, 4:1521, December 2015. doi: 10.12688/f1000research.7563.1
-- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.2.0) [Computer software]. https://github.com/complextissue/pytximport
+- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.3.0) [Computer software]. https://github.com/complextissue/pytximport
 
 ## Differences
 

diff --git a/pytximport/_cli.py b/pytximport/_cli.py
@@ -32,6 +32,13 @@
     type=click.Path(exists=True),
     help="The path to the transcript to gene mapping file.",
 )
+@click.option(
+    "-c",
+    "--counts_from_abundance",
+    "--counts-from-abundance",
+    type=click.Choice(["scaled_tpm", "length_scaled_tpm"]),
+    help="The type of counts to convert to.",
+)
 @click.option(
     "-o",
     "--save_path",
@@ -43,24 +50,17 @@
 @click.option(
     "--ignore_after_bar",
     "--ignore-after-bar",
-    is_flag=True,
+    type=bool,
     default=True,
     help="Whether to split the transcript id after the bar character (`|`).",
 )
 @click.option(
     "--ignore_transcript_version",
     "--ignore-transcript-version",
-    is_flag=True,
+    type=bool,
     default=True,
     help="Whether to ignore the transcript version.",
 )
-@click.option(
-    "-c",
-    "--counts_from_abundance",
-    "--counts-from-abundance",
-    type=click.Choice(["scaled_tpm", "length_scaled_tpm"]),
-    help="The type of counts to convert to.",
-)
 @click.option(
     "--return_transcript_data",
     "--return-transcript-data",
@@ -71,24 +71,28 @@
     "-id",
     "--id_column",
     "--id-column",
+    type=str,
     help="The column name for the transcript id.",
 )
 @click.option(
     "-counts",
     "--counts_column",
     "--counts-column",
+    type=str,
     help="The column name for the counts.",
 )
 @click.option(
     "-length",
     "--length_column",
     "--length-column",
+    type=str,
     help="The column name for the length.",
 )
 @click.option(
     "-tpm",
     "--abundance_column",
     "--abundance-column",
+    type=str,
     help="The column name for the abundance.",
 )
 @click.option(

diff --git a/pytximport/_version.py b/pytximport/_version.py
@@ -1,4 +1,4 @@
 """Version information for the pytximport package."""
 
 # This package will follow Semantic Versioning after version 1.0.0: https://semver.org/
-__version__ = "0.2.0"
+__version__ = "0.3.0"
diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py
@@ -37,7 +37,7 @@ def tximport(
     sparse_threshold: Optional[float] = None,
     read_length: Optional[int] = None,
     # arguments exclusive to the pytximport implementation
-    output_type: Literal["xarray", "anndata"] = "xarray",
+    output_type: Literal["xarray", "anndata"] = "anndata",
     output_format: Literal["csv", "h5ad"] = "csv",
     save_path: Optional[Union[str, Path]] = None,
     return_data: bool = True,
@@ -80,7 +80,7 @@ def tximport(
         sparse_threshold (Optional[float], optional): The threshold for the sparse matrix. Currently, sparse input is
             not supported. Defaults to None.
         read_length (Optional[int], optional): The read length for the stringtie quantification. Defaults to None.
-        output_type (Literal["xarray", "anndata"], optional): The type of output. Defaults to "xarray".
+        output_type (Literal["xarray", "anndata"], optional): The type of output. Defaults to "anndata".
         output_format (Literal["csv", "h5ad"], optional): The type of output file. Defaults to "csv".
         save_path (Optional[Union[str, Path]], optional): The path to save the gene-level expression. Defaults to None.
         return_data (bool, optional): Whether to return the gene-level expression. Defaults to True.
@@ -270,6 +270,18 @@ def tximport(
                 counts_from_abundance,
             )
 
+        if output_type == "anndata":
+            # convert to AnnData
+            return ad.AnnData(
+                X=transcript_data["counts"].values.T,
+                obs=pd.DataFrame(index=transcript_data.coords["file_path"].values),
+                var=pd.DataFrame(index=transcript_data.coords["transcript_id"].values),
+                obsm={
+                    "length": transcript_data["length"].values.T,
+                    "abundance": transcript_data["abundance"].values.T,
+                },
+            )
+
         return transcript_data
 
     # convert to gene-level expression

diff --git a/pytximport/utils/_convert_transcripts_to_genes.py b/pytximport/utils/_convert_transcripts_to_genes.py
@@ -57,6 +57,9 @@ def convert_transcripts_to_genes(
         # check that at least one transcript is protein-coding
         assert any(transcript_keep_boolean), "No transcripts with the desired biotype are present in the data."
 
+        # calculate the total abundance before filtering
+        total_abundance = transcript_data["abundance"].sum(axis=0)
+
         transcript_data = transcript_data.isel(
             transcript_id=transcript_keep_boolean,
             drop=True,
@@ -67,6 +70,11 @@ def convert_transcripts_to_genes(
         )
         transcript_ids = transcript_data.coords["transcript_id"].values
 
+        # recalculate the abundance for each sample
+        new_abundance = transcript_data["abundance"].sum(axis=0)
+        ratio = total_abundance / new_abundance
+        transcript_data["abundance"] = (transcript_data["abundance"].T * ratio).T
+
     if ignore_after_bar:
         # ignore the part of the transcript ID after the bar
         transcript_ids = [transcript_id.split("|")[0] for transcript_id in transcript_ids]

diff --git a/pytximport/utils/_replace_missing_average_transcript_length.py b/pytximport/utils/_replace_missing_average_transcript_length.py
@@ -18,6 +18,9 @@ def replace_missing_average_transcript_length(
     # get the rows of the DataArray with missing values
     nan_rows = np.where(length.isnull().any(dim="file") == True)[0]  # noqa: E712
 
+    gene_ids = []
+    lengths = []
+
     for nan_idx in nan_rows:
         row = length.isel({"gene_id": nan_idx})
         gene_id = row.coords["gene_id"].data
@@ -37,6 +40,11 @@ def replace_missing_average_transcript_length(
             average_gene_length = np.exp(np.mean(np.log(length.loc[{"gene_id": gene_id}].data[~column_is_nan])))
 
         # replace the missing row with the average gene length
-        length.loc[{"gene_id": gene_id}] = length.loc[{"gene_id": gene_id}].fillna(average_gene_length)
+        gene_ids.append(gene_id)
+        lengths.append(length.loc[{"gene_id": gene_id}].fillna(average_gene_length))
+
+    # batching updates seems to be faster than updating the DataArray row by row
+    if len(gene_ids) > 0:
+        length.loc[{"gene_id": gene_ids}] = lengths
 
     return length
diff --git a/test/test_comparison.ipynb b/test/test_comparison.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -25,7 +25,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -43,7 +43,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -122,53 +122,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2024-05-30 21:04:42,063: Starting the import.\n",
-      "Reading quantification files: 4it [00:01,  3.58it/s]\n",
-      "2024-05-30 21:04:43,294: Converting transcript-level expression to gene-level expression.\n",
-      "2024-05-30 21:04:43,671: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
-      "2024-05-30 21:04:43,952: Matching gene_ids.\n",
-      "2024-05-30 21:04:44,116: Creating gene abundance.\n",
-      "2024-05-30 21:04:44,213: Creating gene counts.\n",
-      "2024-05-30 21:04:44,342: Creating lengths.\n",
-      "2024-05-30 21:04:44,495: Replacing missing lengths.\n",
-      "2024-05-30 21:04:50,105: Creating gene expression dataset.\n",
-      "2024-05-30 21:04:50,136: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_no.csv.\n",
-      "2024-05-30 21:04:50,207: Finished the import in 8.14 seconds.\n",
-      "2024-05-30 21:04:51,406: Starting the import.\n",
-      "Reading quantification files: 4it [00:01,  3.63it/s]\n",
-      "2024-05-30 21:04:52,631: Converting transcript-level expression to gene-level expression.\n",
-      "2024-05-30 21:04:53,053: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
-      "2024-05-30 21:04:53,313: Matching gene_ids.\n",
-      "2024-05-30 21:04:53,467: Creating gene abundance.\n",
-      "2024-05-30 21:04:53,561: Creating gene counts.\n",
-      "2024-05-30 21:04:53,663: Creating lengths.\n",
-      "2024-05-30 21:04:53,755: Replacing missing lengths.\n",
-      "2024-05-30 21:04:59,260: Recreating gene counts from abundances.\n",
-      "2024-05-30 21:04:59,260: Setting the counts to scaled TPM.\n",
-      "2024-05-30 21:04:59,261: Creating gene expression dataset.\n",
-      "2024-05-30 21:04:59,290: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_scaledTPM.csv.\n",
-      "2024-05-30 21:04:59,449: Finished the import in 8.04 seconds.\n",
-      "2024-05-30 21:05:00,696: Starting the import.\n",
-      "Reading quantification files: 4it [00:01,  3.45it/s]\n",
-      "2024-05-30 21:05:01,977: Converting transcript-level expression to gene-level expression.\n",
-      "2024-05-30 21:05:02,376: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
-      "2024-05-30 21:05:02,656: Matching gene_ids.\n",
-      "2024-05-30 21:05:02,816: Creating gene abundance.\n",
-      "2024-05-30 21:05:02,910: Creating gene counts.\n",
-      "2024-05-30 21:05:03,002: Creating lengths.\n",
-      "2024-05-30 21:05:03,096: Replacing missing lengths.\n",
-      "2024-05-30 21:05:08,628: Recreating gene counts from abundances.\n",
-      "2024-05-30 21:05:08,628: Setting counts to length scaled TPM.\n",
-      "2024-05-30 21:05:08,631: Creating gene expression dataset.\n",
-      "2024-05-30 21:05:08,658: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_lengthScaledTPM.csv.\n",
-      "2024-05-30 21:05:08,751: Finished the import in 8.06 seconds.\n"
+      "2024-06-04 19:00:50,027: Starting the import.\n",
+      "Reading quantification files: 4it [00:01,  2.84it/s]\n",
+      "2024-06-04 19:00:51,561: Converting transcript-level expression to gene-level expression.\n",
+      "2024-06-04 19:00:51,981: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
+      "2024-06-04 19:00:52,335: Matching gene_ids.\n",
+      "2024-06-04 19:00:52,542: Creating gene abundance.\n",
+      "2024-06-04 19:00:52,820: Creating gene counts.\n",
+      "2024-06-04 19:00:52,936: Creating lengths.\n",
+      "2024-06-04 19:00:53,089: Replacing missing lengths.\n",
+      "2024-06-04 19:00:58,156: Creating gene expression dataset.\n",
+      "2024-06-04 19:00:58,193: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_no.csv.\n",
+      "2024-06-04 19:00:58,273: Finished the import in 8.25 seconds.\n",
+      "2024-06-04 19:00:59,769: Starting the import.\n",
+      "Reading quantification files: 4it [00:01,  2.66it/s]\n",
+      "2024-06-04 19:01:01,409: Converting transcript-level expression to gene-level expression.\n",
+      "2024-06-04 19:01:01,837: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
+      "2024-06-04 19:01:02,160: Matching gene_ids.\n",
+      "2024-06-04 19:01:02,321: Creating gene abundance.\n",
+      "2024-06-04 19:01:02,599: Creating gene counts.\n",
+      "2024-06-04 19:01:02,673: Creating lengths.\n",
+      "2024-06-04 19:01:02,791: Replacing missing lengths.\n",
+      "2024-06-04 19:01:07,705: Recreating gene counts from abundances.\n",
+      "2024-06-04 19:01:07,706: Setting the counts to scaled TPM.\n",
+      "2024-06-04 19:01:07,707: Creating gene expression dataset.\n",
+      "2024-06-04 19:01:07,736: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_scaledTPM.csv.\n",
+      "2024-06-04 19:01:07,827: Finished the import in 8.06 seconds.\n",
+      "2024-06-04 19:01:09,437: Starting the import.\n",
+      "Reading quantification files: 4it [00:01,  3.27it/s]\n",
+      "2024-06-04 19:01:10,794: Converting transcript-level expression to gene-level expression.\n",
+      "2024-06-04 19:01:11,424: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
+      "2024-06-04 19:01:11,870: Matching gene_ids.\n",
+      "2024-06-04 19:01:12,045: Creating gene abundance.\n",
+      "2024-06-04 19:01:12,385: Creating gene counts.\n",
+      "2024-06-04 19:01:12,462: Creating lengths.\n",
+      "2024-06-04 19:01:12,586: Replacing missing lengths.\n",
+      "2024-06-04 19:01:17,438: Recreating gene counts from abundances.\n",
+      "2024-06-04 19:01:17,439: Setting counts to length scaled TPM.\n",
+      "2024-06-04 19:01:17,444: Creating gene expression dataset.\n",
+      "2024-06-04 19:01:17,484: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_lengthScaledTPM.csv.\n",
+      "2024-06-04 19:01:17,586: Finished the import in 8.15 seconds.\n"
      ]
     }
    ],
@@ -180,7 +180,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -201,6 +201,13 @@
     "pd.testing.assert_frame_equal(counts_tximport_scaledTPM, counts_pytximport_scaledTPM)\n",
     "pd.testing.assert_frame_equal(counts_tximport_lengthScaledTPM, counts_pytximport_lengthScaledTPM)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/test/test_correctness.py b/test/test_correctness.py
@@ -27,6 +27,7 @@ def test_correctness(
             fabry_directory / "transcript_gene_mapping_human.csv",
             ignore_transcript_version=True,
             ignore_after_bar=True,
+            output_type="xarray",
             counts_from_abundance=counts_from_abundance,  # type: ignore
         )