Skip to content

Commit

Permalink
Merge version 0.3.0 (#1)
Browse files Browse the repository at this point in the history
* Start work on 0.3.0, making ignore transcript version and ignore bar configurable from CLI

* Change default return type to anndata, rescale abundances when using biotype filters, slightly faster missing transcript length replacements, honor return type when returnin transcript-level counts, run ci on dev branch
  • Loading branch information
maltekuehl authored Jun 4, 2024
1 parent 6a29a7e commit d9b128b
Show file tree
Hide file tree
Showing 16 changed files with 145 additions and 101 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: Test & lint package & deploy documentation

on:
push:
branches: [main]
branches: [main, dev]
pull_request:
branches: [main]

Expand Down
2 changes: 1 addition & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "0.2.0",
"version": "0.3.0",
"configurations": [
{
"name": "Python: Current File",
Expand Down
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ authors:
- family-names: "Puelles"
given-names: "Victor"
title: "pytximport: Fast gene count estimation from transcript quantification files in Python"
version: 0.2.0
version: 0.3.0
date-released: 2024-06-01
url: "https://github.com/complextissue/pytximport"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ results = tximport(

Please cite both the original publication as well as this Python implementation:
- Charlotte Soneson, Michael I. Love, Mark D. Robinson. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences, F1000Research, 4:1521, December 2015. doi: 10.12688/f1000research.7563.1
- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.2.0) [Computer software]. https://github.com/complextissue/pytximport
- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.3.0) [Computer software]. https://github.com/complextissue/pytximport

## License

Expand Down
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
author = "Malte Kuehl"

# The full version, including alpha/beta/rc tags
release = "0.2.0"
release = "0.3.0"

# -- General configuration ---------------------------------------------------

Expand Down Expand Up @@ -106,7 +106,7 @@
html_theme = "furo"

html_theme_options = {
"announcement": "<a href='https://pypi.org/project/pytximport/'><strong>pytximport</strong></a> 0.2.0 has been released!",
"announcement": "<a href='https://pypi.org/project/pytximport/'><strong>pytximport</strong></a> has been released!",
}

html_title = "pytximport"
Expand Down
65 changes: 33 additions & 32 deletions docs/source/example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/source/start.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ The `tximport` package has become a main stay in the bulk RNA sequencing communi

Please cite both the original publication as well as this Python implementation:
- Charlotte Soneson, Michael I. Love, Mark D. Robinson. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences, F1000Research, 4:1521, December 2015. doi: 10.12688/f1000research.7563.1
- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.2.0) [Computer software]. https://github.com/complextissue/pytximport
- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.3.0) [Computer software]. https://github.com/complextissue/pytximport

## Differences

Expand Down
22 changes: 13 additions & 9 deletions pytximport/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@
type=click.Path(exists=True),
help="The path to the transcript to gene mapping file.",
)
@click.option(
"-c",
"--counts_from_abundance",
"--counts-from-abundance",
type=click.Choice(["scaled_tpm", "length_scaled_tpm"]),
help="The type of counts to convert to.",
)
@click.option(
"-o",
"--save_path",
Expand All @@ -43,24 +50,17 @@
@click.option(
"--ignore_after_bar",
"--ignore-after-bar",
is_flag=True,
type=bool,
default=True,
help="Whether to split the transcript id after the bar character (`|`).",
)
@click.option(
"--ignore_transcript_version",
"--ignore-transcript-version",
is_flag=True,
type=bool,
default=True,
help="Whether to ignore the transcript version.",
)
@click.option(
"-c",
"--counts_from_abundance",
"--counts-from-abundance",
type=click.Choice(["scaled_tpm", "length_scaled_tpm"]),
help="The type of counts to convert to.",
)
@click.option(
"--return_transcript_data",
"--return-transcript-data",
Expand All @@ -71,24 +71,28 @@
"-id",
"--id_column",
"--id-column",
type=str,
help="The column name for the transcript id.",
)
@click.option(
"-counts",
"--counts_column",
"--counts-column",
type=str,
help="The column name for the counts.",
)
@click.option(
"-length",
"--length_column",
"--length-column",
type=str,
help="The column name for the length.",
)
@click.option(
"-tpm",
"--abundance_column",
"--abundance-column",
type=str,
help="The column name for the abundance.",
)
@click.option(
Expand Down
2 changes: 1 addition & 1 deletion pytximport/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Version information for the pytximport package."""

# This package will follow Semantic Versioning after version 1.0.0: https://semver.org/
__version__ = "0.2.0"
__version__ = "0.3.0"
16 changes: 14 additions & 2 deletions pytximport/core/_tximport.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def tximport(
sparse_threshold: Optional[float] = None,
read_length: Optional[int] = None,
# arguments exclusive to the pytximport implementation
output_type: Literal["xarray", "anndata"] = "xarray",
output_type: Literal["xarray", "anndata"] = "anndata",
output_format: Literal["csv", "h5ad"] = "csv",
save_path: Optional[Union[str, Path]] = None,
return_data: bool = True,
Expand Down Expand Up @@ -80,7 +80,7 @@ def tximport(
sparse_threshold (Optional[float], optional): The threshold for the sparse matrix. Currently, sparse input is
not supported. Defaults to None.
read_length (Optional[int], optional): The read length for the stringtie quantification. Defaults to None.
output_type (Literal["xarray", "anndata"], optional): The type of output. Defaults to "xarray".
output_type (Literal["xarray", "anndata"], optional): The type of output. Defaults to "anndata".
output_format (Literal["csv", "h5ad"], optional): The type of output file. Defaults to "csv".
save_path (Optional[Union[str, Path]], optional): The path to save the gene-level expression. Defaults to None.
return_data (bool, optional): Whether to return the gene-level expression. Defaults to True.
Expand Down Expand Up @@ -270,6 +270,18 @@ def tximport(
counts_from_abundance,
)

if output_type == "anndata":
# convert to AnnData
return ad.AnnData(
X=transcript_data["counts"].values.T,
obs=pd.DataFrame(index=transcript_data.coords["file_path"].values),
var=pd.DataFrame(index=transcript_data.coords["transcript_id"].values),
obsm={
"length": transcript_data["length"].values.T,
"abundance": transcript_data["abundance"].values.T,
},
)

return transcript_data

# convert to gene-level expression
Expand Down
8 changes: 8 additions & 0 deletions pytximport/utils/_convert_transcripts_to_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def convert_transcripts_to_genes(
# check that at least one transcript is protein-coding
assert any(transcript_keep_boolean), "No transcripts with the desired biotype are present in the data."

# calculate the total abundance before filtering
total_abundance = transcript_data["abundance"].sum(axis=0)

transcript_data = transcript_data.isel(
transcript_id=transcript_keep_boolean,
drop=True,
Expand All @@ -67,6 +70,11 @@ def convert_transcripts_to_genes(
)
transcript_ids = transcript_data.coords["transcript_id"].values

# recalculate the abundance for each sample
new_abundance = transcript_data["abundance"].sum(axis=0)
ratio = total_abundance / new_abundance
transcript_data["abundance"] = (transcript_data["abundance"].T * ratio).T

if ignore_after_bar:
# ignore the part of the transcript ID after the bar
transcript_ids = [transcript_id.split("|")[0] for transcript_id in transcript_ids]
Expand Down
10 changes: 9 additions & 1 deletion pytximport/utils/_replace_missing_average_transcript_length.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def replace_missing_average_transcript_length(
# get the rows of the DataArray with missing values
nan_rows = np.where(length.isnull().any(dim="file") == True)[0] # noqa: E712

gene_ids = []
lengths = []

for nan_idx in nan_rows:
row = length.isel({"gene_id": nan_idx})
gene_id = row.coords["gene_id"].data
Expand All @@ -37,6 +40,11 @@ def replace_missing_average_transcript_length(
average_gene_length = np.exp(np.mean(np.log(length.loc[{"gene_id": gene_id}].data[~column_is_nan])))

# replace the missing row with the average gene length
length.loc[{"gene_id": gene_id}] = length.loc[{"gene_id": gene_id}].fillna(average_gene_length)
gene_ids.append(gene_id)
lengths.append(length.loc[{"gene_id": gene_id}].fillna(average_gene_length))

# batching updates seems to be faster than updating the DataArray row by row
if len(gene_ids) > 0:
length.loc[{"gene_id": gene_ids}] = lengths

return length
97 changes: 52 additions & 45 deletions test/test_comparison.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -122,53 +122,53 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-05-30 21:04:42,063: Starting the import.\n",
"Reading quantification files: 4it [00:01, 3.58it/s]\n",
"2024-05-30 21:04:43,294: Converting transcript-level expression to gene-level expression.\n",
"2024-05-30 21:04:43,671: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
"2024-05-30 21:04:43,952: Matching gene_ids.\n",
"2024-05-30 21:04:44,116: Creating gene abundance.\n",
"2024-05-30 21:04:44,213: Creating gene counts.\n",
"2024-05-30 21:04:44,342: Creating lengths.\n",
"2024-05-30 21:04:44,495: Replacing missing lengths.\n",
"2024-05-30 21:04:50,105: Creating gene expression dataset.\n",
"2024-05-30 21:04:50,136: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_no.csv.\n",
"2024-05-30 21:04:50,207: Finished the import in 8.14 seconds.\n",
"2024-05-30 21:04:51,406: Starting the import.\n",
"Reading quantification files: 4it [00:01, 3.63it/s]\n",
"2024-05-30 21:04:52,631: Converting transcript-level expression to gene-level expression.\n",
"2024-05-30 21:04:53,053: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
"2024-05-30 21:04:53,313: Matching gene_ids.\n",
"2024-05-30 21:04:53,467: Creating gene abundance.\n",
"2024-05-30 21:04:53,561: Creating gene counts.\n",
"2024-05-30 21:04:53,663: Creating lengths.\n",
"2024-05-30 21:04:53,755: Replacing missing lengths.\n",
"2024-05-30 21:04:59,260: Recreating gene counts from abundances.\n",
"2024-05-30 21:04:59,260: Setting the counts to scaled TPM.\n",
"2024-05-30 21:04:59,261: Creating gene expression dataset.\n",
"2024-05-30 21:04:59,290: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_scaledTPM.csv.\n",
"2024-05-30 21:04:59,449: Finished the import in 8.04 seconds.\n",
"2024-05-30 21:05:00,696: Starting the import.\n",
"Reading quantification files: 4it [00:01, 3.45it/s]\n",
"2024-05-30 21:05:01,977: Converting transcript-level expression to gene-level expression.\n",
"2024-05-30 21:05:02,376: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
"2024-05-30 21:05:02,656: Matching gene_ids.\n",
"2024-05-30 21:05:02,816: Creating gene abundance.\n",
"2024-05-30 21:05:02,910: Creating gene counts.\n",
"2024-05-30 21:05:03,002: Creating lengths.\n",
"2024-05-30 21:05:03,096: Replacing missing lengths.\n",
"2024-05-30 21:05:08,628: Recreating gene counts from abundances.\n",
"2024-05-30 21:05:08,628: Setting counts to length scaled TPM.\n",
"2024-05-30 21:05:08,631: Creating gene expression dataset.\n",
"2024-05-30 21:05:08,658: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_lengthScaledTPM.csv.\n",
"2024-05-30 21:05:08,751: Finished the import in 8.06 seconds.\n"
"2024-06-04 19:00:50,027: Starting the import.\n",
"Reading quantification files: 4it [00:01, 2.84it/s]\n",
"2024-06-04 19:00:51,561: Converting transcript-level expression to gene-level expression.\n",
"2024-06-04 19:00:51,981: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
"2024-06-04 19:00:52,335: Matching gene_ids.\n",
"2024-06-04 19:00:52,542: Creating gene abundance.\n",
"2024-06-04 19:00:52,820: Creating gene counts.\n",
"2024-06-04 19:00:52,936: Creating lengths.\n",
"2024-06-04 19:00:53,089: Replacing missing lengths.\n",
"2024-06-04 19:00:58,156: Creating gene expression dataset.\n",
"2024-06-04 19:00:58,193: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_no.csv.\n",
"2024-06-04 19:00:58,273: Finished the import in 8.25 seconds.\n",
"2024-06-04 19:00:59,769: Starting the import.\n",
"Reading quantification files: 4it [00:01, 2.66it/s]\n",
"2024-06-04 19:01:01,409: Converting transcript-level expression to gene-level expression.\n",
"2024-06-04 19:01:01,837: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
"2024-06-04 19:01:02,160: Matching gene_ids.\n",
"2024-06-04 19:01:02,321: Creating gene abundance.\n",
"2024-06-04 19:01:02,599: Creating gene counts.\n",
"2024-06-04 19:01:02,673: Creating lengths.\n",
"2024-06-04 19:01:02,791: Replacing missing lengths.\n",
"2024-06-04 19:01:07,705: Recreating gene counts from abundances.\n",
"2024-06-04 19:01:07,706: Setting the counts to scaled TPM.\n",
"2024-06-04 19:01:07,707: Creating gene expression dataset.\n",
"2024-06-04 19:01:07,736: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_scaledTPM.csv.\n",
"2024-06-04 19:01:07,827: Finished the import in 8.06 seconds.\n",
"2024-06-04 19:01:09,437: Starting the import.\n",
"Reading quantification files: 4it [00:01, 3.27it/s]\n",
"2024-06-04 19:01:10,794: Converting transcript-level expression to gene-level expression.\n",
"2024-06-04 19:01:11,424: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
"2024-06-04 19:01:11,870: Matching gene_ids.\n",
"2024-06-04 19:01:12,045: Creating gene abundance.\n",
"2024-06-04 19:01:12,385: Creating gene counts.\n",
"2024-06-04 19:01:12,462: Creating lengths.\n",
"2024-06-04 19:01:12,586: Replacing missing lengths.\n",
"2024-06-04 19:01:17,438: Recreating gene counts from abundances.\n",
"2024-06-04 19:01:17,439: Setting counts to length scaled TPM.\n",
"2024-06-04 19:01:17,444: Creating gene expression dataset.\n",
"2024-06-04 19:01:17,484: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_lengthScaledTPM.csv.\n",
"2024-06-04 19:01:17,586: Finished the import in 8.15 seconds.\n"
]
}
],
Expand All @@ -180,7 +180,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -201,6 +201,13 @@
"pd.testing.assert_frame_equal(counts_tximport_scaledTPM, counts_pytximport_scaledTPM)\n",
"pd.testing.assert_frame_equal(counts_tximport_lengthScaledTPM, counts_pytximport_lengthScaledTPM)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
1 change: 1 addition & 0 deletions test/test_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def test_correctness(
fabry_directory / "transcript_gene_mapping_human.csv",
ignore_transcript_version=True,
ignore_after_bar=True,
output_type="xarray",
counts_from_abundance=counts_from_abundance, # type: ignore
)

Expand Down
Loading

0 comments on commit d9b128b

Please sign in to comment.