Skip to content

Commit

Permalink
Merge pull request #98 from motleystate/97-adapt-phylogenetic-tree
Browse files Browse the repository at this point in the history
Adapt Phylogenetic Tree to count dataframe
  • Loading branch information
AgnesBaud authored Feb 3, 2023
2 parents 6f62464 + 63276d4 commit 56db2cf
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: ["3.7", "3.8"]

steps:
- uses: actions/checkout@v2
Expand Down
5 changes: 4 additions & 1 deletion moonstone/analysis/diversity/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,10 @@ def _make_graph(

def _structure_remodelling(self, datastruct: Union[pd.Series, pd.DataFrame], structure: str, sym: bool):
if sym:
datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])])
if isinstance(datastruct, pd.Series):
datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])])
else: # ed. pd.DataFrame
datastruct = datastruct.fillna(datastruct.transpose())
if structure == 'dataframe':
datastruct = datastruct.unstack(level=1)
datastruct.index.name = None
Expand Down
73 changes: 73 additions & 0 deletions moonstone/utils/phylogenetic_tree_editing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Adapt Phylogenetic Tree to counts dataframe"""

import pandas as pd
import re


def generate_translation_dictionary(
    new_otu_id_name_ser: pd.Series
):
    """
    Build the dictionary used to rename tree leaves: taxonomy id (as a string) -> species name.

    Rows whose species label contains "(" (e.g. "Lactobacillales (order)") are left out of the
    dictionary -- presumably these mark taxa that are not resolved down to species level.

    Args:
        - new_otu_id_name_ser: pd.Series issued from kraken 2 count dataframe with only new_otu_id_name column.
          Its index must expose a level named "species". The series is not modified.

    Returns:
        dict mapping each value of the series, converted to str, to the species name found on the same row.
    """
    level = "species"  # only works with species for now
    # Read the level values instead of reassigning `.index`: the previous implementation replaced
    # the index of the caller's series in place, silently dropping its other taxonomy levels.
    species_names = new_otu_id_name_ser.index.get_level_values(level)
    is_species = ~species_names.str.contains("(", regex=False)
    taxonomy_ids = new_otu_id_name_ser[is_species].astype(str)
    return dict(zip(taxonomy_ids, species_names[is_species]))


def replacement(
    matchobj,
    dic_translate_tree: dict,
    quotechr: str
) -> str:
    """
    Return the text that should replace one matched tree label.

    Args:
        - matchobj: regex match whose full text is a quoted label such as '{species name}, {txid}'
          or '{species name}, {txid}*'
        - dic_translate_tree: dictionary mapping txid (str) to the new label
        - quotechr: quote character delimiting the labels in the tree

    Returns:
        The new label wrapped in `quotechr` when the txid is a key of `dic_translate_tree`,
        otherwise the matched text unchanged.
    """
    label = matchobj.group(0)
    # The txid is whatever follows the last ", "; drop the closing quote and the optional "*" marker.
    txid = label.split(", ")[-1].rstrip("*" + quotechr)
    if txid in dic_translate_tree:
        return quotechr + dic_translate_tree[txid] + quotechr
    return label


def replacing_labels(
    tree_string: str,
    dic_translate_tree: dict,
    quotechr: str = "'"
):
    """
    Replace every '{species name}, {txid}' label of a tree by its translation, when one exists.

    Args:
        - tree_string: tree as a string
        - dic_translate_tree: dictionary mapping txid (str) to the new label
        - quotechr: quote character used as delimiter of labels in tree

    Returns:
        The tree string with translated labels; labels whose txid is not in the dictionary are kept as is.
    """
    # Build the pattern from `quotechr` (it used to be hard-coded to a single quote, so trees
    # delimited with any other character were never translated).
    quote = re.escape(quotechr)
    label_pattern = quote + r"[^,]*, [0-9]*\*?" + quote
    return re.sub(
        label_pattern,
        lambda match: replacement(match, dic_translate_tree, quotechr),
        tree_string
    )


def adapt_phylogenetic_tree_to_counts_df(
    new_otu_id_name_ser: pd.Series,
    tree: str,
    output_tree_file: str = None,
    quotechr: str = "'"
):
    """
    Translate phylogenetic tree labels to names present in a counts dataframe using the txid as key

    Args:
        - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column
          ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3)
        - tree: path to the tree file to adapt or tree as a string. The format of the tree leaves labels should be
          '{species name}, {txid}' or '{species name}, {txid}*'
        - output_tree_file: path to the output adapted tree file.
          If None, then the function returns the adapted tree as a string
        - quotechr: quote character used as delimiter of labels in tree

    Returns:
        The adapted tree as a string when `output_tree_file` is not given, None otherwise
        (the adapted tree is then written to `output_tree_file`).
    """
    import os  # local import: only needed here to tell a file path from an inline tree

    # Check for an existing file instead of relying on FileNotFoundError: opening a long inline
    # tree as if it were a path fails with a different OSError ("File name too long").
    if os.path.isfile(tree):
        with open(tree, "r") as infile:
            tree_string = infile.read()
    else:
        tree_string = tree

    dic_translate_tree = generate_translation_dictionary(new_otu_id_name_ser)
    tree_string = replacing_labels(tree_string, dic_translate_tree, quotechr)

    if output_tree_file:
        with open(output_tree_file, "w") as outfile:
            outfile.write(tree_string)
        return None
    return tree_string
67 changes: 67 additions & 0 deletions tests/utils/test_phylogenetic_tree_editing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import pandas as pd

from unittest import TestCase

from moonstone.utils.phylogenetic_tree_editing import (
generate_translation_dictionary,
replacing_labels,
adapt_phylogenetic_tree_to_counts_df
)


class TestPhylogeneticTreeAdaptation(TestCase):
    """Tests for the helpers renaming phylogenetic tree labels after a counts dataframe."""

    def setUp(self):
        # Three taxa sharing the same lineage down to the order; the first row carries
        # an "(order)" placeholder instead of a real family/genus/species.
        taxonomy_levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
        shared_lineage = ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales']
        rows = [
            shared_lineage + ['Lactobacillales (order)'] * 3 + [186826, 4.3],
            shared_lineage + ['Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_jensenii', 109790, 1.0],
            shared_lineage + ['Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners', 147802, 3.5],
        ]
        self.count_df = pd.DataFrame(
            rows,
            columns=taxonomy_levels + ['NCBI_taxonomy_ID', 'SAMPLE_1'],
        ).set_index(taxonomy_levels)

    def test_generate_translation_dictionary(self):
        # The "(order)" row must not appear in the dictionary.
        observed = generate_translation_dictionary(self.count_df['NCBI_taxonomy_ID'])
        self.assertDictEqual(
            observed,
            {
                '109790': 'Lactobacillus_jensenii',
                '147802': 'Lactobacillus_iners',
            },
        )

    def test_replacing_labels(self):
        translations = {
            '109790': 'Lactobacillus_jensenii',
            '147802': 'Lactobacillus_iners',
            '712469': 'Alloprevotella_Prevotella sp. oral taxon 473',
            '1263085': 'Lactobacillus_ruminis CAG:367',
        }
        tree_string = (
            "((('Lactobacillus jensenii, 109790':0.35,"
            "'Lactobacillus iners, 147802':0.15):0.75,"
            "'Lactobacillus ruminis CAG:367, 1263085*':1):0.5,"
            "('Prevotella sp. oral taxon 473, 712469':0.5,"
            "'Enterococcus lactis, 357441':0.05):1)root;\n"
        )
        # The last leaf has no entry in the dictionary and must be left untouched.
        expected_string = (
            "((('Lactobacillus_jensenii':0.35,"
            "'Lactobacillus_iners':0.15):0.75,"
            "'Lactobacillus_ruminis CAG:367':1):0.5,"
            "('Alloprevotella_Prevotella sp. oral taxon 473':0.5,"
            "'Enterococcus lactis, 357441':0.05):1)root;\n"
        )
        self.assertEqual(replacing_labels(tree_string, translations), expected_string)

    def test_adapt_phylogenetic_tree_to_counts_df(self):
        tree_string = (
            "((('Lactobacillus jensenii, 109790':0.35,"
            "'Lactobacillus iners, 147802':0.15):0.75,"
            "'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n"
        )
        # Only the two txids present in the counts dataframe get translated.
        expected_string = (
            "((('Lactobacillus_jensenii':0.35,"
            "'Lactobacillus_iners':0.15):0.75,"
            "'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n"
        )
        observed = adapt_phylogenetic_tree_to_counts_df(
            self.count_df['NCBI_taxonomy_ID'], tree_string
        )
        self.assertEqual(observed, expected_string)

0 comments on commit 56db2cf

Please sign in to comment.