Merge pull request #98 from motleystate/97-adapt-phylogenetic-tree

Adapt Phylogenetic Tree to count dataframe
motleystate · Feb 3, 2023 · 56db2cf · 56db2cf
2 parents 6f62464 + 63276d4
commit 56db2cf
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 2 deletions.
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: ["3.7", "3.8"]
 
     steps:
     - uses: actions/checkout@v2

diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
@@ -163,7 +163,10 @@ def _make_graph(
 
     def _structure_remodelling(self, datastruct: Union[pd.Series, pd.DataFrame], structure: str, sym: bool):
         if sym:
-            datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])])
+            if isinstance(datastruct, pd.Series):
+                datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])])
+            else:  # ed. pd.DataFrame
+                datastruct = datastruct.fillna(datastruct.transpose())
         if structure == 'dataframe':
             datastruct = datastruct.unstack(level=1)
             datastruct.index.name = None

diff --git a/moonstone/utils/phylogenetic_tree_editing.py b/moonstone/utils/phylogenetic_tree_editing.py
@@ -0,0 +1,73 @@
+"""Adapt Phylogenetic Tree to counts dataframe"""
+
+import os
+import re
+
+import pandas as pd
+
+
+def generate_translation_dictionary(
+    new_otu_id_name_ser: pd.Series
+):
+    """
+    Build the {taxonomy id: species name} dictionary used to relabel a phylogenetic tree.
+
+    The input series is only read: its index is not reassigned.
+
+    Args:
+        - new_otu_id_name_ser: pd.Series issued from kraken 2 count dataframe with only new_otu_id_name column.
+          Its index must contain a "species" level.
+
+    Returns:
+        dict mapping each value of the series, cast to str, to the species name found on the same row.
+        Rows whose species name contains "(" (e.g. 'Lactobacillales (order)') are left out.
+    """
+    level = "species"  # only works with species for now
+    # Read the species names from the index instead of overwriting the index of the caller's series.
+    species = new_otu_id_name_ser.index.get_level_values(level)
+    is_species = ~species.str.contains("(", regex=False)
+    otu_ids = new_otu_id_name_ser[is_species].astype(str)
+    return dict(zip(otu_ids, species[is_species]))
+
+
+def replacement(
+    matchobj,
+    dic_translate_tree: dict,
+    quotechr: str
+) -> str:
+    """
+    Return the text that should replace one matched tree label.
+
+    Args:
+        - matchobj: match object whose group(0) is a quoted label,
+          '{species name}, {txid}' or '{species name}, {txid}*'
+        - dic_translate_tree: dictionary {txid (str): new label}
+        - quotechr: quote character used as delimiter of labels in tree
+
+    Returns:
+        The new label wrapped in quotechr when the txid is a key of the dictionary,
+        the matched label unchanged otherwise.
+    """
+    label = matchobj.group(0)
+    # The txid is what follows the last ", ", once the optional "*" and the closing quote are stripped.
+    txid = label.split(", ")[-1].rstrip("*"+quotechr)
+    if txid in dic_translate_tree:
+        return quotechr+dic_translate_tree[txid]+quotechr
+    return label
+
+
+def replacing_labels(
+    tree_string: str,
+    dic_translate_tree: dict,
+    quotechr: str = "'"
+):
+    """
+    Replace every label of the tree whose txid is a key of the translation dictionary.
+
+    Args:
+        - tree_string: tree as a string, with leaves labelled
+          '{species name}, {txid}' or '{species name}, {txid}*'
+        - dic_translate_tree: dictionary {txid (str): new label}
+        - quotechr: quote character used as delimiter of labels in tree
+
+    Returns:
+        The tree string with the translated labels; labels with an unknown txid are kept as they are.
+    """
+    # Build the pattern from quotechr (escaped, as it may be a regex metacharacter) so that trees
+    # delimiting their labels with another character than "'" are matched too.
+    quote = re.escape(quotechr)
+    label_pattern = quote + r"[^,]*, [0-9]*\*?" + quote
+    return re.sub(label_pattern, lambda match: replacement(match, dic_translate_tree, quotechr), tree_string)
+
+
+def adapt_phylogenetic_tree_to_counts_df(
+    new_otu_id_name_ser: pd.Series,
+    tree: str,
+    output_tree_file: str = None,
+    quotechr: str = "'"
+):
+    """
+    Translate phylogenetic tree labels to names present in a counts dataframe using the txid as key
+    Args:
+        - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column
+          ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3)
+        - tree: path to the tree file to adapt or tree as a string. The format of the tree leaves labels should be
+          '{species name}, {txid}' or '{species name}, {txid}*'.
+          A value that is not the path of an existing file is treated as the tree itself.
+        - output_tree_file: path to the output adapted tree file.
+          If None, the function returns the adapted tree as a string
+        - quotechr: quote character used as delimiter of labels in tree
+
+    Returns:
+        The adapted tree as a string when no output_tree_file is given, None otherwise
+        (the adapted tree is then written to output_tree_file).
+    """
+    # os.path.isfile() answers False for anything that is not an existing file, so a tree passed as a
+    # string is never handed to open(), which can raise more than FileNotFoundError on such input
+    # (e.g. a name that is too long for the OS).
+    if os.path.isfile(tree):
+        with open(tree, "r") as infile:
+            tree_string = infile.read()
+    else:
+        tree_string = tree
+
+    dic_translate_tree = generate_translation_dictionary(new_otu_id_name_ser)
+    tree_string = replacing_labels(tree_string, dic_translate_tree, quotechr)
+
+    if output_tree_file:
+        with open(output_tree_file, "w") as outfile:
+            outfile.write(tree_string)
+    else:
+        return tree_string
diff --git a/tests/utils/test_phylogenetic_tree_editing.py b/tests/utils/test_phylogenetic_tree_editing.py
@@ -0,0 +1,67 @@
+import pandas as pd
+
+from unittest import TestCase
+
+from moonstone.utils.phylogenetic_tree_editing import (
+    generate_translation_dictionary,
+    replacing_labels,
+    adapt_phylogenetic_tree_to_counts_df
+)
+
+
+class TestPhylogeneticTreeAdaptation(TestCase):
+    """Tests for the helpers relabelling a phylogenetic tree with the names of a counts dataframe."""
+
+    def setUp(self):
+        """Build a counts dataframe indexed by taxonomic levels, with one txid column and one sample."""
+        # The first row is not resolved down to species: its species name is 'Lactobacillales (order)'.
+        self.count_df = pd.DataFrame(
+            [
+                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillales (order)', 'Lactobacillales (order)', 'Lactobacillales (order)', 186826, 4.3],  # noqa
+                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_jensenii', 109790, 1.0],  # noqa
+                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners', 147802, 3.5]  # noqa
+            ],
+            columns=[
+                'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
+                'NCBI_taxonomy_ID', 'SAMPLE_1'
+            ]
+        )
+        self.count_df = self.count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
+
+    def test_generate_translation_dictionary(self):
+        """Only the two rows with a real species name are expected, keyed by their txid as a string."""
+        expected_dict = {
+            '109790': 'Lactobacillus_jensenii',
+            '147802': 'Lactobacillus_iners',
+        }
+        tested_dict = generate_translation_dictionary(self.count_df['NCBI_taxonomy_ID'])
+        self.assertDictEqual(tested_dict, expected_dict)
+
+    def test_replacing_labels(self):
+        """Labels whose txid is in the dictionary are replaced, with or without a trailing '*'."""
+        tree_string = "((('Lactobacillus jensenii, 109790':0.35,\
+'Lactobacillus iners, 147802':0.15):0.75,\
+'Lactobacillus ruminis CAG:367, 1263085*':1):0.5,\
+('Prevotella sp. oral taxon 473, 712469':0.5,\
+'Enterococcus lactis, 357441':0.05):1)root;\n"
+        tested_string = replacing_labels(
+            tree_string,
+            {
+                '109790': 'Lactobacillus_jensenii',
+                '147802': 'Lactobacillus_iners',
+                '712469': 'Alloprevotella_Prevotella sp. oral taxon 473',
+                '1263085': 'Lactobacillus_ruminis CAG:367'
+            }
+        )
+        # txid 357441 is not in the dictionary: the 'Enterococcus lactis' label is expected unchanged.
+        expected_string = "((('Lactobacillus_jensenii':0.35,\
+'Lactobacillus_iners':0.15):0.75,\
+'Lactobacillus_ruminis CAG:367':1):0.5,\
+('Alloprevotella_Prevotella sp. oral taxon 473':0.5,\
+'Enterococcus lactis, 357441':0.05):1)root;\n"
+        self.assertEqual(tested_string, expected_string)
+
+    def test_adapt_phylogenetic_tree_to_counts_df(self):
+        """The tree is given as a string and, with no output file, the adapted tree is returned."""
+        tree_string = "((('Lactobacillus jensenii, 109790':0.35,\
+'Lactobacillus iners, 147802':0.15):0.75,\
+'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n"
+        tested_string = adapt_phylogenetic_tree_to_counts_df(
+            self.count_df['NCBI_taxonomy_ID'], tree_string
+        )
+        # txid 1263085 is absent from the counts dataframe: its label is expected unchanged.
+        expected_string = "((('Lactobacillus_jensenii':0.35,\
+'Lactobacillus_iners':0.15):0.75,\
+'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n"
+        self.assertEqual(tested_string, expected_string)