From 0d48524f9d18954c5f5476d02ed49acdd2253755 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Thu, 2 Feb 2023 14:33:58 +0100
Subject: [PATCH 1/6] little functions in utils + tests

---
 moonstone/analysis/diversity/base.py          |  5 +-
 moonstone/utils/phylogenetic_tree_editing.py  | 62 +++++++++++++++++++
 tests/utils/test_phylogenetic_tree_editing.py | 53 ++++++++++++++++
 3 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 moonstone/utils/phylogenetic_tree_editing.py
 create mode 100644 tests/utils/test_phylogenetic_tree_editing.py

diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index 992aa92b..97b19f04 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -163,7 +163,10 @@ def _make_graph(
 
     def _structure_remodelling(self, datastruct: Union[pd.Series, pd.DataFrame], structure: str, sym: bool):
         if sym:
-            datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])])
+            if isinstance(datastruct, pd.Series):
+                datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])])
+            else:  # ed. pd.DataFrame
+                datastruct = datastruct.fillna(datastruct.transpose())            
         if structure == 'dataframe':
             datastruct = datastruct.unstack(level=1)
             datastruct.index.name = None
diff --git a/moonstone/utils/phylogenetic_tree_editing.py b/moonstone/utils/phylogenetic_tree_editing.py
new file mode 100644
index 00000000..93b3ae9d
--- /dev/null
+++ b/moonstone/utils/phylogenetic_tree_editing.py
@@ -0,0 +1,62 @@
+"""Adapt Phylogenetic Tree to counts dataframe"""
+
+import pandas as pd
+import re
+from typing import Union
+
+
+def generate_translation_dictionary(
+    new_otu_id_name_ser: pd.Series,
+    ):
+    """
+    Args:
+        - new_otu_id_name_ser: pd.Series issued from kraken 2 count dataframe with only new_otu_id_name column
+    """
+    level = "species"  # only works with species for now
+    names = new_otu_id_name_ser.index.get_level_values(level)  # the caller's index is left untouched
+    keep = ~names.str.contains("(", regex=False)  # skip labels such as "Lactobacillales (order)"
+    dic_translate_tree = dict(zip(new_otu_id_name_ser[keep].astype(str), names[keep]))  # {str(id): species name}
+    return dic_translate_tree
+
+
+def replacement(
+    matchobj, 
+    dic_translate_tree: dict, 
+    quotechr: str
+    ) -> str:
+    s = matchobj.group(0).split(", ")[-1].rstrip("*"+quotechr)
+    if s in dic_translate_tree.keys():
+        return quotechr+dic_translate_tree[s]+quotechr
+    else: 
+        return matchobj.group(0)
+
+def replacing_labels(
+    tree_string: str,
+    dic_translate_tree: dict,
+    quotechr = "'"
+):
+    return re.sub("'[^,]*, [0-9]*\*?'", lambda  match: replacement(match, dic_translate_tree, quotechr), tree_string)
+
+
+def adapt_phylogenetic_tree_to_counts_df(
+    new_otu_id_name_ser: pd.Series,
+    tree_file: str,
+    output_tree_file: str,
+    quotechr = "'"
+    ):
+    """
+    Translate phylogenetic tree labels to names present in a counts dataframe using the txid as key
+    Args:
+        - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3)
+        - tree_file: path to the tree file to adapt. The format of the tree leaves labels should be '{species name}, {txid}' or '{species name}, {txid}*'
+        - output_tree_file: path to the output adapted tree file
+        - quotechr: quote character used as delimiter of labels in tree
+    """
+    dic_translate_tree = generate_translation_dictionary(new_otu_id_name_ser)
+    infile = open(tree_file, "r")
+    T = infile.read()
+    infile.close()
+
+    outfile = open(output_tree_file, "w")
+    outfile.write(replacing_labels(T, dic_translate_tree, quotechr))
+    outfile.close()
\ No newline at end of file
diff --git a/tests/utils/test_phylogenetic_tree_editing.py b/tests/utils/test_phylogenetic_tree_editing.py
new file mode 100644
index 00000000..9fc179e7
--- /dev/null
+++ b/tests/utils/test_phylogenetic_tree_editing.py
@@ -0,0 +1,53 @@
+from cgi import test
+import pandas as pd
+
+from unittest import TestCase
+
+from moonstone.utils.phylogenetic_tree_editing import (
+    generate_translation_dictionary,
+    replacing_labels
+)
+
+
+class TestPhylogeneticTreeAdaptation(TestCase):
+    def test_generate_translation_dictionary(self):
+        count_df = pd.DataFrame(
+            [
+                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillales (order)', 'Lactobacillales (order)', 'Lactobacillales (order)', 186826, 4.3],  # noqa
+                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_jensenii', 109790, 1.0],  # noqa
+                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners', 147802, 3.5]  # noqa
+            ],
+            columns=[
+                'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
+                'NCBI_taxonomy_ID', 'SAMPLE_1'
+            ]
+        )
+        count_df = count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
+        expected_dict = {
+            '109790': 'Lactobacillus_jensenii',
+            '147802': 'Lactobacillus_iners',
+        }
+        tested_dict = generate_translation_dictionary(count_df['NCBI_taxonomy_ID'])
+        self.assertDictEqual(tested_dict, expected_dict)
+
+    def test_replacing_labels(self):
+        tree_string = "((('Lactobacillus jensenii, 109790':0.35,\
+'Lactobacillus iners, 147802':0.15):0.75,\
+'Lactobacillus ruminis CAG:367, 1263085*':1):0.5,\
+('Prevotella sp. oral taxon 473, 712469':0.5,\
+'Enterococcus lactis, 357441':0.05):1)root;\n"
+        tested_string = replacing_labels(
+            tree_string, 
+            {
+                '109790': 'Lactobacillus_jensenii', 
+                '147802': 'Lactobacillus_iners', 
+                '712469': 'Alloprevotella_Prevotella sp. oral taxon 473', 
+                '1263085': 'Lactobacillus_ruminis CAG:367'
+            }
+        )
+        expected_string = "((('Lactobacillus_jensenii':0.35,\
+'Lactobacillus_iners':0.15):0.75,\
+'Lactobacillus_ruminis CAG:367':1):0.5,\
+('Alloprevotella_Prevotella sp. oral taxon 473':0.5,\
+'Enterococcus lactis, 357441':0.05):1)root;\n"
+        self.assertEqual(tested_string, expected_string)
\ No newline at end of file

From 934d0f92fcb04d39cc090a2114b2f4e73580b7b8 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Thu, 2 Feb 2023 14:50:59 +0100
Subject: [PATCH 2/6] flake8 corrections

---
 moonstone/analysis/diversity/base.py          |  2 +-
 moonstone/utils/phylogenetic_tree_editing.py  | 30 ++++++++++---------
 tests/utils/test_phylogenetic_tree_editing.py | 11 ++++---
 3 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index 97b19f04..344da724 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -166,7 +166,7 @@ def _structure_remodelling(self, datastruct: Union[pd.Series, pd.DataFrame], str
             if isinstance(datastruct, pd.Series):
                 datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])])
             else:  # ed. pd.DataFrame
-                datastruct = datastruct.fillna(datastruct.transpose())            
+                datastruct = datastruct.fillna(datastruct.transpose())
         if structure == 'dataframe':
             datastruct = datastruct.unstack(level=1)
             datastruct.index.name = None
diff --git a/moonstone/utils/phylogenetic_tree_editing.py b/moonstone/utils/phylogenetic_tree_editing.py
index 93b3ae9d..59f816af 100644
--- a/moonstone/utils/phylogenetic_tree_editing.py
+++ b/moonstone/utils/phylogenetic_tree_editing.py
@@ -2,12 +2,11 @@
 
 import pandas as pd
 import re
-from typing import Union
 
 
 def generate_translation_dictionary(
-    new_otu_id_name_ser: pd.Series,
-    ):
+    new_otu_id_name_ser: pd.Series
+):
     """
     Args:
         - new_otu_id_name_ser: pd.Series issued from kraken 2 count dataframe with only new_otu_id_name column
@@ -20,35 +19,38 @@ def generate_translation_dictionary(
 
 
 def replacement(
-    matchobj, 
-    dic_translate_tree: dict, 
+    matchobj,
+    dic_translate_tree: dict,
     quotechr: str
-    ) -> str:
+) -> str:
     s = matchobj.group(0).split(", ")[-1].rstrip("*"+quotechr)
     if s in dic_translate_tree.keys():
         return quotechr+dic_translate_tree[s]+quotechr
-    else: 
+    else:
         return matchobj.group(0)
 
+
 def replacing_labels(
     tree_string: str,
     dic_translate_tree: dict,
-    quotechr = "'"
+    quotechr: str = "'"
 ):
-    return re.sub("'[^,]*, [0-9]*\*?'", lambda  match: replacement(match, dic_translate_tree, quotechr), tree_string)
+    return re.sub(r"'[^,]*, [0-9]*\*?'", lambda match: replacement(match, dic_translate_tree, quotechr), tree_string)
 
 
 def adapt_phylogenetic_tree_to_counts_df(
     new_otu_id_name_ser: pd.Series,
     tree_file: str,
     output_tree_file: str,
-    quotechr = "'"
-    ):
+    quotechr: str = "'"
+):
     """
     Translate phylogenetic tree labels to names present in a counts dataframe using the txid as key
     Args:
-        - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3)
-        - tree_file: path to the tree file to adapt. The format of the tree leaves labels should be '{species name}, {txid}' or '{species name}, {txid}*'
+        - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column
+          ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3)
+        - tree_file: path to the tree file to adapt. The format of the tree leaves labels should be
+          '{species name}, {txid}' or '{species name}, {txid}*'
         - output_tree_file: path to the output adapted tree file
         - quotechr: quote character used as delimiter of labels in tree
     """
@@ -59,4 +61,4 @@ def adapt_phylogenetic_tree_to_counts_df(
 
     outfile = open(output_tree_file, "w")
     outfile.write(replacing_labels(T, dic_translate_tree, quotechr))
-    outfile.close()
\ No newline at end of file
+    outfile.close()
diff --git a/tests/utils/test_phylogenetic_tree_editing.py b/tests/utils/test_phylogenetic_tree_editing.py
index 9fc179e7..39122303 100644
--- a/tests/utils/test_phylogenetic_tree_editing.py
+++ b/tests/utils/test_phylogenetic_tree_editing.py
@@ -1,4 +1,3 @@
-from cgi import test
 import pandas as pd
 
 from unittest import TestCase
@@ -37,11 +36,11 @@ def test_replacing_labels(self):
 ('Prevotella sp. oral taxon 473, 712469':0.5,\
 'Enterococcus lactis, 357441':0.05):1)root;\n"
         tested_string = replacing_labels(
-            tree_string, 
+            tree_string,
             {
-                '109790': 'Lactobacillus_jensenii', 
-                '147802': 'Lactobacillus_iners', 
-                '712469': 'Alloprevotella_Prevotella sp. oral taxon 473', 
+                '109790': 'Lactobacillus_jensenii',
+                '147802': 'Lactobacillus_iners',
+                '712469': 'Alloprevotella_Prevotella sp. oral taxon 473',
                 '1263085': 'Lactobacillus_ruminis CAG:367'
             }
         )
@@ -50,4 +49,4 @@ def test_replacing_labels(self):
 'Lactobacillus_ruminis CAG:367':1):0.5,\
 ('Alloprevotella_Prevotella sp. oral taxon 473':0.5,\
 'Enterococcus lactis, 357441':0.05):1)root;\n"
-        self.assertEqual(tested_string, expected_string)
\ No newline at end of file
+        self.assertEqual(tested_string, expected_string)

From ec34836941f1c2adf19bfbbf58a09dd3006d697a Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Thu, 2 Feb 2023 15:03:18 +0100
Subject: [PATCH 3/6] removing python 3.6 from setup and adding 3.9/3.10

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 4b3976b4..a7f7303f 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8, 3.9, 3.10]
 
     steps:
     - uses: actions/checkout@v2

From 85bc534af5d9e8f1dd3e497a1293dce9dee31319 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Thu, 2 Feb 2023 15:05:08 +0100
Subject: [PATCH 4/6] quote python versions in CI matrix (unquoted 3.10 is read as 3.1)

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index a7f7303f..93807e5d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.8, 3.9, 3.10]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
 
     steps:
     - uses: actions/checkout@v2

From fec9a50d6d8b55e8ad5046e480a30eba972691e0 Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Thu, 2 Feb 2023 17:07:34 +0100
Subject: [PATCH 5/6] CI: go back to testing python 3.7 and 3.8 only

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 93807e5d..b5b393df 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.7", "3.8"]
 
     steps:
     - uses: actions/checkout@v2

From 63276d46da79e8d0e9b770baf9cb4b2b986bd67d Mon Sep 17 00:00:00 2001
From: Agnes BAUD <agnes.baud@pasteur.fr>
Date: Thu, 2 Feb 2023 18:11:16 +0100
Subject: [PATCH 6/6] tree as string can be given and returned

---
 moonstone/utils/phylogenetic_tree_editing.py  | 29 ++++++++++++-------
 tests/utils/test_phylogenetic_tree_editing.py | 25 ++++++++++++----
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/moonstone/utils/phylogenetic_tree_editing.py b/moonstone/utils/phylogenetic_tree_editing.py
index 59f816af..208cc072 100644
--- a/moonstone/utils/phylogenetic_tree_editing.py
+++ b/moonstone/utils/phylogenetic_tree_editing.py
@@ -40,8 +40,8 @@ def replacing_labels(
 
 def adapt_phylogenetic_tree_to_counts_df(
     new_otu_id_name_ser: pd.Series,
-    tree_file: str,
-    output_tree_file: str,
+    tree: str,
+    output_tree_file: str = None,
     quotechr: str = "'"
 ):
     """
@@ -49,16 +49,25 @@ def adapt_phylogenetic_tree_to_counts_df(
     Args:
         - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column
           ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3)
-        - tree_file: path to the tree file to adapt. The format of the tree leaves labels should be
+        - tree: path to the tree file to adapt or tree as a string. The format of the tree leaves labels should be
           '{species name}, {txid}' or '{species name}, {txid}*'
-        - output_tree_file: path to the output adapted tree file
+        - output_tree_file: path to the output adapted tree file.
+          If None, the function returns the adapted tree as a string
         - quotechr: quote character used as delimiter of labels in tree
     """
+    try:
+        with open(tree, "r") as infile:
+            T = infile.read()
+    except OSError:
+        # no such file, or string too long to be a file name: `tree` is the tree itself
+        T = tree
+
     dic_translate_tree = generate_translation_dictionary(new_otu_id_name_ser)
-    infile = open(tree_file, "r")
-    T = infile.read()
-    infile.close()
+    T = replacing_labels(T, dic_translate_tree, quotechr)
 
-    outfile = open(output_tree_file, "w")
-    outfile.write(replacing_labels(T, dic_translate_tree, quotechr))
-    outfile.close()
+    if not output_tree_file:
+        return T  # no output path given: hand the adapted tree back as a string
+    # `with` closes the file even if the write raises; as before, nothing is
+    # returned once the tree has been written to disk
+    with open(output_tree_file, "w") as outfile:
+        outfile.write(T)
diff --git a/tests/utils/test_phylogenetic_tree_editing.py b/tests/utils/test_phylogenetic_tree_editing.py
index 39122303..bdede51e 100644
--- a/tests/utils/test_phylogenetic_tree_editing.py
+++ b/tests/utils/test_phylogenetic_tree_editing.py
@@ -4,13 +4,14 @@
 
 from moonstone.utils.phylogenetic_tree_editing import (
     generate_translation_dictionary,
-    replacing_labels
+    replacing_labels,
+    adapt_phylogenetic_tree_to_counts_df
 )
 
 
 class TestPhylogeneticTreeAdaptation(TestCase):
-    def test_generate_translation_dictionary(self):
-        count_df = pd.DataFrame(
+    def setUp(self):
+        self.count_df = pd.DataFrame(
             [
                 ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillales (order)', 'Lactobacillales (order)', 'Lactobacillales (order)', 186826, 4.3],  # noqa
                 ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_jensenii', 109790, 1.0],  # noqa
@@ -21,12 +22,14 @@ def test_generate_translation_dictionary(self):
                 'NCBI_taxonomy_ID', 'SAMPLE_1'
             ]
         )
-        count_df = count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
+        self.count_df = self.count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
+
+    def test_generate_translation_dictionary(self):
         expected_dict = {
             '109790': 'Lactobacillus_jensenii',
             '147802': 'Lactobacillus_iners',
         }
-        tested_dict = generate_translation_dictionary(count_df['NCBI_taxonomy_ID'])
+        tested_dict = generate_translation_dictionary(self.count_df['NCBI_taxonomy_ID'])
         self.assertDictEqual(tested_dict, expected_dict)
 
     def test_replacing_labels(self):
@@ -50,3 +53,15 @@ def test_replacing_labels(self):
 ('Alloprevotella_Prevotella sp. oral taxon 473':0.5,\
 'Enterococcus lactis, 357441':0.05):1)root;\n"
         self.assertEqual(tested_string, expected_string)
+
+    def test_adapt_phylogenetic_tree_to_counts_df(self):
+        tree_string = "(('Lactobacillus jensenii, 109790':0.35,\
+'Lactobacillus iners, 147802':0.15):0.75,\
+'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n"
+        tested_string = adapt_phylogenetic_tree_to_counts_df(  # tree passed as a string, not as a path
+            self.count_df['NCBI_taxonomy_ID'], tree_string
+        )
+        expected_string = "(('Lactobacillus_jensenii':0.35,\
+'Lactobacillus_iners':0.15):0.75,\
+'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n"
+        self.assertEqual(tested_string, expected_string)