From 0d48524f9d18954c5f5476d02ed49acdd2253755 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Thu, 2 Feb 2023 14:33:58 +0100 Subject: [PATCH 1/6] little functions in utils + tests --- moonstone/analysis/diversity/base.py | 5 +- moonstone/utils/phylogenetic_tree_editing.py | 62 +++++++++++++++++++ tests/utils/test_phylogenetic_tree_editing.py | 53 ++++++++++++++++ 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 moonstone/utils/phylogenetic_tree_editing.py create mode 100644 tests/utils/test_phylogenetic_tree_editing.py diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index 992aa92b..97b19f04 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -163,7 +163,10 @@ def _make_graph( def _structure_remodelling(self, datastruct: Union[pd.Series, pd.DataFrame], structure: str, sym: bool): if sym: - datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])]) + if isinstance(datastruct, pd.Series): + datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])]) + else: # ed. 
pd.DataFrame + datastruct = datastruct.fillna(datastruct.transpose()) if structure == 'dataframe': datastruct = datastruct.unstack(level=1) datastruct.index.name = None diff --git a/moonstone/utils/phylogenetic_tree_editing.py b/moonstone/utils/phylogenetic_tree_editing.py new file mode 100644 index 00000000..93b3ae9d --- /dev/null +++ b/moonstone/utils/phylogenetic_tree_editing.py @@ -0,0 +1,62 @@ +"""Adapt Phylogenetic Tree to counts dataframe""" + +import pandas as pd +import re +from typing import Union + + +def generate_translation_dictionary( + new_otu_id_name_ser: pd.Series, + ): + """ + Args: + - new_otu_id_name_ser: pd.Series issued from kraken 2 count dataframe with only new_otu_id_name column + """ + level = "species" # only works with species for now + new_otu_id_name_ser.index = new_otu_id_name_ser.index.get_level_values(level) + new_otu_id_name_ser = new_otu_id_name_ser[~new_otu_id_name_ser.index.str.contains("(", regex=False)].astype(str) + dic_translate_tree = new_otu_id_name_ser.reset_index().set_index(new_otu_id_name_ser.name).to_dict()[level] + return dic_translate_tree + + +def replacement( + matchobj, + dic_translate_tree: dict, + quotechr: str + ) -> str: + s = matchobj.group(0).split(", ")[-1].rstrip("*"+quotechr) + if s in dic_translate_tree.keys(): + return quotechr+dic_translate_tree[s]+quotechr + else: + return matchobj.group(0) + +def replacing_labels( + tree_string: str, + dic_translate_tree: dict, + quotechr = "'" +): + return re.sub("'[^,]*, [0-9]*\*?'", lambda match: replacement(match, dic_translate_tree, quotechr), tree_string) + + +def adapt_phylogenetic_tree_to_counts_df( + new_otu_id_name_ser: pd.Series, + tree_file: str, + output_tree_file: str, + quotechr = "'" + ): + """ + Translate phylogenetic tree labels to names present in a counts dataframe using the txid as key + Args: + - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for 
Metaphlan3) + - tree_file: path to the tree file to adapt. The format of the tree leaves labels should be '{species name}, {txid}' or '{species name}, {txid}*' + - output_tree_file: path to the output adapted tree file + - quotechr: quote character used as delimiter of labels in tree + """ + dic_translate_tree = generate_translation_dictionary(new_otu_id_name_ser) + infile = open(tree_file, "r") + T = infile.read() + infile.close() + + outfile = open(output_tree_file, "w") + outfile.write(replacing_labels(T, dic_translate_tree, quotechr)) + outfile.close() \ No newline at end of file diff --git a/tests/utils/test_phylogenetic_tree_editing.py b/tests/utils/test_phylogenetic_tree_editing.py new file mode 100644 index 00000000..9fc179e7 --- /dev/null +++ b/tests/utils/test_phylogenetic_tree_editing.py @@ -0,0 +1,53 @@ +from cgi import test +import pandas as pd + +from unittest import TestCase + +from moonstone.utils.phylogenetic_tree_editing import ( + generate_translation_dictionary, + replacing_labels +) + + +class TestPhylogeneticTreeAdaptation(TestCase): + def test_generate_translation_dictionary(self): + count_df = pd.DataFrame( + [ + ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillales (order)', 'Lactobacillales (order)', 'Lactobacillales (order)', 186826, 4.3], # noqa + ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_jensenii', 109790, 1.0], # noqa + ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners', 147802, 3.5] # noqa + ], + columns=[ + 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', + 'NCBI_taxonomy_ID', 'SAMPLE_1' + ] + ) + count_df = count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']) + expected_dict = { + '109790': 'Lactobacillus_jensenii', + '147802': 'Lactobacillus_iners', + } + tested_dict = 
generate_translation_dictionary(count_df['NCBI_taxonomy_ID']) + self.assertDictEqual(tested_dict, expected_dict) + + def test_replacing_labels(self): + tree_string = "((('Lactobacillus jensenii, 109790':0.35,\ +'Lactobacillus iners, 147802':0.15):0.75,\ +'Lactobacillus ruminis CAG:367, 1263085*':1):0.5,\ +('Prevotella sp. oral taxon 473, 712469':0.5,\ +'Enterococcus lactis, 357441':0.05):1)root;\n" + tested_string = replacing_labels( + tree_string, + { + '109790': 'Lactobacillus_jensenii', + '147802': 'Lactobacillus_iners', + '712469': 'Alloprevotella_Prevotella sp. oral taxon 473', + '1263085': 'Lactobacillus_ruminis CAG:367' + } + ) + expected_string = "((('Lactobacillus_jensenii':0.35,\ +'Lactobacillus_iners':0.15):0.75,\ +'Lactobacillus_ruminis CAG:367':1):0.5,\ +('Alloprevotella_Prevotella sp. oral taxon 473':0.5,\ +'Enterococcus lactis, 357441':0.05):1)root;\n" + self.assertEqual(tested_string, expected_string) \ No newline at end of file From 934d0f92fcb04d39cc090a2114b2f4e73580b7b8 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Thu, 2 Feb 2023 14:50:59 +0100 Subject: [PATCH 2/6] flake8 corrections --- moonstone/analysis/diversity/base.py | 2 +- moonstone/utils/phylogenetic_tree_editing.py | 30 ++++++++++--------- tests/utils/test_phylogenetic_tree_editing.py | 11 ++++--- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py index 97b19f04..344da724 100644 --- a/moonstone/analysis/diversity/base.py +++ b/moonstone/analysis/diversity/base.py @@ -166,7 +166,7 @@ def _structure_remodelling(self, datastruct: Union[pd.Series, pd.DataFrame], str if isinstance(datastruct, pd.Series): datastruct = pd.concat([datastruct, datastruct.reorder_levels([1, 0])]) else: # ed. 
pd.DataFrame - datastruct = datastruct.fillna(datastruct.transpose()) + datastruct = datastruct.fillna(datastruct.transpose()) if structure == 'dataframe': datastruct = datastruct.unstack(level=1) datastruct.index.name = None diff --git a/moonstone/utils/phylogenetic_tree_editing.py b/moonstone/utils/phylogenetic_tree_editing.py index 93b3ae9d..59f816af 100644 --- a/moonstone/utils/phylogenetic_tree_editing.py +++ b/moonstone/utils/phylogenetic_tree_editing.py @@ -2,12 +2,11 @@ import pandas as pd import re -from typing import Union def generate_translation_dictionary( - new_otu_id_name_ser: pd.Series, - ): + new_otu_id_name_ser: pd.Series +): """ Args: - new_otu_id_name_ser: pd.Series issued from kraken 2 count dataframe with only new_otu_id_name column @@ -20,35 +19,38 @@ def generate_translation_dictionary( def replacement( - matchobj, - dic_translate_tree: dict, + matchobj, + dic_translate_tree: dict, quotechr: str - ) -> str: +) -> str: s = matchobj.group(0).split(", ")[-1].rstrip("*"+quotechr) if s in dic_translate_tree.keys(): return quotechr+dic_translate_tree[s]+quotechr - else: + else: return matchobj.group(0) + def replacing_labels( tree_string: str, dic_translate_tree: dict, - quotechr = "'" + quotechr: str = "'" ): - return re.sub("'[^,]*, [0-9]*\*?'", lambda match: replacement(match, dic_translate_tree, quotechr), tree_string) + return re.sub(r"'[^,]*, [0-9]*\*?'", lambda match: replacement(match, dic_translate_tree, quotechr), tree_string) def adapt_phylogenetic_tree_to_counts_df( new_otu_id_name_ser: pd.Series, tree_file: str, output_tree_file: str, - quotechr = "'" - ): + quotechr: str = "'" +): """ Translate phylogenetic tree labels to names present in a counts dataframe using the txid as key Args: - - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3) - - tree_file: path to the tree file to adapt. 
The format of the tree leaves labels should be '{species name}, {txid}' or '{species name}, {txid}*' + - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column + ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3) + - tree_file: path to the tree file to adapt. The format of the tree leaves labels should be + '{species name}, {txid}' or '{species name}, {txid}*' - output_tree_file: path to the output adapted tree file - quotechr: quote character used as delimiter of labels in tree """ @@ -59,4 +61,4 @@ def adapt_phylogenetic_tree_to_counts_df( outfile = open(output_tree_file, "w") outfile.write(replacing_labels(T, dic_translate_tree, quotechr)) - outfile.close() \ No newline at end of file + outfile.close() diff --git a/tests/utils/test_phylogenetic_tree_editing.py b/tests/utils/test_phylogenetic_tree_editing.py index 9fc179e7..39122303 100644 --- a/tests/utils/test_phylogenetic_tree_editing.py +++ b/tests/utils/test_phylogenetic_tree_editing.py @@ -1,4 +1,3 @@ -from cgi import test import pandas as pd from unittest import TestCase @@ -37,11 +36,11 @@ def test_replacing_labels(self): ('Prevotella sp. oral taxon 473, 712469':0.5,\ 'Enterococcus lactis, 357441':0.05):1)root;\n" tested_string = replacing_labels( - tree_string, + tree_string, { - '109790': 'Lactobacillus_jensenii', - '147802': 'Lactobacillus_iners', - '712469': 'Alloprevotella_Prevotella sp. oral taxon 473', + '109790': 'Lactobacillus_jensenii', + '147802': 'Lactobacillus_iners', + '712469': 'Alloprevotella_Prevotella sp. oral taxon 473', '1263085': 'Lactobacillus_ruminis CAG:367' } ) @@ -50,4 +49,4 @@ def test_replacing_labels(self): 'Lactobacillus_ruminis CAG:367':1):0.5,\ ('Alloprevotella_Prevotella sp. 
oral taxon 473':0.5,\ 'Enterococcus lactis, 357441':0.05):1)root;\n" - self.assertEqual(tested_string, expected_string) \ No newline at end of file + self.assertEqual(tested_string, expected_string) From ec34836941f1c2adf19bfbbf58a09dd3006d697a Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Thu, 2 Feb 2023 15:03:18 +0100 Subject: [PATCH 3/6] removing python 3.6 from setup and adding 3.9/3.10 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4b3976b4..a7f7303f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8, 3.9, 3.10] steps: - uses: actions/checkout@v2 From 85bc534af5d9e8f1dd3e497a1293dce9dee31319 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Thu, 2 Feb 2023 15:05:08 +0100 Subject: [PATCH 4/6] better with quote --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a7f7303f..93807e5d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8, 3.9, 3.10] + python-version: ["3.7", "3.8", "3.9", "3.10"] steps: - uses: actions/checkout@v2 From fec9a50d6d8b55e8ad5046e480a30eba972691e0 Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Thu, 2 Feb 2023 17:07:34 +0100 Subject: [PATCH 5/6] mission aborted --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 93807e5d..b5b393df 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 
+12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8"] steps: - uses: actions/checkout@v2 From 63276d46da79e8d0e9b770baf9cb4b2b986bd67d Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Thu, 2 Feb 2023 18:11:16 +0100 Subject: [PATCH 6/6] tree as string can be given and returned --- moonstone/utils/phylogenetic_tree_editing.py | 29 ++++++++++++------- tests/utils/test_phylogenetic_tree_editing.py | 25 ++++++++++++---- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/moonstone/utils/phylogenetic_tree_editing.py b/moonstone/utils/phylogenetic_tree_editing.py index 59f816af..208cc072 100644 --- a/moonstone/utils/phylogenetic_tree_editing.py +++ b/moonstone/utils/phylogenetic_tree_editing.py @@ -40,8 +40,8 @@ def replacing_labels( def adapt_phylogenetic_tree_to_counts_df( new_otu_id_name_ser: pd.Series, - tree_file: str, - output_tree_file: str, + tree: str, + output_tree_file: str = None, quotechr: str = "'" ): """ @@ -49,16 +49,25 @@ def adapt_phylogenetic_tree_to_counts_df( Args: - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3) - - tree_file: path to the tree file to adapt. The format of the tree leaves labels should be + - tree: path to the tree file to adapt or tree as a string. The format of the tree leaves labels should be '{species name}, {txid}' or '{species name}, {txid}*' - - output_tree_file: path to the output adapted tree file + - output_tree_file: path to the output adapted tree file. 
+ If None, the function returns the adapted tree as a string - quotechr: quote character used as delimiter of labels in tree """ + try: + infile = open(tree, "r") + T = infile.read() + infile.close() + except FileNotFoundError: + T = tree + dic_translate_tree = generate_translation_dictionary(new_otu_id_name_ser) - infile = open(tree_file, "r") - T = infile.read() - infile.close() + T = replacing_labels(T, dic_translate_tree, quotechr) - outfile = open(output_tree_file, "w") - outfile.write(replacing_labels(T, dic_translate_tree, quotechr)) - outfile.close() + if output_tree_file: + outfile = open(output_tree_file, "w") + outfile.write(T) + outfile.close() + else: + return T diff --git a/tests/utils/test_phylogenetic_tree_editing.py b/tests/utils/test_phylogenetic_tree_editing.py index 39122303..bdede51e 100644 --- a/tests/utils/test_phylogenetic_tree_editing.py +++ b/tests/utils/test_phylogenetic_tree_editing.py @@ -4,13 +4,14 @@ from moonstone.utils.phylogenetic_tree_editing import ( generate_translation_dictionary, - replacing_labels + replacing_labels, + adapt_phylogenetic_tree_to_counts_df ) class TestPhylogeneticTreeAdaptation(TestCase): - def test_generate_translation_dictionary(self): - count_df = pd.DataFrame( + def setUp(self): + self.count_df = pd.DataFrame( [ ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillales (order)', 'Lactobacillales (order)', 'Lactobacillales (order)', 186826, 4.3], # noqa ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_jensenii', 109790, 1.0], # noqa @@ -21,12 +22,14 @@ def test_generate_translation_dictionary(self): 'NCBI_taxonomy_ID', 'SAMPLE_1' ] ) - count_df = count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']) + self.count_df = self.count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']) + + def test_generate_translation_dictionary(self): expected_dict = { '109790': 
'Lactobacillus_jensenii', '147802': 'Lactobacillus_iners', } - tested_dict = generate_translation_dictionary(count_df['NCBI_taxonomy_ID']) + tested_dict = generate_translation_dictionary(self.count_df['NCBI_taxonomy_ID']) self.assertDictEqual(tested_dict, expected_dict) def test_replacing_labels(self): @@ -50,3 +53,15 @@ def test_replacing_labels(self): ('Alloprevotella_Prevotella sp. oral taxon 473':0.5,\ 'Enterococcus lactis, 357441':0.05):1)root;\n" self.assertEqual(tested_string, expected_string) + + def test_adapt_phylogenetic_tree_to_counts_df(self): + tree_string = "((('Lactobacillus jensenii, 109790':0.35,\ +'Lactobacillus iners, 147802':0.15):0.75,\ +'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n" + tested_string = adapt_phylogenetic_tree_to_counts_df( + self.count_df['NCBI_taxonomy_ID'], tree_string + ) + expected_string = "((('Lactobacillus_jensenii':0.35,\ +'Lactobacillus_iners':0.15):0.75,\ +'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n" + self.assertEqual(tested_string, expected_string)