From 63276d46da79e8d0e9b770baf9cb4b2b986bd67d Mon Sep 17 00:00:00 2001 From: Agnes BAUD Date: Thu, 2 Feb 2023 18:11:16 +0100 Subject: [PATCH] tree as string can be given and returned --- moonstone/utils/phylogenetic_tree_editing.py | 29 ++++++++++++------- tests/utils/test_phylogenetic_tree_editing.py | 25 ++++++++++++---- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/moonstone/utils/phylogenetic_tree_editing.py b/moonstone/utils/phylogenetic_tree_editing.py index 59f816af..208cc072 100644 --- a/moonstone/utils/phylogenetic_tree_editing.py +++ b/moonstone/utils/phylogenetic_tree_editing.py @@ -40,8 +40,8 @@ def replacing_labels( def adapt_phylogenetic_tree_to_counts_df( new_otu_id_name_ser: pd.Series, - tree_file: str, - output_tree_file: str, + tree: str, + output_tree_file: str = None, quotechr: str = "'" ): """ @@ -49,16 +49,25 @@ def adapt_phylogenetic_tree_to_counts_df( Args: - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3) - - tree_file: path to the tree file to adapt. The format of the tree leaves labels should be + - tree: path to the tree file to adapt or tree as a string. The format of the tree leaves labels should be '{species name}, {txid}' or '{species name}, {txid}*' - - output_tree_file: path to the output adapted tree file + - output_tree_file: path to the output adapted tree file. 
+ If None, the function returns the adapted tree as a string - quotechr: quote character used as delimiter of labels in tree """ + try: + infile = open(tree, "r") + T = infile.read() + infile.close() + except FileNotFoundError: + T = tree + dic_translate_tree = generate_translation_dictionary(new_otu_id_name_ser) - infile = open(tree_file, "r") - T = infile.read() - infile.close() + T = replacing_labels(T, dic_translate_tree, quotechr) - outfile = open(output_tree_file, "w") - outfile.write(replacing_labels(T, dic_translate_tree, quotechr)) - outfile.close() + if output_tree_file: + outfile = open(output_tree_file, "w") + outfile.write(T) + outfile.close() + else: + return T diff --git a/tests/utils/test_phylogenetic_tree_editing.py b/tests/utils/test_phylogenetic_tree_editing.py index 39122303..bdede51e 100644 --- a/tests/utils/test_phylogenetic_tree_editing.py +++ b/tests/utils/test_phylogenetic_tree_editing.py @@ -4,13 +4,14 @@ from moonstone.utils.phylogenetic_tree_editing import ( generate_translation_dictionary, - replacing_labels + replacing_labels, + adapt_phylogenetic_tree_to_counts_df ) class TestPhylogeneticTreeAdaptation(TestCase): - def test_generate_translation_dictionary(self): - count_df = pd.DataFrame( + def setUp(self): + self.count_df = pd.DataFrame( [ ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillales (order)', 'Lactobacillales (order)', 'Lactobacillales (order)', 186826, 4.3], # noqa ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_jensenii', 109790, 1.0], # noqa @@ -21,12 +22,14 @@ def test_generate_translation_dictionary(self): 'NCBI_taxonomy_ID', 'SAMPLE_1' ] ) - count_df = count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']) + self.count_df = self.count_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']) + + def test_generate_translation_dictionary(self): expected_dict = { '109790': 
'Lactobacillus_jensenii', '147802': 'Lactobacillus_iners', } - tested_dict = generate_translation_dictionary(count_df['NCBI_taxonomy_ID']) + tested_dict = generate_translation_dictionary(self.count_df['NCBI_taxonomy_ID']) self.assertDictEqual(tested_dict, expected_dict) def test_replacing_labels(self): @@ -50,3 +53,15 @@ def test_replacing_labels(self): ('Alloprevotella_Prevotella sp. oral taxon 473':0.5,\ 'Enterococcus lactis, 357441':0.05):1)root;\n" self.assertEqual(tested_string, expected_string) + + def test_adapt_phylogenetic_tree_to_counts_df(self): + tree_string = "((('Lactobacillus jensenii, 109790':0.35,\ +'Lactobacillus iners, 147802':0.15):0.75,\ +'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n" + tested_string = adapt_phylogenetic_tree_to_counts_df( + self.count_df['NCBI_taxonomy_ID'], tree_string + ) + expected_string = "((('Lactobacillus_jensenii':0.35,\ +'Lactobacillus_iners':0.15):0.75,\ +'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n" + self.assertEqual(tested_string, expected_string)