-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #98 from motleystate/97-adapt-phylogenetic-tree
Adapt Phylogenetic Tree to count dataframe
- Loading branch information
Showing
4 changed files
with
145 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
"""Adapt Phylogenetic Tree to counts dataframe""" | ||
|
||
import pandas as pd | ||
import re | ||
|
||
|
||
def generate_translation_dictionary(
    new_otu_id_name_ser: pd.Series
) -> dict:
    """
    Build the {txid: species name} dictionary used to translate the labels of a tree.

    Rows whose species label contains "(" (for instance 'Lactobacillales (order)') are skipped.

    Args:
        - new_otu_id_name_ser: pd.Series issued from kraken 2 count dataframe with only new_otu_id_name column.
          Its index must contain a level named "species". The series is left untouched.

    Returns:
        dict whose keys are the values of the series converted to str and whose values are
        the matching labels of the "species" index level.
    """
    level = "species"   # only works with species for now

    # Read the level without assigning it back, so the caller's series keeps its own index.
    species_names = new_otu_id_name_ser.index.get_level_values(level)
    is_species = ~species_names.str.contains("(", regex=False)

    txids = new_otu_id_name_ser[is_species].astype(str)
    # Pair values and labels positionally; this does not rely on the series having a name.
    return dict(zip(txids.tolist(), species_names[is_species].tolist()))
|
||
|
||
def replacement(
    matchobj,
    dic_translate_tree: dict,
    quotechr: str
) -> str:
    """
    Return the text that should replace one matched tree label.

    Args:
        - matchobj: regex match object whose whole match is a quoted label ending with ', {txid}' or ', {txid}*'
        - dic_translate_tree: dictionary {txid (str): new label}
        - quotechr: quote character used as delimiter of labels in tree

    Returns:
        the new label surrounded by quotechr if the txid is a key of dic_translate_tree,
        otherwise the matched label unchanged.
    """
    label = matchobj.group(0)
    # The txid is what follows the last ", "; drop the optional "*" and the closing quote.
    txid = label.split(", ")[-1].rstrip("*"+quotechr)
    if txid in dic_translate_tree:
        return quotechr+dic_translate_tree[txid]+quotechr
    return label


def replacing_labels(
    tree_string: str,
    dic_translate_tree: dict,
    quotechr: str = "'"
) -> str:
    """
    Replace every label of the tree whose txid is known by its translation.

    Args:
        - tree_string: tree as a string, with labels formatted as
          '{species name}, {txid}' or '{species name}, {txid}*'
        - dic_translate_tree: dictionary {txid (str): new label}
        - quotechr: quote character used as delimiter of labels in tree

    Returns:
        the tree string with the translated labels; labels with an unknown txid are kept as is.
    """
    # Build the pattern from quotechr so that delimiters other than "'" are matched too.
    quote = re.escape(quotechr)
    label_pattern = quote + r"[^,]*, [0-9]*\*?" + quote
    return re.sub(
        label_pattern,
        lambda match: replacement(match, dic_translate_tree, quotechr),
        tree_string
    )
|
||
|
||
def adapt_phylogenetic_tree_to_counts_df(
    new_otu_id_name_ser: pd.Series,
    tree: str,
    output_tree_file: str = None,
    quotechr: str = "'"
):
    """
    Translate phylogenetic tree labels to names present in a counts dataframe using the txid as key

    Args:
        - new_otu_id_name_ser: pd.Series issued from count dataframe with only new_otu_id_name column
          ('NCBI_taxonomy_ID' for Kraken2, 'NCBI_tax_id' for Metaphlan3)
        - tree: path to the tree file to adapt or tree as a string. The format of the tree leaves labels should be
          '{species name}, {txid}' or '{species name}, {txid}*'
        - output_tree_file: path to the output adapted tree file.
          If None, then the function returns the adapted tree as a string
        - quotechr: quote character used as delimiter of labels in tree

    Returns:
        the adapted tree as a string if no output_tree_file is given, None otherwise
        (the adapted tree is then written to output_tree_file).
    """
    # Imported here so that this function does not depend on the module-level imports.
    import os

    # A tree given as a string is usually not a valid path: opening it may raise more than
    # FileNotFoundError (e.g. "File name too long"). os.path.isfile returns False in those cases.
    if os.path.isfile(tree):
        with open(tree, "r") as infile:
            tree_string = infile.read()
    else:
        tree_string = tree

    dic_translate_tree = generate_translation_dictionary(new_otu_id_name_ser)
    tree_string = replacing_labels(tree_string, dic_translate_tree, quotechr)

    if not output_tree_file:
        return tree_string

    with open(output_tree_file, "w") as outfile:
        outfile.write(tree_string)
    return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
import pandas as pd | ||
|
||
from unittest import TestCase | ||
|
||
from moonstone.utils.phylogenetic_tree_editing import ( | ||
generate_translation_dictionary, | ||
replacing_labels, | ||
adapt_phylogenetic_tree_to_counts_df | ||
) | ||
|
||
|
||
class TestPhylogeneticTreeAdaptation(TestCase):
    """Tests of the functions translating the labels of a phylogenetic tree from a counts dataframe."""

    def setUp(self):
        # Counts dataframe indexed by taxonomy ranks. The first row has a species label
        # containing "(", the two others are plain species names.
        taxonomy_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
        self.count_df = pd.DataFrame(
            [
                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillales (order)', 'Lactobacillales (order)', 'Lactobacillales (order)', 186826, 4.3],  # noqa
                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_jensenii', 109790, 1.0],  # noqa
                ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners', 147802, 3.5]  # noqa
            ],
            columns=taxonomy_ranks + ['NCBI_taxonomy_ID', 'SAMPLE_1']
        )
        self.count_df = self.count_df.set_index(taxonomy_ranks)

    def test_generate_translation_dictionary(self):
        # The 'Lactobacillales (order)' row is expected to be absent from the dictionary.
        expected_dict = {
            '109790': 'Lactobacillus_jensenii',
            '147802': 'Lactobacillus_iners',
        }
        tested_dict = generate_translation_dictionary(self.count_df['NCBI_taxonomy_ID'])
        self.assertDictEqual(tested_dict, expected_dict)

    def test_replacing_labels(self):
        # Covers a plain txid, a txid followed by "*", and a txid missing from the dictionary.
        tree_string = (
            "((('Lactobacillus jensenii, 109790':0.35,"
            "'Lactobacillus iners, 147802':0.15):0.75,"
            "'Lactobacillus ruminis CAG:367, 1263085*':1):0.5,"
            "('Prevotella sp. oral taxon 473, 712469':0.5,"
            "'Enterococcus lactis, 357441':0.05):1)root;\n"
        )
        tested_string = replacing_labels(
            tree_string,
            {
                '109790': 'Lactobacillus_jensenii',
                '147802': 'Lactobacillus_iners',
                '712469': 'Alloprevotella_Prevotella sp. oral taxon 473',
                '1263085': 'Lactobacillus_ruminis CAG:367'
            }
        )
        expected_string = (
            "((('Lactobacillus_jensenii':0.35,"
            "'Lactobacillus_iners':0.15):0.75,"
            "'Lactobacillus_ruminis CAG:367':1):0.5,"
            "('Alloprevotella_Prevotella sp. oral taxon 473':0.5,"
            "'Enterococcus lactis, 357441':0.05):1)root;\n"
        )
        self.assertEqual(tested_string, expected_string)

    def test_adapt_phylogenetic_tree_to_counts_df(self):
        # Parentheses are balanced so that the fixture is a well-formed tree.
        tree_string = (
            "(('Lactobacillus jensenii, 109790':0.35,"
            "'Lactobacillus iners, 147802':0.15):0.75,"
            "'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n"
        )
        tested_string = adapt_phylogenetic_tree_to_counts_df(
            self.count_df['NCBI_taxonomy_ID'], tree_string
        )
        # txid 1263085 is not in the counts dataframe, so that label is expected unchanged.
        expected_string = (
            "(('Lactobacillus_jensenii':0.35,"
            "'Lactobacillus_iners':0.15):0.75,"
            "'Lactobacillus ruminis CAG:367, 1263085*':1)root;\n"
        )
        self.assertEqual(tested_string, expected_string)