diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 68b83ed..1c3212b 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [3.7]
+        python-version: ["3.9"]
steps:
- uses: actions/checkout@v2
@@ -19,9 +19,9 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
- python -m pip install --upgrade pip
+ python -m pip install --upgrade pip setuptools wheel
pip install pytest pytest-cov
- pip install numpy==1.18.1
+ pip install numpy==1.24.3
pip install .
pip install odfpy # optional dependencies
pip install openpyxl # idem
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 4e52bcb..3a5546a 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.7", "3.8"]
+ python-version: ["3.8", "3.9"]
steps:
- uses: actions/checkout@v2
@@ -22,9 +22,9 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
- python -m pip install --upgrade pip
+ python -m pip install --upgrade pip setuptools wheel
pip install flake8 pytest
- pip install numpy==1.18.1
+ pip install numpy==1.24.3
pip install .
pip install odfpy # optional dependencies
pip install openpyxl # idem
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index d4ba985..92ea8e9 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
- python-version: '3.7'
+ python-version: '3.9'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
diff --git a/moonstone/analysis/differential_analysis.py b/moonstone/analysis/differential_analysis.py
index c78eb35..9387200 100644
--- a/moonstone/analysis/differential_analysis.py
+++ b/moonstone/analysis/differential_analysis.py
@@ -1,7 +1,6 @@
import logging
import pandas as pd
-import numpy as np
import scipy.stats as st
from statsmodels.stats.multitest import multipletests
@@ -53,8 +52,8 @@ def test_dichotomic_features(self, feature, test_to_use):
cat1 = self.full_table[self.full_table[feature] == self.full_table[feature][0]]
cat2 = self.full_table[self.full_table[feature] != self.full_table[feature][0]]
for family in range(self.number_columns_to_skip, self.full_table.shape[1]):
- test = self.tests_functions_used[test_to_use](cat1[self.full_table.columns[family]],
- cat2[self.full_table.columns[family]])
+ test = self.tests_functions_used[test_to_use](cat1[self.full_table.columns[family]].astype(float),
+ cat2[self.full_table.columns[family]].astype(float))
features.append(feature)
taxons.append(self.full_table.columns[family])
static_value.append(round(test[0], 6))
@@ -79,7 +78,8 @@ def test_multiple_features(self, feature, test_to_use):
list_ofgroups = []
for variable in variable_dic:
list_ofgroups.append(variable_dic[variable][self.full_table.columns[family]])
- test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups))
+            # Pass each group as a separate positional argument instead of unpacking np.asarray(list_ofgroups).
+ test = self.tests_functions_used[test_to_use](*list_ofgroups) # works for kruskal and one way anova
features.append(feature)
taxons.append(self.full_table.columns[family])
static_values.append(round(test[0], 6))
@@ -114,5 +114,5 @@ def differential_analysis_by_feature(self, features, type_of_features, test_to_u
for feature in features:
test_result = getattr(self, f"test_{type_of_features}", self.test_default)(feature, test_to_use)
test_result['corrected_p-value'] = self.corrected_p_values(test_result['p-value'], correction_method_used)
- final_table = final_table.append(test_result)
+ final_table = pd.concat([final_table, test_result])
return final_table
diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
index 62f191a..97bfa1c 100644
--- a/moonstone/analysis/diversity/base.py
+++ b/moonstone/analysis/diversity/base.py
@@ -194,8 +194,8 @@ def _run_statistical_test_groups(
corrected_pval.index = pval.dropna().index # postulate that the order hasn't changed
if pval[pval.isnull()].size > 0:
- corrected_pval = corrected_pval.append(pval[pval.isnull()])
-
+ # corrected_pval = corrected_pval.append(pval[pval.isnull()])
+ corrected_pval = pd.concat([corrected_pval, pval[pval.isnull()]])
# remodelling of p-values output
corrected_pval = self._structure_remodelling(corrected_pval, structure=structure_pval, sym=sym)
return corrected_pval
@@ -240,7 +240,7 @@ def _compute_pval_inside_subgroups(
self, diversity_index_dataframe: pd.DataFrame, group_col: str, final_group_col: str,
stats_test: str, correction_method: str, structure_pval: str, sym: bool
):
- pval = pd.Series([])
+ pval = pd.Series([], dtype='float64')
for g in diversity_index_dataframe[group_col].dropna().unique():
df_gp = diversity_index_dataframe[diversity_index_dataframe[group_col] == g]
if df_gp.shape[0] < 2:
@@ -248,10 +248,13 @@ def _compute_pval_inside_subgroups(
f"Less than 2 samples in dataframe group {g} in data. P-val can't be computed."
)
else:
- pval = pval.append(self._run_statistical_test_groups(
- df_gp, final_group_col, stats_test,
- correction_method, structure_pval, sym
- ))
+ pval = pd.concat([
+ pval,
+ self._run_statistical_test_groups(
+ df_gp, final_group_col, stats_test,
+ correction_method, structure_pval, sym
+ )
+ ])
pval.index = pd.MultiIndex.from_tuples(pval.index, names=('Group1', 'Group2'))
return pval
@@ -317,12 +320,13 @@ def analyse_groups(
df, group_col, final_group_col, stats_test, correction_method, structure_pval, sym
)
if pval_to_compute == "same group_col or group_col2 values":
- pval = pval.append(
+ pval = pd.concat([
+ pval,
self._compute_pval_inside_subgroups(
df, group_col2, final_group_col,
stats_test, correction_method, structure_pval, sym
)
- )
+ ])
else:
df = self._get_grouped_df(filtered_metadata_df[group_col])
@@ -359,7 +363,7 @@ def analyse_groups(
# 'data' different from 'diversity indexes' in the fact that it has been filtered on metadata, meaning that
# samples without metadata for group_col (or group_col2) have been dropped
- return{**{'data': df}, **self.report_data['analyse_groups']}
+ return {**{'data': df}, **self.report_data['analyse_groups']}
def generate_report_data(self) -> dict:
"""
diff --git a/moonstone/analysis/statistical_test.py b/moonstone/analysis/statistical_test.py
index 994b436..c1b2ffb 100644
--- a/moonstone/analysis/statistical_test.py
+++ b/moonstone/analysis/statistical_test.py
@@ -14,6 +14,15 @@
def _preprocess_groups_comparison(
series: pd.Series, group_series: pd.Series, stat_test: str
):
+ # If samples in group_series/metadata but not in series/count_dataframe
+ # then we need to remove them from the group_series/metadata
+ # to not get an error like "None of [Index(['sample7'], dtype='object')] are in the [index]"
+ group_series_index_to_keep = group_series.index.intersection(series.index)
+ if len(group_series_index_to_keep) != len(group_series.index):
+ logger.info(
+ "Some index values in group_series aren't found in the series. Dropping those rows."
+ )
+ group_series = group_series.loc[group_series_index_to_keep]
groups = list(group_series.unique())
groups.sort()
diff --git a/moonstone/parsers/base.py b/moonstone/parsers/base.py
index d6487c8..323d111 100644
--- a/moonstone/parsers/base.py
+++ b/moonstone/parsers/base.py
@@ -52,8 +52,10 @@ def _load_data(self) -> pd.DataFrame:
"xlsb": "pyxlsb" # Binary Excel files
}
if ext in ext_engine.keys():
+ if self.header == "infer":
+ self.header = 0 # "infer" not accepted with read_excel anymore
return pd.read_excel(
- self.file_path, sep=self.sep, header=self.header, **self.parsing_options,
+ self.file_path, header=self.header, **self.parsing_options,
engine=ext_engine[ext]
)
return pd.read_csv(
diff --git a/moonstone/parsers/counts/taxonomy/metaphlan.py b/moonstone/parsers/counts/taxonomy/metaphlan.py
index 60039da..da13136 100644
--- a/moonstone/parsers/counts/taxonomy/metaphlan.py
+++ b/moonstone/parsers/counts/taxonomy/metaphlan.py
@@ -1,7 +1,11 @@
-from pandas import DataFrame
+import logging
+
+import pandas as pd
from moonstone.parsers.counts.taxonomy.base import BaseTaxonomyCountsParser
+logger = logging.getLogger(__name__)
+
class BaseMetaphlanParser(BaseTaxonomyCountsParser):
@@ -9,21 +13,36 @@ def __init__(self, *args, analysis_type: str = 'rel_ab', **kwargs):
"""
Args:
analysis_type: output type of Metaphlan3 (see ``-t`` option of metaphlan3)
+ { 'rel_ab', 'rel_ab_w_read_stats', 'reads_map', 'clade_profiles', 'marker_ab_table', 'marker_counts',
+ 'marker_pres_table', 'clade_specific_strain_tracker' }
"""
- self.analysis_type = analysis_type
+ self.analysis_type = self._valid_analysis_type(analysis_type)
super().__init__(*args, **kwargs)
- def rows_differences(self, dataframe1, dataframe2) -> DataFrame:
+    def _valid_analysis_type(self, analysis_type):
+        """Return ``analysis_type`` if supported; otherwise log a warning and return the default 'rel_ab'."""
+        supported = (
+            "rel_ab", "rel_ab_w_read_stats", "reads_map", "clade_profiles",
+            "marker_ab_table", "marker_counts", "marker_pres_table", "clade_specific_strain_tracker")
+        if analysis_type in supported:
+            return analysis_type
+        logger.warning("analysis_type='%s' not valid, set to default ('rel_ab').", analysis_type)
+        return "rel_ab"
+
+ def rows_differences(self, dataframe1, dataframe2) -> pd.DataFrame:
rows_diff = dataframe1 - dataframe2
rows_diff[rows_diff.isnull()] = dataframe1
if self.analysis_type == 'rel_ab':
rows_diff[rows_diff < 0.0001] = 0
+            # A tiny residual between the value of a rank (ex: genus X) and the sum of the
+            # organisms below it (ex: all species of genus X) is assumed to come from
+            # floating-point rounding in the additions, so values under 0.0001 are zeroed.
else:
rows_diff[rows_diff < 0] = 0
rows_diff = rows_diff.loc[rows_diff.sum(axis=1)[rows_diff.sum(axis=1) != 0].index]
return rows_diff
- def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, rank) -> DataFrame:
+ def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, rank) -> pd.DataFrame:
df_rank = whole_df[whole_df.index.map(lambda x: len(x.split('|'))) == rank]
# transformation lower_level to rank (level)
@@ -32,7 +51,22 @@ def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, ran
df_rank_computed = df_rank_computed.groupby(df_rank_computed.index).sum() # grouping by rank (level)
return self.rows_differences(df_rank, df_rank_computed)
- def remove_duplicates(self, df) -> DataFrame:
+ def remove_duplicates(self, df) -> pd.DataFrame:
+ """
+        Metaphlan3 results are reported level by level, so the duplicated information needs to be removed.
+ Example:
+ We have:
+ ...|g_GenusA 50.0
+ ...|g_GenusA|s_Species1 30.0
+ ...|g_GenusB 50.0
+ ...|g_GenusB|s_Species2 50.0
+ Sum = 180.0 =/= 100.0 (while it's relative abundance -> but same problem with other analysis type)
+ We want:
+ ...|g_GenusA|s_GenusA (genus) 20.0 # unspecified species
+ ...|g_GenusA|s_Species1 30.0
+ ...|g_GenusB|s_Species2 50.0
+ Sum = 100.0
+ """
df = df.set_index(self.taxa_column)
# dataframe at rank level
@@ -56,8 +90,8 @@ def remove_duplicates(self, df) -> DataFrame:
rank -= 1
rows_diff = self.compare_difference_between_two_levels(df, new_df, rank)
if rows_diff.size != 0:
- new_df = new_df.append(rows_diff) # add missing rows to the dataframe of the lower level
-
+ # new_df = new_df.append(rows_diff) # add missing rows to the dataframe of the lower level
+ new_df = pd.concat([new_df, rows_diff]) # add missing rows to the dataframe of the lower level
# verification that everything is defined up to the lower_level
samples_with_incomp_lowerlevel = new_df.sum()[new_df.sum() < total]
@@ -72,7 +106,7 @@ class Metaphlan2Parser(BaseMetaphlanParser):
taxa_column = 'ID'
- def _load_data(self) -> DataFrame:
+ def _load_data(self) -> pd.DataFrame:
df = super()._load_data()
df = self.remove_duplicates(df)
df = self.split_taxa_fill_none(df, sep="|")
@@ -88,17 +122,37 @@ class Metaphlan3Parser(BaseMetaphlanParser):
taxa_column = 'clade_name'
NCBI_tax_column = 'NCBI_tax_id'
- def __init__(self, *args, analysis_type: str = 'rel_ab', **kwargs):
+ def __init__(self, *args, analysis_type: str = 'rel_ab', keep_NCBI_tax_col: bool = False, **kwargs):
"""
Args:
analysis_type: output type of Metaphlan3 (see ``-t`` option of metaphlan3)
+ { 'rel_ab', 'rel_ab_w_read_stats', 'reads_map', 'clade_profiles', 'marker_ab_table', 'marker_counts',
+ 'marker_pres_table', 'clade_specific_strain_tracker' }
+ keep_NCBI_tax_col: set to True if you want the NCBI tax column in the returned dataframe.
"""
+ self.keep_NCBI_tax_col = keep_NCBI_tax_col
super().__init__(*args, analysis_type=analysis_type, parsing_options={'skiprows': 1}, **kwargs)
- def _load_data(self) -> DataFrame:
+    def _load_data(self) -> pd.DataFrame:
         df = super()._load_data()
-        df = df.drop(self.NCBI_tax_column, axis=1)
+
+        # When fewer taxonomical_names than the default are requested,
+        if len(self.taxonomical_names) < len(BaseTaxonomyCountsParser.taxonomical_names):
+            # keep only the rows whose lineage is no deeper than the number of names wanted.
+            # Otherwise this error is raised:
+            # "ValueError: Error : expecting a integer inferior or equal to the number of taxonomical_names."
+            # Depth is measured as the number of "|"-separated ids in the NCBI tax column.
+            df = df[df[self.NCBI_tax_column].map(lambda x: len(x.split("|"))) <= len(self.taxonomical_names)]
+        if self.keep_NCBI_tax_col:
+            tmp = df[[self.NCBI_tax_column, self.taxa_column]].copy()  # own copy: modified below
+
+        df = df.drop(self.NCBI_tax_column, axis=1)  # dropped first: remove_duplicates sums/subtracts the columns
         df = self.remove_duplicates(df)
+
+        if self.keep_NCBI_tax_col:
+            tmp[self.NCBI_tax_column] = tmp[self.NCBI_tax_column].map(lambda x: x.split("|")[-1])
+            df = df.merge(tmp)
+
         df = self.split_taxa_fill_none(df, sep="|")
         df = df.set_index(self.taxonomical_names[:self.rank_level])
         return df
diff --git a/moonstone/plot/counts.py b/moonstone/plot/counts.py
index ac0d67d..c31b364 100644
--- a/moonstone/plot/counts.py
+++ b/moonstone/plot/counts.py
@@ -456,7 +456,8 @@ def _plot_most_what_taxa_boxplot_or_violin(
tmp = relab_df_taxa[i].reset_index()
tmp.index = nb * [i]
tmp.columns = ["species", "relative abundance"]
- relab_df_taxa2 = relab_df_taxa2.append(tmp)
+ # relab_df_taxa2 = relab_df_taxa2.append(tmp)
+ relab_df_taxa2 = pd.concat([relab_df_taxa2, tmp])
relab_df_taxa2.species = relab_df_taxa2.species.apply(self._italicize_taxa_name)
groups = [self._italicize_taxa_name(name) for name in groups]
@@ -747,7 +748,8 @@ def plot_sample_composition_most_abundant_taxa(
# Make graph
graph = MatrixBarGraph(data_df)
# Plotting options
- title = f"{taxa_level.capitalize()} composition for the top {taxa_number} most abundant species across samples"
+        title = (f"{taxa_level.capitalize()} composition for the top {taxa_number} most abundant {taxa_level} "
+                 "across samples")
if prevalence_threshold is not None:
title += f" (present in at least {prevalence_threshold}% of samples)"
diff --git a/moonstone/plot/graphs/base.py b/moonstone/plot/graphs/base.py
index 511a9f1..5183fdc 100644
--- a/moonstone/plot/graphs/base.py
+++ b/moonstone/plot/graphs/base.py
@@ -236,7 +236,7 @@ def plot_one_graph(
if groups:
filtered_df = self.data[self.data[group_col].isin(groups)]
filtered_df[group_col] = filtered_df[group_col].astype("category")
- filtered_df[group_col].cat.set_categories(groups, inplace=True)
+            filtered_df[group_col] = filtered_df[group_col].cat.set_categories(groups)  # returns a new Series
filtered_df = filtered_df.sort_values([group_col])
else:
filtered_df = copy.deepcopy(self.data)
@@ -247,7 +247,7 @@ def plot_one_graph(
fig,
filtered_df2[group_col],
filtered_df2[data_col],
- names[group],
+ str(names[group]),
filtered_df.index,
self._get_group_color(group, colors),
orientation,
@@ -265,7 +265,7 @@ def plot_one_graph(
fig,
filtered_df[group_col],
filtered_df[data_col],
- names[group],
+ str(names[group]),
filtered_df.index,
self._get_group_color(group, colors),
orientation,
diff --git a/moonstone/utils/df_merge.py b/moonstone/utils/df_merge.py
index dae1055..400e9c4 100644
--- a/moonstone/utils/df_merge.py
+++ b/moonstone/utils/df_merge.py
@@ -1,6 +1,5 @@
import logging
import pandas as pd
-import numpy as np
logger = logging.getLogger(__name__)
@@ -26,10 +25,15 @@ def merge(self):
logger.info('Merge function called to merge count data and metadata.')
logger.info(f'Variable {self.variable} from metadata file will be merged with counts.')
- if not isinstance(self.dc.index, type(self.dm.index)):
- logger.warning(f'Index types do not match: {type(self.dc.index)} and {type(self.dm.index)}.')
- self.dc.set_index(np.int64(np.array(self.dc.index)), inplace=True)
- logger.info(f' Indexes reset. Count Index={type(self.dc.index)}, Metadata Index={type(self.dm.index)}')
+ # if not isinstance(self.dc.index, type(self.dm.index)):
+ if self.dc.index.dtype != self.dm.index.dtype:
+ # logger.warning(f'Index types do not match: {type(self.dc.index)} and {type(self.dm.index)}.')
+ # self.dc = self.dc.set_index(np.int64(np.array(self.dc.index)))
+ # logger.info(f' Indexes reset. Count Index={type(self.dc.index)}, Metadata Index={type(self.dm.index)}')
+ logger.warning(f'Index types do not match: {self.dc.index.dtype} and {self.dm.index.dtype}.')
+ self.dc.index = self.dc.index.astype(str)
+ self.dm.index = self.dm.index.astype(str)
+ logger.warning('Both Count and Metadata Indexes set as string')
df = pd.merge(self.dm[self.variable], self.dc, left_index=True, right_index=True)
logger.info('Merge function completed. Returning merged data frame.')
diff --git a/moonstone/utils/df_reindex.py b/moonstone/utils/df_reindex.py
index dcdad70..225e530 100644
--- a/moonstone/utils/df_reindex.py
+++ b/moonstone/utils/df_reindex.py
@@ -22,12 +22,19 @@ def __init__(self, dataframe: Union[pd.Series, pd.DataFrame],
self.taxonomy_df = taxonomy_dataframe
self.taxa_column = taxa_column
- def reindex_with_taxonomy(self, method: str = 'sum'):
+    def _sum_at_lowest_level(self, df):
+        level_names = self.taxonomical_names[:self._rank_level]
+        df.index = df.index.to_flat_index()  # one tuple per row, so rows with the same taxonomy share a key
+        summed = df.groupby(level=0).sum()
+        return summed.set_axis(pd.MultiIndex.from_tuples(summed.index, names=level_names))
+
+ def reindex_with_taxonomy(self, method: str = 'sum', na: str = 'drop'):
"""
reindexation on taxonomic information (if there are).
:param method: how to combine genes' information of genes that have the same taxonomy.
Choose 'sum' to sum the counts or 'count' to only have the number of genes with this taxonomy
+ :param na: {'drop' (default), 'keep', 'sum'} what to do with the genes with missing taxonomical information.
NB: You can access the list of items without taxonomic information by checking the .without_info_index
attributes
@@ -48,16 +55,21 @@ def reindex_with_taxonomy(self, method: str = 'sum'):
self.without_info_index = new_df['_merge'].loc[new_df['_merge'] == 'left_only'].index
new_df = new_df.drop(['_merge'], axis=1)
- new_df[self.taxa_column] = new_df[self.taxa_column].fillna(value='k__; p__; c__; o__; f__; g__; s__')
+ if na == 'drop':
+ new_df = new_df.dropna(subset=[self.taxa_column])
+ elif na == 'keep':
+ new_df[self.taxa_column] = new_df[self.taxa_column].fillna(
+ 'k__; p__; c__; o__; f__; g__; s__'+new_df.index.to_series()+'_species'
+ )
+ else: # na == 'sum'
+ new_df[self.taxa_column] = new_df[self.taxa_column].fillna(value='k__; p__; c__; o__; f__; g__; s__')
new_df = self.split_taxa_fill_none(new_df, sep="; ", merge_genus_species=True)
new_df = new_df.set_index(self.taxonomical_names[:self._rank_level])
if method == 'sum':
- nb_levels = len(self.taxonomical_names[:self._rank_level])
- new_df = new_df.sum(level=list(range(nb_levels)))
+ new_df = self._sum_at_lowest_level(new_df)
elif method == 'count':
new_df[:] = np.where(new_df > 0, 1, 0) # presence/absence -> is > 0 then presence (1) else absence (0)
- nb_levels = len(self.taxonomical_names[:self._rank_level])
- new_df = new_df.sum(level=list(range(nb_levels)))
+ new_df = self._sum_at_lowest_level(new_df)
return new_df
@property
diff --git a/moonstone/utils/taxonomy.py b/moonstone/utils/taxonomy.py
index 338862d..c24e66c 100644
--- a/moonstone/utils/taxonomy.py
+++ b/moonstone/utils/taxonomy.py
@@ -85,7 +85,7 @@ def remove_taxo_prefix(string):
taxa_columns.columns = self.taxonomical_names[:self.rank_level]
taxa_columns = taxa_columns.applymap(lambda x: remove_taxo_prefix(x))
if terms_to_remove is not None:
- taxa_columns = taxa_columns.replace(terms_to_remove, np.nan)
+ taxa_columns = taxa_columns.replace(terms_to_remove, None)
if merge_genus_species:
taxa_columns = self._merge_genus_species(taxa_columns)
taxa_columns = self._fill_none(taxa_columns)
diff --git a/requirements.txt b/requirements.txt
index 0d75f1a..9071174 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,7 @@ decorator==4.4.2
# via
# ipython
# scikit-bio
-hdmedians==0.13
+hdmedians==0.14.2
# via
# moonstone (setup.py)
# scikit-bio
@@ -48,7 +48,7 @@ msgpack==1.0.0
# via cachecontrol
natsort==7.0.1
# via scikit-bio
-numpy==1.18.1
+numpy==1.24.3
# via
# hdmedians
# matplotlib
@@ -59,7 +59,7 @@ numpy==1.18.1
# scikit-learn
# scipy
# statsmodels
-pandas==1.0.1
+pandas==2.0.2
# via
# moonstone (setup.py)
# scikit-bio
@@ -74,7 +74,7 @@ pickleshare==0.7.5
# via ipython
pillow==7.2.0
# via matplotlib
-plotly==5.6.0
+plotly==5.17.0
# via moonstone (setup.py)
prompt-toolkit==3.0.7
# via ipython
@@ -100,13 +100,13 @@ requests==2.24.0
# via cachecontrol
retrying==1.3.3
# via plotly
-scikit-bio==0.5.6
+scikit-bio==0.5.9
# via moonstone (setup.py)
-scikit-learn==0.21.3
+scikit-learn==1.3.1
# via
# moonstone (setup.py)
# scikit-bio
-scipy==1.5.2
+scipy==1.9.0
# via
# scikit-bio
# scikit-learn
@@ -118,7 +118,7 @@ six==1.15.0
# plotly
# python-dateutil
# retrying
-statsmodels==0.11.1
+statsmodels==0.13.0
# via moonstone (setup.py)
text-unidecode==1.3
# via python-slugify
diff --git a/setup.py b/setup.py
index 2b5c8c2..13ffa8b 100644
--- a/setup.py
+++ b/setup.py
@@ -9,18 +9,18 @@
author='Kenzo-Hugo Hillion, Agnès Baud, Mariela Furstenheim, Sean Kennedy',
author_email='kehillio@pasteur.fr',
install_requires=[
- 'pandas==1.0.1',
+ 'pandas==2.0.2',
'matplotlib==3.3.0',
- 'plotly==5.6.0',
- 'statsmodels==0.11.1',
+ 'plotly==5.17.0',
+ 'statsmodels==0.13.0',
'python-slugify==4.0.1',
'pyaml==20.4.0',
- 'numpy==1.18.1',
- 'scikit-bio==0.5.6',
- 'scikit-learn==0.21.3',
- 'hdmedians==0.13',
+ 'numpy==1.24.3',
+ 'scikit-bio==0.5.9',
+ 'scikit-learn==1.3.1',
+ 'hdmedians==0.14.2',
'cython==0.29.21',
- 'scipy==1.5.2'
+ 'scipy==1.9.0'
],
packages=find_packages(),
entry_points={'console_scripts': ['moonstone=moonstone.main:run']},
diff --git a/tests/analysis/diversity/test_beta.py b/tests/analysis/diversity/test_beta.py
index 89c25a0..9fa67b2 100644
--- a/tests/analysis/diversity/test_beta.py
+++ b/tests/analysis/diversity/test_beta.py
@@ -35,7 +35,8 @@ def test_compute_beta_diversity_df(self):
)
pd.testing.assert_frame_equal(
tested_object_instance.beta_diversity_df, expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
+ rtol=0.01
+ # check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
)
def test_compute_beta_diversity_series(self):
@@ -54,11 +55,11 @@ def test_compute_beta_diversity_series(self):
# Two ways of retrieving the series
pd.testing.assert_series_equal(
tested_object_instance.beta_diversity_series, expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
+ rtol=0.01
)
pd.testing.assert_series_equal(
tested_object_instance.diversity_indexes, expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
+ rtol=0.01
)
def test_run_statistical_test_groups_with_NaN(self):
@@ -82,7 +83,7 @@ def test_run_statistical_test_groups_with_NaN(self):
'samples14': [9.35, 'A'],
'samples15': [7.89, 'A'],
'samples16': [4.65, 'C'],
- 'samples17': [8.90, 'D'],
+            'samples17': [8.90, 'D'],  # group D has only 1 sample, fewer than the 5 required for the independent t-test
'samples18': [2.33, 'C'],
'samples19': [1.34, 'B'],
'samples20': [6.87, 'C']
@@ -109,7 +110,6 @@ def test_run_statistical_test_groups_with_NaN(self):
pd.testing.assert_series_equal(
pval, expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
)
def test_get_grouped_df_series(self):
@@ -131,7 +131,6 @@ def test_get_grouped_df_series(self):
output = tested_object_instance._get_grouped_df_series(metadata_ser)
pd.testing.assert_frame_equal(
output, expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
)
def test_get_grouped_df_dataframe(self):
@@ -172,7 +171,6 @@ def test_get_grouped_df_dataframe(self):
output = tested_object_instance._get_grouped_df_dataframe(metadata_df)
pd.testing.assert_frame_equal(
output, expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
)
def test_analyse_grouped_df(self):
@@ -195,7 +193,6 @@ def test_analyse_grouped_df(self):
output = tested_object_instance.analyse_groups(metadata_df, 'sex', show=False, show_pval=False)
pd.testing.assert_frame_equal(
output['data'], expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
)
def test_analyse_grouped_df_with_group_col2(self):
@@ -238,7 +235,6 @@ def test_analyse_grouped_df_with_group_col2(self):
)
pd.testing.assert_frame_equal(
output["data"], expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
)
@@ -269,7 +265,6 @@ def test_compute_beta_diversity(self):
)
pd.testing.assert_frame_equal(
tested_object_instance.beta_diversity_df, expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
)
def test_compute_beta_diversity_force_computation(self):
@@ -325,7 +320,6 @@ def test_compute_beta_diversity(self):
)
pd.testing.assert_frame_equal(
tested_object_instance.beta_diversity_df, expected_object,
- check_less_precise=2, # Deprecated since version 1.1.0, to be changed when updating pandas
)
def test_compute_beta_diversity_force_computation(self):
diff --git a/tests/parsers/counts/taxonomy/kraken2/test_kraken2.py b/tests/parsers/counts/taxonomy/kraken2/test_kraken2.py
index acf1acb..52c8aeb 100644
--- a/tests/parsers/counts/taxonomy/kraken2/test_kraken2.py
+++ b/tests/parsers/counts/taxonomy/kraken2/test_kraken2.py
@@ -88,4 +88,4 @@ def test_to_dataframe_ods(self):
]
)
expected_df = expected_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
- pd.testing.assert_frame_equal(sunbeamkraken2parser.dataframe, expected_df)
\ No newline at end of file
+ pd.testing.assert_frame_equal(sunbeamkraken2parser.dataframe, expected_df)
diff --git a/tests/parsers/counts/taxonomy/metaphlan3/input.tsv b/tests/parsers/counts/taxonomy/metaphlan3/input.tsv
index d1ffdab..e7995ad 100644
--- a/tests/parsers/counts/taxonomy/metaphlan3/input.tsv
+++ b/tests/parsers/counts/taxonomy/metaphlan3/input.tsv
@@ -8,8 +8,8 @@ k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomyce
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinobaculum 2|201174|1760|2037|2049|76833 1.0 2.0
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinobaculum|s__Actinobaculum_massiliense 2|201174|1760|2037|2049|1654|461393 1.0 2.0
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales 2|1239|91061|186826 9.5 10.3
-k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae 2|1239|91061|186826|33958|1578 3.2 8.0
-k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus 2|1239|91061|186826|33958|1578|1632 3.2 8.0
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae 2|1239|91061|186826|33958 3.2 8.0
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus 2|1239|91061|186826|33958|1578 3.2 8.0
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae 2|1239|91061|186826|1300 6.3 2.3
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus 2|1239|91061|186826|1300|1301 6.3 2.3
k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus|s__Streptococcus_thermophilus 2|1239|91061|186826|1300|1301|1308 1.7 0.7
diff --git a/tests/parsers/counts/taxonomy/metaphlan3/test_metaphlan3.py b/tests/parsers/counts/taxonomy/metaphlan3/test_metaphlan3.py
index 6dd53dc..029e320 100644
--- a/tests/parsers/counts/taxonomy/metaphlan3/test_metaphlan3.py
+++ b/tests/parsers/counts/taxonomy/metaphlan3/test_metaphlan3.py
@@ -9,8 +9,8 @@
class TestMetaphlan2Parser(TestCase):
def setUp(self):
- input_path = os.path.join(os.path.dirname(__file__), 'input.tsv')
- self.meta2parser = Metaphlan3Parser(input_path, analysis_type='marker_counts')
+ self.input_path = os.path.join(os.path.dirname(__file__), 'input.tsv')
+ self.meta2parser = Metaphlan3Parser(self.input_path, analysis_type='marker_counts')
def test_to_dataframe(self):
"""
@@ -33,3 +33,50 @@ def test_to_dataframe(self):
)
expected_df = expected_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
pd.testing.assert_frame_equal(self.meta2parser.dataframe, expected_df, check_like=True)
+
+ def test_to_dataframe_keep_NCBI_tax_col(self):
+ """
+ Test based on input.tsv file
+ """
+ meta2parser = Metaphlan3Parser(self.input_path, analysis_type='rel_ab', keep_NCBI_tax_col=True)
+ expected_df = pd.DataFrame(
+ [
+ ['Bacteria', 'Actinobacteria', 'Actinobacteria', 'Actinomycetales', 'Actinomycetaceae', 'Actinobaculum',
+ 'Actinobaculum_massiliense', 1.0, 2.0, '461393'],
+ ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus',
+ 'Lactobacillus (genus)', 3.2, 8.0, '1578'],
+ ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Streptococcaceae', 'Streptococcus',
+ 'Streptococcus (genus)', 1.3, 0.4, '1301'],
+ ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Streptococcaceae', 'Streptococcus',
+ 'Streptococcus_thermophilus', 1.7, 0.7, '1308'],
+ ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Streptococcaceae', 'Streptococcus',
+ 'Streptococcus_salivarius', 3.3, 1.2, '1304']
+ ],
+ columns=['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'SAMPLE_1', 'SAMPLE_2',
+ 'NCBI_tax_id']
+ )
+ expected_df = expected_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
+ observed_df = meta2parser.dataframe
+ pd.testing.assert_frame_equal(observed_df, expected_df, check_like=True)
+
+ def test_to_dataframe_less_taxonomical_names(self):
+ """
+ Test based on input.tsv file
+ """
+ meta2parser = Metaphlan3Parser(self.input_path, analysis_type='rel_ab', keep_NCBI_tax_col=True)
+ meta2parser.taxonomical_names = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus']
+ expected_df = pd.DataFrame(
+ [
+ ['Bacteria', 'Actinobacteria', 'Actinobacteria', 'Actinomycetales', 'Actinomycetaceae', 'Actinobaculum',
+ 1.0, 2.0, '76833'],
+ ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Lactobacillaceae', 'Lactobacillus',
+ 3.2, 8.0, '1578'],
+ ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales', 'Streptococcaceae', 'Streptococcus',
+ 6.3, 2.3, '1301'],
+ ],
+ columns=['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'SAMPLE_1', 'SAMPLE_2',
+ 'NCBI_tax_id']
+ )
+ expected_df = expected_df.set_index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus'])
+ observed_df = meta2parser.dataframe
+ pd.testing.assert_frame_equal(observed_df, expected_df, check_like=True)
diff --git a/tests/parsers/counts/taxonomy/test_base_metaphlan.py b/tests/parsers/counts/taxonomy/test_base_metaphlan.py
index 3062afd..0d485ca 100644
--- a/tests/parsers/counts/taxonomy/test_base_metaphlan.py
+++ b/tests/parsers/counts/taxonomy/test_base_metaphlan.py
@@ -266,3 +266,11 @@ def test_remove_duplicates_rel_ab_addition_error_margin(self):
observed_df = self.base_metaphlan_parser.remove_duplicates(tested_df)
pd.testing.assert_frame_equal(observed_df, expected_df, check_like=True)
+
+ def test_valid_analysis_type(self):
+ # An analysis_type the parser does not recognise is expected to produce exactly one
+ # log record of level WARNING or above on this logger, and to leave analysis_type
+ # set to 'rel_ab'.
+ with self.assertLogs('moonstone.parsers.counts.taxonomy.metaphlan', level='WARNING') as log:
+ tested_object_instance = BaseMetaphlanParser("file", analysis_type="INVALID ANALYSIS TYPE")
+ self.assertEqual(len(log.output), 1)
+ # The continuation line of the expected message starts at column 0 so that no
+ # indentation ends up inside the string literal.
+ self.assertIn("WARNING:moonstone.parsers.counts.taxonomy.metaphlan:analysis_type='INVALID ANALYSIS TYPE' \
+not valid, set to default ('rel_ab').", log.output)
+ self.assertEqual(tested_object_instance.analysis_type, 'rel_ab')
diff --git a/tests/plot/test_counts.py b/tests/plot/test_counts.py
index 0f51a9c..7d92984 100644
--- a/tests/plot/test_counts.py
+++ b/tests/plot/test_counts.py
@@ -741,7 +741,7 @@ def test_plot_most_prevalent_taxa_modebargraph_plotting_options(self):
expected_x = [75.0, 100.0]
expected_y = [
- "Streptococcus salivarius",
+ "Lactobacillus (genus)", # with Streptococcus_salivarius and Streptococcus (genus) all at 75%
"Streptococcus thermophilus",
]
diff --git a/tests/utils/pandas/test_series.py b/tests/utils/pandas/test_series.py
index 924c8ba..1051a14 100644
--- a/tests/utils/pandas/test_series.py
+++ b/tests/utils/pandas/test_series.py
@@ -74,7 +74,7 @@ def test_build_stats_float(self):
class TestSeriesBinning(TestCase):
- def test_compute_homoogeneous_bins(self):
+ def test_compute_homogeneous_bins(self):
tested_object = pd.Series(
{
'gene_1': 10.5,
@@ -108,6 +108,7 @@ def test_compute_binned_data(self):
expected_object = pd.Series(
[1, 2], index=[']0, 5]', ']5, 10]']
)
+ expected_object.name = "count"
tested_object_instance = SeriesBinning(series)
tested_object_instance.bins_values = [0, 5, 10]
tested_object = tested_object_instance.compute_binned_data()
diff --git a/tests/utils/test_df_merge.py b/tests/utils/test_df_merge.py
index 88d0213..4a741c4 100644
--- a/tests/utils/test_df_merge.py
+++ b/tests/utils/test_df_merge.py
@@ -6,8 +6,8 @@
class TestMergeDF(TestCase):
- def test_merge(self):
- d1 = pd.DataFrame(
+ def setUp(self):
+ self.d1 = pd.DataFrame(
[
[23, 7, 44, 0, 101],
[15, 4, 76, 3, 107],
@@ -15,9 +15,10 @@ def test_merge(self):
[31, 4, 50, 0, 99]
],
columns=['item_1', 'item_2', 'item_3', 'item_4', 'item_5'],
- index=['1', '2', '3', '4'] # index dtype='object'
+ index=[1, 2, 3, 4] # index dtype='int64'
)
+ def test_merge(self):
d2 = pd.DataFrame(
[
['M', 'Yes', 23, 'June', 170],
@@ -40,5 +41,37 @@ def test_merge(self):
index=[1, 2, 3, 4] # index dtype='int64'
)
- merged_df = MergeDF(d1, d2, 'sex').merged_df
+ merged_df = MergeDF(self.d1, d2, 'sex').merged_df
+ pd.testing.assert_frame_equal(merged_df, df_expected)
+
+ def test_merge_index_dont_match(self):
+ # self.d1 (built in setUp) is indexed by integers while d2 below is indexed by
+ # strings; the merge is expected to warn about the dtype mismatch and still
+ # produce df_expected, which carries the string index.
+ d2 = pd.DataFrame(
+ [
+ ['M', 'Yes', 23, 'June', 170],
+ ['F', 'Yes', 33, 'Nov', 154],
+ ['F', 'Yes', 29, 'Jan', 161],
+ ['F', 'No', 27, 'Jan', 152]
+ ],
+ columns=['sex', 'pets', 'age', 'sample_month', 'height'],
+ index=['1', '2', '3', '4'] # index dtype='object'
+ )
+
+ df_expected = pd.DataFrame(
+ [
+ ['M', 23, 7, 44, 0, 101],
+ ['F', 15, 4, 76, 3, 107],
+ ['F', 20, 0, 22, 0, 101],
+ ['F', 31, 4, 50, 0, 99]
+ ],
+ columns=['sex', 'item_1', 'item_2', 'item_3', 'item_4', 'item_5'],
+ index=['1', '2', '3', '4'] # index dtype='object'
+ )
+
+ # Two log records are expected on this logger; only the index-type warning is
+ # checked by content.
+ with self.assertLogs('moonstone.utils.df_merge', level='WARNING') as log:
+ merged_df = MergeDF(self.d1, d2, 'sex').merged_df
+ self.assertEqual(len(log.output), 2)
+ self.assertIn(
+ "WARNING:moonstone.utils.df_merge:Index types do not match: int64 and object.",
+ log.output
+ )
pd.testing.assert_frame_equal(merged_df, df_expected)
diff --git a/tests/utils/test_df_reindex.py b/tests/utils/test_df_reindex.py
index cc92c6a..54cdab4 100644
--- a/tests/utils/test_df_reindex.py
+++ b/tests/utils/test_df_reindex.py
@@ -1,5 +1,6 @@
from unittest import TestCase
+import numpy as np
import pandas as pd
from moonstone.utils.df_reindex import GenesToTaxonomy
@@ -33,17 +34,18 @@ def test_reindex_with_taxonomy(self):
'sample_1':
{
('Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
- 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners'): 23,
+ 'Enterococcaceae', 'Enterococcus', 'Enterococcus_faecium'): 15,
('Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
- 'Enterococcaceae', 'Enterococcus', 'Enterococcus_faecium'): 15
+ 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners'): 23
},
'sample_2':
{
('Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
- 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners'): 7,
+ 'Enterococcaceae', 'Enterococcus', 'Enterococcus_faecium'): 4,
('Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
- 'Enterococcaceae', 'Enterococcus', 'Enterococcus_faecium'): 4}
+ 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners'): 7
}
+ }
)
df_expected.index.set_names(["kingdom", "phylum", "class", "order", "family", "genus", "species"], inplace=True)
@@ -51,8 +53,7 @@ def test_reindex_with_taxonomy(self):
reindexed_df = reindexation_instance.reindexed_df
pd.testing.assert_frame_equal(reindexed_df, df_expected)
- def test_reindex_with_taxonomy_missing_infos(self):
- # for now, if there aren't any taxonomic information, the gene is dropped
+ def test_reindex_with_taxonomy_missing_infos_dropped(self):
df = pd.DataFrame(
[
[23, 7],
@@ -94,6 +95,100 @@ def test_reindex_with_taxonomy_missing_infos(self):
pd.testing.assert_frame_equal(reindexed_df, df_expected)
pd.testing.assert_index_equal(reindexation_instance.without_info_index, pd.Index(['gene_2'], dtype='object'))
+ def test_reindex_with_taxonomy_missing_infos_kept(self):
+ # gene_2 and gene_4 have no row in df_taxo. With na='keep' they are expected to stay
+ # in the result, indexed with NaN from kingdom to genus and '<gene>_species' as species.
+ df = pd.DataFrame(
+ [
+ [23, 7],
+ [15, 4],
+ [0, 36],
+ ],
+ columns=['sample_1', 'sample_2'],
+ index=['gene_1', 'gene_2', 'gene_4'] # index dtype='object'
+ )
+ # gene_3 is described here but absent from df; no row for it appears in df_expected.
+ df_taxo = pd.DataFrame(
+ [
+ [147802,
+ 'k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; \
+f__Lactobacillaceae; g__Lactobacillus; s__iners'],
+ [1352,
+ 'k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; \
+f__Enterococcaceae; g__Enterococcus; s__faecium']
+ ],
+ columns=['tax_id', 'full_tax'],
+ index=['gene_1', 'gene_3'] # index dtype='object'
+ )
+ df_expected = pd.DataFrame.from_dict(
+ {
+ 'sample_1':
+ {
+ ('Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
+ 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners'): 23,
+ (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 'gene_2_species'): 15,
+ (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 'gene_4_species'): 0,
+ },
+ 'sample_2':
+ {
+ ('Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
+ 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners'): 7,
+ (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 'gene_2_species'): 4,
+ (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 'gene_4_species'): 36,
+ }
+ }
+ )
+ df_expected.index.set_names(["kingdom", "phylum", "class", "order", "family", "genus", "species"], inplace=True)
+
+ reindexation_instance = GenesToTaxonomy(df, df_taxo)
+ reindexed_df = reindexation_instance.reindex_with_taxonomy(na='keep')
+ pd.testing.assert_frame_equal(reindexed_df, df_expected)
+ # both genes without taxonomy are expected to be listed in without_info_index
+ pd.testing.assert_index_equal(
+ reindexation_instance.without_info_index,
+ pd.Index(['gene_2', 'gene_4'], dtype='object')
+ )
+
+ def test_reindex_with_taxonomy_missing_infos_summed(self):
+ # gene_2 and gene_4 have no row in df_taxo. With na='sum' they are expected to be
+ # collapsed into a single all-NaN index entry holding their per-sample sums
+ # (sample_1: 15 + 0, sample_2: 4 + 36).
+ df = pd.DataFrame(
+ [
+ [23, 7],
+ [15, 4],
+ [0, 36],
+ ],
+ columns=['sample_1', 'sample_2'],
+ index=['gene_1', 'gene_2', 'gene_4'] # index dtype='object'
+ )
+ # gene_3 is described here but absent from df; no row for it appears in df_expected.
+ df_taxo = pd.DataFrame(
+ [
+ [147802,
+ 'k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; \
+f__Lactobacillaceae; g__Lactobacillus; s__iners'],
+ [1352,
+ 'k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; \
+f__Enterococcaceae; g__Enterococcus; s__faecium']
+ ],
+ columns=['tax_id', 'full_tax'],
+ index=['gene_1', 'gene_3'] # index dtype='object'
+ )
+ df_expected = pd.DataFrame.from_dict(
+ {
+ 'sample_1':
+ {
+ ('Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
+ 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners'): 23,
+ (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan): 15,
+ },
+ 'sample_2':
+ {
+ ('Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
+ 'Lactobacillaceae', 'Lactobacillus', 'Lactobacillus_iners'): 7,
+ (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan): 40,
+ }
+ }
+ )
+ df_expected.index.set_names(["kingdom", "phylum", "class", "order", "family", "genus", "species"], inplace=True)
+
+ reindexation_instance = GenesToTaxonomy(df, df_taxo)
+ reindexed_df = reindexation_instance.reindex_with_taxonomy(na='sum')
+ pd.testing.assert_frame_equal(reindexed_df, df_expected)
+
def test_reindex_with_taxonomy_summing(self):
df = pd.DataFrame(
[