Merge pull request #99 from motleystate/minor_debugging

Update the Python version and some Python libraries; minor bug fixes and new features
motleystate · Oct 12, 2023 · 5946124 · 5946124
2 parents 56db2cf + c513aaa
commit 5946124
Show file tree

Hide file tree

Showing 24 changed files with 360 additions and 95 deletions.
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7]
+        python-version: [3.9]
 
     steps:
     - uses: actions/checkout@v2
@@ -19,9 +19,9 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip setuptools wheel
         pip install pytest pytest-cov
-        pip install numpy==1.18.1
+        pip install numpy==1.24.3
         pip install .
         pip install odfpy     # optional dependencies
         pip install openpyxl  # idem

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.8", "3.9"]
 
     steps:
     - uses: actions/checkout@v2
@@ -22,9 +22,9 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip setuptools wheel
         pip install flake8 pytest
-        pip install numpy==1.18.1
+        pip install numpy==1.24.3
         pip install .
         pip install odfpy     # optional dependencies
         pip install openpyxl  # idem

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -17,7 +17,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v2
       with:
-        python-version: '3.7'
+        python-version: '3.9'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip

diff --git a/moonstone/analysis/differential_analysis.py b/moonstone/analysis/differential_analysis.py
@@ -1,7 +1,6 @@
 import logging
 
 import pandas as pd
-import numpy as np
 import scipy.stats as st
 from statsmodels.stats.multitest import multipletests
 
@@ -53,8 +52,8 @@ def test_dichotomic_features(self, feature, test_to_use):
         cat1 = self.full_table[self.full_table[feature] == self.full_table[feature][0]]
         cat2 = self.full_table[self.full_table[feature] != self.full_table[feature][0]]
         for family in range(self.number_columns_to_skip, self.full_table.shape[1]):
-            test = self.tests_functions_used[test_to_use](cat1[self.full_table.columns[family]],
-                                                          cat2[self.full_table.columns[family]])
+            test = self.tests_functions_used[test_to_use](cat1[self.full_table.columns[family]].astype(float),
+                                                          cat2[self.full_table.columns[family]].astype(float))
             features.append(feature)
             taxons.append(self.full_table.columns[family])
             static_value.append(round(test[0], 6))
@@ -79,7 +78,8 @@ def test_multiple_features(self, feature, test_to_use):
             list_ofgroups = []
             for variable in variable_dic:
                 list_ofgroups.append(variable_dic[variable][self.full_table.columns[family]])
-            test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups))
+            #test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups))
+            test = self.tests_functions_used[test_to_use](*list_ofgroups)  # works for kruskal and one way anova
             features.append(feature)
             taxons.append(self.full_table.columns[family])
             static_values.append(round(test[0], 6))
@@ -114,5 +114,5 @@ def differential_analysis_by_feature(self, features, type_of_features, test_to_u
         for feature in features:
             test_result = getattr(self, f"test_{type_of_features}", self.test_default)(feature, test_to_use)
             test_result['corrected_p-value'] = self.corrected_p_values(test_result['p-value'], correction_method_used)
-            final_table = final_table.append(test_result)
+            final_table = pd.concat([final_table, test_result])
         return final_table
diff --git a/moonstone/analysis/diversity/base.py b/moonstone/analysis/diversity/base.py
@@ -194,8 +194,8 @@ def _run_statistical_test_groups(
 
             corrected_pval.index = pval.dropna().index   # postulate that the order hasn't changed
             if pval[pval.isnull()].size > 0:
-                corrected_pval = corrected_pval.append(pval[pval.isnull()])
-
+                # corrected_pval = corrected_pval.append(pval[pval.isnull()])
+                corrected_pval = pd.concat([corrected_pval, pval[pval.isnull()]])
             # remodelling of p-values output
             corrected_pval = self._structure_remodelling(corrected_pval, structure=structure_pval, sym=sym)
             return corrected_pval
@@ -240,18 +240,21 @@ def _compute_pval_inside_subgroups(
         self, diversity_index_dataframe: pd.DataFrame, group_col: str, final_group_col: str,
         stats_test: str, correction_method: str, structure_pval: str, sym: bool
     ):
-        pval = pd.Series([])
+        pval = pd.Series([], dtype='float64')
         for g in diversity_index_dataframe[group_col].dropna().unique():
             df_gp = diversity_index_dataframe[diversity_index_dataframe[group_col] == g]
             if df_gp.shape[0] < 2:
                 logger.warning(
                     f"Less than 2 samples in dataframe group {g} in data. P-val can't be computed."
                 )
             else:
-                pval = pval.append(self._run_statistical_test_groups(
-                    df_gp, final_group_col, stats_test,
-                    correction_method, structure_pval, sym
-                ))
+                pval = pd.concat([
+                    pval,
+                    self._run_statistical_test_groups(
+                        df_gp, final_group_col, stats_test,
+                        correction_method, structure_pval, sym
+                    )
+                ])
         pval.index = pd.MultiIndex.from_tuples(pval.index, names=('Group1', 'Group2'))
         return pval
 
@@ -317,12 +320,13 @@ def analyse_groups(
                     df, group_col, final_group_col, stats_test, correction_method, structure_pval, sym
                 )
                 if pval_to_compute == "same group_col or group_col2 values":
-                    pval = pval.append(
+                    pval = pd.concat([
+                        pval,
                         self._compute_pval_inside_subgroups(
                             df, group_col2, final_group_col,
                             stats_test, correction_method, structure_pval, sym
                         )
-                    )
+                    ])
 
         else:
             df = self._get_grouped_df(filtered_metadata_df[group_col])
@@ -359,7 +363,7 @@ def analyse_groups(
 
         # 'data' different from 'diversity indexes' in the fact that it has been filtered on metadata, meaning that
         # samples without metadata for group_col (or group_col2) have been dropped
-        return{**{'data': df}, **self.report_data['analyse_groups']}
+        return {**{'data': df}, **self.report_data['analyse_groups']}
 
     def generate_report_data(self) -> dict:
         """

diff --git a/moonstone/analysis/statistical_test.py b/moonstone/analysis/statistical_test.py
@@ -14,6 +14,15 @@
 def _preprocess_groups_comparison(
     series: pd.Series, group_series: pd.Series, stat_test: str
 ):
+    # If some samples are in group_series/metadata but not in series/count_dataframe,
+    # we need to remove them from the group_series/metadata
+    # to avoid an error like "None of [Index(['sample7'], dtype='object')] are in the [index]"
+    group_series_index_to_keep = group_series.index.intersection(series.index)
+    if len(group_series_index_to_keep) != len(group_series.index):
+        logger.info(
+            "Some index values in group_series aren't found in the series. Dropping those rows."
+        )
+        group_series = group_series.loc[group_series_index_to_keep]
     groups = list(group_series.unique())
     groups.sort()
 

diff --git a/moonstone/parsers/base.py b/moonstone/parsers/base.py
@@ -52,8 +52,10 @@ def _load_data(self) -> pd.DataFrame:
             "xlsb": "pyxlsb"                            # Binary Excel files
             }
         if ext in ext_engine.keys():
+            if self.header == "infer":
+                self.header = 0  # "infer" not accepted with read_excel anymore
             return pd.read_excel(
-                self.file_path, sep=self.sep, header=self.header, **self.parsing_options,
+                self.file_path, header=self.header, **self.parsing_options,
                 engine=ext_engine[ext]
             )
         return pd.read_csv(

diff --git a/moonstone/parsers/counts/taxonomy/metaphlan.py b/moonstone/parsers/counts/taxonomy/metaphlan.py
@@ -1,29 +1,48 @@
-from pandas import DataFrame
+import logging
+
+import pandas as pd
 
 from moonstone.parsers.counts.taxonomy.base import BaseTaxonomyCountsParser
 
+logger = logging.getLogger(__name__)
+
 
 class BaseMetaphlanParser(BaseTaxonomyCountsParser):
 
     def __init__(self, *args, analysis_type: str = 'rel_ab', **kwargs):
         """
         Args:
             analysis_type: output type of Metaphlan3 (see ``-t`` option of metaphlan3)
+              { 'rel_ab', 'rel_ab_w_read_stats', 'reads_map', 'clade_profiles', 'marker_ab_table', 'marker_counts',
+              'marker_pres_table', 'clade_specific_strain_tracker' }
         """
-        self.analysis_type = analysis_type
+        self.analysis_type = self._valid_analysis_type(analysis_type)
         super().__init__(*args, **kwargs)
 
-    def rows_differences(self, dataframe1, dataframe2) -> DataFrame:
+    def _valid_analysis_type(self, analysis_type):
+        choices = [
+            "rel_ab", "rel_ab_w_read_stats", "reads_map", "clade_profiles",
+            "marker_ab_table", "marker_counts", "marker_pres_table", "clade_specific_strain_tracker"
+        ]
+        if analysis_type not in choices:
+            logger.warning("analysis_type='%s' not valid, set to default ('rel_ab').", analysis_type)
+            analysis_type = "rel_ab"
+        return analysis_type
+
+    def rows_differences(self, dataframe1, dataframe2) -> pd.DataFrame:
         rows_diff = dataframe1 - dataframe2
         rows_diff[rows_diff.isnull()] = dataframe1
         if self.analysis_type == 'rel_ab':
             rows_diff[rows_diff < 0.0001] = 0
+            # if the difference between the sum of the organisms of rank r (e.g. sum of the species of genus X)
+            # and the value of rank r+1 (e.g. genus X) is that small,
+            # we assume that it is due to floating-point rounding errors in the addition
         else:
             rows_diff[rows_diff < 0] = 0
         rows_diff = rows_diff.loc[rows_diff.sum(axis=1)[rows_diff.sum(axis=1) != 0].index]
         return rows_diff
 
-    def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, rank) -> DataFrame:
+    def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, rank) -> pd.DataFrame:
         df_rank = whole_df[whole_df.index.map(lambda x: len(x.split('|'))) == rank]
 
         # transformation lower_level to rank (level)
@@ -32,7 +51,22 @@ def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, ran
         df_rank_computed = df_rank_computed.groupby(df_rank_computed.index).sum()             # grouping by rank (level)
         return self.rows_differences(df_rank, df_rank_computed)
 
-    def remove_duplicates(self, df) -> DataFrame:
+    def remove_duplicates(self, df) -> pd.DataFrame:
+        """
+        Metaphlan3 results are reported per level, therefore we need to remove the duplicated information.
+        Example:
+        We have:
+            ...|g_GenusA    50.0
+            ...|g_GenusA|s_Species1 30.0
+            ...|g_GenusB    50.0
+            ...|g_GenusB|s_Species2 50.0
+            Sum = 180.0 != 100.0 (even though it is relative abundance; same problem with the other analysis types)
+        We want:
+            ...|g_GenusA|s_GenusA (genus)   20.0    # unspecified species
+            ...|g_GenusA|s_Species1 30.0
+            ...|g_GenusB|s_Species2 50.0
+            Sum = 100.0
+        """
         df = df.set_index(self.taxa_column)
 
         # dataframe at rank level
@@ -56,8 +90,8 @@ def remove_duplicates(self, df) -> DataFrame:
             rank -= 1
             rows_diff = self.compare_difference_between_two_levels(df, new_df, rank)
             if rows_diff.size != 0:
-                new_df = new_df.append(rows_diff)              # add missing rows to the dataframe of the lower level
-
+                # new_df = new_df.append(rows_diff)              # add missing rows to the dataframe of the lower level
+                new_df = pd.concat([new_df, rows_diff])        # add missing rows to the dataframe of the lower level
             # verification that everything is defined up to the lower_level
             samples_with_incomp_lowerlevel = new_df.sum()[new_df.sum() < total]
 
@@ -72,7 +106,7 @@ class Metaphlan2Parser(BaseMetaphlanParser):
 
     taxa_column = 'ID'
 
-    def _load_data(self) -> DataFrame:
+    def _load_data(self) -> pd.DataFrame:
         df = super()._load_data()
         df = self.remove_duplicates(df)
         df = self.split_taxa_fill_none(df, sep="|")
@@ -88,17 +122,37 @@ class Metaphlan3Parser(BaseMetaphlanParser):
     taxa_column = 'clade_name'
     NCBI_tax_column = 'NCBI_tax_id'
 
-    def __init__(self, *args, analysis_type: str = 'rel_ab', **kwargs):
+    def __init__(self, *args, analysis_type: str = 'rel_ab', keep_NCBI_tax_col: bool = False, **kwargs):
         """
         Args:
             analysis_type: output type of Metaphlan3 (see ``-t`` option of metaphlan3)
+              { 'rel_ab', 'rel_ab_w_read_stats', 'reads_map', 'clade_profiles', 'marker_ab_table', 'marker_counts',
+              'marker_pres_table', 'clade_specific_strain_tracker' }
+            keep_NCBI_tax_col: set to True if you want the NCBI tax column in the returned dataframe.
         """
+        self.keep_NCBI_tax_col = keep_NCBI_tax_col
         super().__init__(*args, analysis_type=analysis_type, parsing_options={'skiprows': 1}, **kwargs)
 
-    def _load_data(self) -> DataFrame:
+    def _load_data(self) -> pd.DataFrame:
         df = super()._load_data()
-        df = df.drop(self.NCBI_tax_column, axis=1)
+
+        # if the number of taxonomical_names is lower than the default,
+        if len(self.taxonomical_names) < len(BaseTaxonomyCountsParser.taxonomical_names):
+            # we need to restrict the rows considered to only the rows that describe a taxonomical level inside the
+            # wanted range.
+            # Otherwise, the following error will be raised:
+            # "ValueError: Error : expecting a integer inferior or equal to the number of taxonomical_names."
+            df = df[df["NCBI_tax_id"].map(lambda x: len(x.split("|"))) <= len(self.taxonomical_names)]
+        if self.keep_NCBI_tax_col:
+            tmp = df[[self.NCBI_tax_column, self.taxa_column]]
+
+        df = df.drop(self.NCBI_tax_column, axis=1)  # NCBI_tax_column needs to be dropped because sum
         df = self.remove_duplicates(df)
+
+        if self.keep_NCBI_tax_col:
+            tmp[self.NCBI_tax_column] = tmp[self.NCBI_tax_column].map(lambda x: x.split("|")[-1])
+            df = df.merge(tmp)
+
         df = self.split_taxa_fill_none(df, sep="|")
         df = df.set_index(self.taxonomical_names[:self.rank_level])
         return df
diff --git a/moonstone/plot/counts.py b/moonstone/plot/counts.py
@@ -456,7 +456,8 @@ def _plot_most_what_taxa_boxplot_or_violin(
             tmp = relab_df_taxa[i].reset_index()
             tmp.index = nb * [i]
             tmp.columns = ["species", "relative abundance"]
-            relab_df_taxa2 = relab_df_taxa2.append(tmp)
+            # relab_df_taxa2 = relab_df_taxa2.append(tmp)
+            relab_df_taxa2 = pd.concat([relab_df_taxa2, tmp])
         relab_df_taxa2.species = relab_df_taxa2.species.apply(self._italicize_taxa_name)
         groups = [self._italicize_taxa_name(name) for name in groups]
 
@@ -747,7 +748,8 @@ def plot_sample_composition_most_abundant_taxa(
         # Make graph
         graph = MatrixBarGraph(data_df)
         # Plotting options
-        title = f"{taxa_level.capitalize()} composition for the top {taxa_number} most abundant species across samples"
+        title = f"{taxa_level.capitalize()} composition for the top {taxa_number} most abundant {taxa_level} across \
+samples"
         if prevalence_threshold is not None:
             title += f" (present in at least {prevalence_threshold}% of samples)"
 

diff --git a/moonstone/plot/graphs/base.py b/moonstone/plot/graphs/base.py
@@ -236,7 +236,7 @@ def plot_one_graph(
             if groups:
                 filtered_df = self.data[self.data[group_col].isin(groups)]
                 filtered_df[group_col] = filtered_df[group_col].astype("category")
-                filtered_df[group_col].cat.set_categories(groups, inplace=True)
+                filtered_df[group_col].cat = filtered_df[group_col].cat.set_categories(groups)
                 filtered_df = filtered_df.sort_values([group_col])
             else:
                 filtered_df = copy.deepcopy(self.data)
@@ -247,7 +247,7 @@ def plot_one_graph(
                     fig,
                     filtered_df2[group_col],
                     filtered_df2[data_col],
-                    names[group],
+                    str(names[group]),
                     filtered_df.index,
                     self._get_group_color(group, colors),
                     orientation,
@@ -265,7 +265,7 @@ def plot_one_graph(
                     fig,
                     filtered_df[group_col],
                     filtered_df[data_col],
-                    names[group],
+                    str(names[group]),
                     filtered_df.index,
                     self._get_group_color(group, colors),
                     orientation,

diff --git a/moonstone/utils/df_merge.py b/moonstone/utils/df_merge.py
@@ -1,6 +1,5 @@
 import logging
 import pandas as pd
-import numpy as np
 
 logger = logging.getLogger(__name__)
 
@@ -26,10 +25,15 @@ def merge(self):
         logger.info('Merge function called to merge count data and metadata.')
         logger.info(f'Variable {self.variable} from metadata file will be merged with counts.')
 
-        if not isinstance(self.dc.index, type(self.dm.index)):
-            logger.warning(f'Index types do not match: {type(self.dc.index)} and {type(self.dm.index)}.')
-            self.dc.set_index(np.int64(np.array(self.dc.index)), inplace=True)
-            logger.info(f' Indexes reset. Count Index={type(self.dc.index)}, Metadata Index={type(self.dm.index)}')
+        # if not isinstance(self.dc.index, type(self.dm.index)):
+        if self.dc.index.dtype != self.dm.index.dtype:
+            # logger.warning(f'Index types do not match: {type(self.dc.index)} and {type(self.dm.index)}.')
+            # self.dc = self.dc.set_index(np.int64(np.array(self.dc.index)))
+            # logger.info(f' Indexes reset. Count Index={type(self.dc.index)}, Metadata Index={type(self.dm.index)}')
+            logger.warning(f'Index types do not match: {self.dc.index.dtype} and {self.dm.index.dtype}.')
+            self.dc.index = self.dc.index.astype(str)
+            self.dm.index = self.dm.index.astype(str)
+            logger.warning('Both Count and Metadata Indexes set as string')
 
         df = pd.merge(self.dm[self.variable], self.dc, left_index=True, right_index=True)
         logger.info('Merge function completed. Returning merged data frame.')