Skip to content

Commit

Permalink
Merge pull request #99 from motleystate/minor_debugging
Browse files Browse the repository at this point in the history
Updating version of python and some python libraries + minor error debugging, new features
  • Loading branch information
AgnesBaud authored Oct 12, 2023
2 parents 56db2cf + c513aaa commit 5946124
Show file tree
Hide file tree
Showing 24 changed files with 360 additions and 95 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7]
python-version: [3.9]

steps:
- uses: actions/checkout@v2
Expand All @@ -19,9 +19,9 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade pip setuptools wheel
pip install pytest pytest-cov
pip install numpy==1.18.1
pip install numpy==1.24.3
pip install .
pip install odfpy # optional dependencies
pip install openpyxl # idem
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.7", "3.8"]
python-version: ["3.8", "3.9"]

steps:
- uses: actions/checkout@v2
Expand All @@ -22,9 +22,9 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade pip setuptools wheel
pip install flake8 pytest
pip install numpy==1.18.1
pip install numpy==1.24.3
pip install .
pip install odfpy # optional dependencies
pip install openpyxl # idem
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.7'
python-version: '3.9'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
10 changes: 5 additions & 5 deletions moonstone/analysis/differential_analysis.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging

import pandas as pd
import numpy as np
import scipy.stats as st
from statsmodels.stats.multitest import multipletests

Expand Down Expand Up @@ -53,8 +52,8 @@ def test_dichotomic_features(self, feature, test_to_use):
cat1 = self.full_table[self.full_table[feature] == self.full_table[feature][0]]
cat2 = self.full_table[self.full_table[feature] != self.full_table[feature][0]]
for family in range(self.number_columns_to_skip, self.full_table.shape[1]):
test = self.tests_functions_used[test_to_use](cat1[self.full_table.columns[family]],
cat2[self.full_table.columns[family]])
test = self.tests_functions_used[test_to_use](cat1[self.full_table.columns[family]].astype(float),
cat2[self.full_table.columns[family]].astype(float))
features.append(feature)
taxons.append(self.full_table.columns[family])
static_value.append(round(test[0], 6))
Expand All @@ -79,7 +78,8 @@ def test_multiple_features(self, feature, test_to_use):
list_ofgroups = []
for variable in variable_dic:
list_ofgroups.append(variable_dic[variable][self.full_table.columns[family]])
test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups))
#test = self.tests_functions_used[test_to_use](*np.asarray(list_ofgroups))
test = self.tests_functions_used[test_to_use](*list_ofgroups) # works for kruskal and one way anova
features.append(feature)
taxons.append(self.full_table.columns[family])
static_values.append(round(test[0], 6))
Expand Down Expand Up @@ -114,5 +114,5 @@ def differential_analysis_by_feature(self, features, type_of_features, test_to_u
for feature in features:
test_result = getattr(self, f"test_{type_of_features}", self.test_default)(feature, test_to_use)
test_result['corrected_p-value'] = self.corrected_p_values(test_result['p-value'], correction_method_used)
final_table = final_table.append(test_result)
final_table = pd.concat([final_table, test_result])
return final_table
24 changes: 14 additions & 10 deletions moonstone/analysis/diversity/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,8 @@ def _run_statistical_test_groups(

corrected_pval.index = pval.dropna().index # postulate that the order hasn't changed
if pval[pval.isnull()].size > 0:
corrected_pval = corrected_pval.append(pval[pval.isnull()])

# corrected_pval = corrected_pval.append(pval[pval.isnull()])
corrected_pval = pd.concat([corrected_pval, pval[pval.isnull()]])
# remodelling of p-values output
corrected_pval = self._structure_remodelling(corrected_pval, structure=structure_pval, sym=sym)
return corrected_pval
Expand Down Expand Up @@ -240,18 +240,21 @@ def _compute_pval_inside_subgroups(
self, diversity_index_dataframe: pd.DataFrame, group_col: str, final_group_col: str,
stats_test: str, correction_method: str, structure_pval: str, sym: bool
):
pval = pd.Series([])
pval = pd.Series([], dtype='float64')
for g in diversity_index_dataframe[group_col].dropna().unique():
df_gp = diversity_index_dataframe[diversity_index_dataframe[group_col] == g]
if df_gp.shape[0] < 2:
logger.warning(
f"Less than 2 samples in dataframe group {g} in data. P-val can't be computed."
)
else:
pval = pval.append(self._run_statistical_test_groups(
df_gp, final_group_col, stats_test,
correction_method, structure_pval, sym
))
pval = pd.concat([
pval,
self._run_statistical_test_groups(
df_gp, final_group_col, stats_test,
correction_method, structure_pval, sym
)
])
pval.index = pd.MultiIndex.from_tuples(pval.index, names=('Group1', 'Group2'))
return pval

Expand Down Expand Up @@ -317,12 +320,13 @@ def analyse_groups(
df, group_col, final_group_col, stats_test, correction_method, structure_pval, sym
)
if pval_to_compute == "same group_col or group_col2 values":
pval = pval.append(
pval = pd.concat([
pval,
self._compute_pval_inside_subgroups(
df, group_col2, final_group_col,
stats_test, correction_method, structure_pval, sym
)
)
])

else:
df = self._get_grouped_df(filtered_metadata_df[group_col])
Expand Down Expand Up @@ -359,7 +363,7 @@ def analyse_groups(

# 'data' differs from 'diversity indexes' in that it has been filtered on metadata, meaning that
# samples without metadata for group_col (or group_col2) have been dropped
return{**{'data': df}, **self.report_data['analyse_groups']}
return {**{'data': df}, **self.report_data['analyse_groups']}

def generate_report_data(self) -> dict:
"""
Expand Down
9 changes: 9 additions & 0 deletions moonstone/analysis/statistical_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@
def _preprocess_groups_comparison(
series: pd.Series, group_series: pd.Series, stat_test: str
):
# If samples in group_series/metadata but not in series/count_dataframe
# then we need to remove them from the group_series/metadata
# to not get an error like "None of [Index(['sample7'], dtype='object')] are in the [index]"
group_series_index_to_keep = group_series.index.intersection(series.index)
if len(group_series_index_to_keep) != len(group_series.index):
logger.info(
"Some index values in group_series aren't found in the series. Dropping those rows."
)
group_series = group_series.loc[group_series_index_to_keep]
groups = list(group_series.unique())
groups.sort()

Expand Down
4 changes: 3 additions & 1 deletion moonstone/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@ def _load_data(self) -> pd.DataFrame:
"xlsb": "pyxlsb" # Binary Excel files
}
if ext in ext_engine.keys():
if self.header == "infer":
self.header = 0 # "infer" not accepted with read_excel anymore
return pd.read_excel(
self.file_path, sep=self.sep, header=self.header, **self.parsing_options,
self.file_path, header=self.header, **self.parsing_options,
engine=ext_engine[ext]
)
return pd.read_csv(
Expand Down
76 changes: 65 additions & 11 deletions moonstone/parsers/counts/taxonomy/metaphlan.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,48 @@
from pandas import DataFrame
import logging

import pandas as pd

from moonstone.parsers.counts.taxonomy.base import BaseTaxonomyCountsParser

logger = logging.getLogger(__name__)


class BaseMetaphlanParser(BaseTaxonomyCountsParser):

def __init__(self, *args, analysis_type: str = 'rel_ab', **kwargs):
"""
Args:
analysis_type: output type of Metaphlan3 (see ``-t`` option of metaphlan3)
{ 'rel_ab', 'rel_ab_w_read_stats', 'reads_map', 'clade_profiles', 'marker_ab_table', 'marker_counts',
'marker_pres_table', 'clade_specific_strain_tracker' }
"""
self.analysis_type = analysis_type
self.analysis_type = self._valid_analysis_type(analysis_type)
super().__init__(*args, **kwargs)

def rows_differences(self, dataframe1, dataframe2) -> DataFrame:
def _valid_analysis_type(self, analysis_type):
choices = [
"rel_ab", "rel_ab_w_read_stats", "reads_map", "clade_profiles",
"marker_ab_table", "marker_counts", "marker_pres_table", "clade_specific_strain_tracker"
]
if analysis_type not in choices:
logger.warning("analysis_type='%s' not valid, set to default ('rel_ab').", analysis_type)
analysis_type = "rel_ab"
return analysis_type

def rows_differences(self, dataframe1, dataframe2) -> pd.DataFrame:
rows_diff = dataframe1 - dataframe2
rows_diff[rows_diff.isnull()] = dataframe1
if self.analysis_type == 'rel_ab':
rows_diff[rows_diff < 0.0001] = 0
# if the difference between the sum of the organisms of rank r (ex: sum of the species of genus X)
# and the value of rank r+1 (ex: genus X) is this small,
# we assume that it is due to floating-point rounding in the addition
else:
rows_diff[rows_diff < 0] = 0
rows_diff = rows_diff.loc[rows_diff.sum(axis=1)[rows_diff.sum(axis=1) != 0].index]
return rows_diff

def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, rank) -> DataFrame:
def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, rank) -> pd.DataFrame:
df_rank = whole_df[whole_df.index.map(lambda x: len(x.split('|'))) == rank]

# transformation lower_level to rank (level)
Expand All @@ -32,7 +51,22 @@ def compare_difference_between_two_levels(self, whole_df, df_at_lower_level, ran
df_rank_computed = df_rank_computed.groupby(df_rank_computed.index).sum() # grouping by rank (level)
return self.rows_differences(df_rank, df_rank_computed)

def remove_duplicates(self, df) -> DataFrame:
def remove_duplicates(self, df) -> pd.DataFrame:
"""
Metaphlan3 results are reported level by level, therefore we need to remove the duplicated information.
Example:
We have:
...|g_GenusA 50.0
...|g_GenusA|s_Species1 30.0
...|g_GenusB 50.0
...|g_GenusB|s_Species2 50.0
Sum = 180.0 =/= 100.0 (even though it's relative abundance; the same problem occurs with other analysis types)
We want:
...|g_GenusA|s_GenusA (genus) 20.0 # unspecified species
...|g_GenusA|s_Species1 30.0
...|g_GenusB|s_Species2 50.0
Sum = 100.0
"""
df = df.set_index(self.taxa_column)

# dataframe at rank level
Expand All @@ -56,8 +90,8 @@ def remove_duplicates(self, df) -> DataFrame:
rank -= 1
rows_diff = self.compare_difference_between_two_levels(df, new_df, rank)
if rows_diff.size != 0:
new_df = new_df.append(rows_diff) # add missing rows to the dataframe of the lower level

# new_df = new_df.append(rows_diff) # add missing rows to the dataframe of the lower level
new_df = pd.concat([new_df, rows_diff]) # add missing rows to the dataframe of the lower level
# verification that everything is defined up to the lower_level
samples_with_incomp_lowerlevel = new_df.sum()[new_df.sum() < total]

Expand All @@ -72,7 +106,7 @@ class Metaphlan2Parser(BaseMetaphlanParser):

taxa_column = 'ID'

def _load_data(self) -> DataFrame:
def _load_data(self) -> pd.DataFrame:
df = super()._load_data()
df = self.remove_duplicates(df)
df = self.split_taxa_fill_none(df, sep="|")
Expand All @@ -88,17 +122,37 @@ class Metaphlan3Parser(BaseMetaphlanParser):
taxa_column = 'clade_name'
NCBI_tax_column = 'NCBI_tax_id'

def __init__(self, *args, analysis_type: str = 'rel_ab', **kwargs):
def __init__(self, *args, analysis_type: str = 'rel_ab', keep_NCBI_tax_col: bool = False, **kwargs):
"""
Args:
analysis_type: output type of Metaphlan3 (see ``-t`` option of metaphlan3)
{ 'rel_ab', 'rel_ab_w_read_stats', 'reads_map', 'clade_profiles', 'marker_ab_table', 'marker_counts',
'marker_pres_table', 'clade_specific_strain_tracker' }
keep_NCBI_tax_col: set to True if you want the NCBI tax column in the returned dataframe.
"""
self.keep_NCBI_tax_col = keep_NCBI_tax_col
super().__init__(*args, analysis_type=analysis_type, parsing_options={'skiprows': 1}, **kwargs)

def _load_data(self) -> DataFrame:
def _load_data(self) -> pd.DataFrame:
df = super()._load_data()
df = df.drop(self.NCBI_tax_column, axis=1)

# if the number of taxonomical_names is lower than the default,
if len(self.taxonomical_names) < len(BaseTaxonomyCountsParser.taxonomical_names):
# we need to restrict the rows considered to only the rows that describe a taxonomical level inside the
# wanted range.
# Otherwise, this error will be raised:
# "ValueError: Error : expecting a integer inferior or equal to the number of taxonomical_names."
df = df[df["NCBI_tax_id"].map(lambda x: len(x.split("|"))) <= len(self.taxonomical_names)]
if self.keep_NCBI_tax_col:
tmp = df[[self.NCBI_tax_column, self.taxa_column]]

df = df.drop(self.NCBI_tax_column, axis=1) # NCBI_tax_column needs to be dropped because sum
df = self.remove_duplicates(df)

if self.keep_NCBI_tax_col:
tmp[self.NCBI_tax_column] = tmp[self.NCBI_tax_column].map(lambda x: x.split("|")[-1])
df = df.merge(tmp)

df = self.split_taxa_fill_none(df, sep="|")
df = df.set_index(self.taxonomical_names[:self.rank_level])
return df
6 changes: 4 additions & 2 deletions moonstone/plot/counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,8 @@ def _plot_most_what_taxa_boxplot_or_violin(
tmp = relab_df_taxa[i].reset_index()
tmp.index = nb * [i]
tmp.columns = ["species", "relative abundance"]
relab_df_taxa2 = relab_df_taxa2.append(tmp)
# relab_df_taxa2 = relab_df_taxa2.append(tmp)
relab_df_taxa2 = pd.concat([relab_df_taxa2, tmp])
relab_df_taxa2.species = relab_df_taxa2.species.apply(self._italicize_taxa_name)
groups = [self._italicize_taxa_name(name) for name in groups]

Expand Down Expand Up @@ -747,7 +748,8 @@ def plot_sample_composition_most_abundant_taxa(
# Make graph
graph = MatrixBarGraph(data_df)
# Plotting options
title = f"{taxa_level.capitalize()} composition for the top {taxa_number} most abundant species across samples"
title = f"{taxa_level.capitalize()} composition for the top {taxa_number} most abundant {taxa_level} across \
samples"
if prevalence_threshold is not None:
title += f" (present in at least {prevalence_threshold}% of samples)"

Expand Down
6 changes: 3 additions & 3 deletions moonstone/plot/graphs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ def plot_one_graph(
if groups:
filtered_df = self.data[self.data[group_col].isin(groups)]
filtered_df[group_col] = filtered_df[group_col].astype("category")
filtered_df[group_col].cat.set_categories(groups, inplace=True)
filtered_df[group_col].cat = filtered_df[group_col].cat.set_categories(groups)
filtered_df = filtered_df.sort_values([group_col])
else:
filtered_df = copy.deepcopy(self.data)
Expand All @@ -247,7 +247,7 @@ def plot_one_graph(
fig,
filtered_df2[group_col],
filtered_df2[data_col],
names[group],
str(names[group]),
filtered_df.index,
self._get_group_color(group, colors),
orientation,
Expand All @@ -265,7 +265,7 @@ def plot_one_graph(
fig,
filtered_df[group_col],
filtered_df[data_col],
names[group],
str(names[group]),
filtered_df.index,
self._get_group_color(group, colors),
orientation,
Expand Down
14 changes: 9 additions & 5 deletions moonstone/utils/df_merge.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import pandas as pd
import numpy as np

logger = logging.getLogger(__name__)

Expand All @@ -26,10 +25,15 @@ def merge(self):
logger.info('Merge function called to merge count data and metadata.')
logger.info(f'Variable {self.variable} from metadata file will be merged with counts.')

if not isinstance(self.dc.index, type(self.dm.index)):
logger.warning(f'Index types do not match: {type(self.dc.index)} and {type(self.dm.index)}.')
self.dc.set_index(np.int64(np.array(self.dc.index)), inplace=True)
logger.info(f' Indexes reset. Count Index={type(self.dc.index)}, Metadata Index={type(self.dm.index)}')
# if not isinstance(self.dc.index, type(self.dm.index)):
if self.dc.index.dtype != self.dm.index.dtype:
# logger.warning(f'Index types do not match: {type(self.dc.index)} and {type(self.dm.index)}.')
# self.dc = self.dc.set_index(np.int64(np.array(self.dc.index)))
# logger.info(f' Indexes reset. Count Index={type(self.dc.index)}, Metadata Index={type(self.dm.index)}')
logger.warning(f'Index types do not match: {self.dc.index.dtype} and {self.dm.index.dtype}.')
self.dc.index = self.dc.index.astype(str)
self.dm.index = self.dm.index.astype(str)
logger.warning('Both Count and Metadata Indexes set as string')

df = pd.merge(self.dm[self.variable], self.dc, left_index=True, right_index=True)
logger.info('Merge function completed. Returning merged data frame.')
Expand Down
Loading

0 comments on commit 5946124

Please sign in to comment.