diff --git a/alphastats/__init__.py b/alphastats/__init__.py index f8025997..1d001088 100644 --- a/alphastats/__init__.py +++ b/alphastats/__init__.py @@ -43,8 +43,8 @@ import alphastats.gui # noqa: F401 from .cli import * # noqa: F403 -from .loader.AlphaPeptLoader import * # noqa: F403 -from .loader.DIANNLoader import * # noqa: F403 -from .loader.FragPipeLoader import * # noqa: F403 -from .loader.MaxQuantLoader import * # noqa: F403 -from .loader.SpectronautLoader import * # noqa: F403 +from .loader.alphapept_loader import * # noqa: F403 +from .loader.diann_loader import * # noqa: F403 +from .loader.fragpipe_loader import * # noqa: F403 +from .loader.maxquant_loader import * # noqa: F403 +from .loader.spectronaut_loader import * # noqa: F403 diff --git a/alphastats/dataset/__init__.py b/alphastats/dataset/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/alphastats/DataSet.py b/alphastats/dataset/dataset.py similarity index 96% rename from alphastats/DataSet.py rename to alphastats/dataset/dataset.py index e58d66ec..89e06487 100644 --- a/alphastats/DataSet.py +++ b/alphastats/dataset/dataset.py @@ -4,20 +4,24 @@ import plotly import scipy -from alphastats.dataset_factory import DataSetFactory -from alphastats.dataset_harmonizer import DataHarmonizer -from alphastats.DataSet_Plot import Plot -from alphastats.DataSet_Preprocess import Preprocess -from alphastats.DataSet_Statistics import Statistics -from alphastats.keys import Cols -from alphastats.loader.BaseLoader import BaseLoader -from alphastats.plots.ClusterMap import ClusterMap -from alphastats.plots.DimensionalityReduction import DimensionalityReduction -from alphastats.plots.IntensityPlot import IntensityPlot -from alphastats.plots.SampleHistogram import SampleHistogram -from alphastats.plots.VolcanoPlot import VolcanoPlot +from alphastats.dataset.factory import DataSetFactory +from alphastats.dataset.harmonizer import DataHarmonizer +from alphastats.dataset.keys import Cols +from alphastats.dataset.plotting import Plot +from alphastats.dataset.preprocessing import Preprocess +from alphastats.dataset.statistics import Statistics +from alphastats.dataset.utils import ( + LoaderError, + check_for_missing_values, + ignore_warning, +) +from alphastats.loader.base_loader import BaseLoader +from alphastats.plots.clustermap import ClusterMap +from alphastats.plots.dimensionality_reduction import DimensionalityReduction +from alphastats.plots.intensity_plot import IntensityPlot +from alphastats.plots.sample_histogram import SampleHistogram +from alphastats.plots.volcano_plot import VolcanoPlot from alphastats.statistics.tukey_test import tukey_test -from alphastats.utils import LoaderError, check_for_missing_values, ignore_warning plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template( layout=plotly.graph_objects.Layout( diff --git a/alphastats/dataset_factory.py b/alphastats/dataset/factory.py similarity index 97% rename from alphastats/dataset_factory.py rename to alphastats/dataset/factory.py index 607f04c2..d06a54f3 100644 --- a/alphastats/dataset_factory.py +++ b/alphastats/dataset/factory.py @@ -5,8 +5,8 @@ import numpy as np import pandas as pd -from alphastats.dataset_harmonizer import DataHarmonizer -from alphastats.keys import Cols +from alphastats.dataset.harmonizer import DataHarmonizer +from alphastats.dataset.keys import Cols class DataSetFactory: diff --git a/alphastats/dataset_harmonizer.py b/alphastats/dataset/harmonizer.py similarity index 95% rename from 
alphastats/dataset_harmonizer.py rename to alphastats/dataset/harmonizer.py index 9c416690..5ac98682 100644 --- a/alphastats/dataset_harmonizer.py +++ b/alphastats/dataset/harmonizer.py @@ -4,8 +4,8 @@ import pandas as pd -from alphastats.keys import Cols -from alphastats.loader.BaseLoader import BaseLoader +from alphastats.dataset.keys import Cols +from alphastats.loader.base_loader import BaseLoader class DataHarmonizer: diff --git a/alphastats/keys.py b/alphastats/dataset/keys.py similarity index 100% rename from alphastats/keys.py rename to alphastats/dataset/keys.py diff --git a/alphastats/DataSet_Pathway.py b/alphastats/dataset/pathway.py similarity index 99% rename from alphastats/DataSet_Pathway.py rename to alphastats/dataset/pathway.py index b9dd4dc3..b83f5ed0 100644 --- a/alphastats/DataSet_Pathway.py +++ b/alphastats/dataset/pathway.py @@ -7,8 +7,8 @@ import plotly.express as px import requests -from alphastats import AlphaPeptLoader -from alphastats.utils import check_if_df_empty, check_internetconnection +from alphastats.dataset.utils import check_if_df_empty, check_internetconnection +from alphastats.loader.alphapept_loader import AlphaPeptLoader class enrichment_df(pd.DataFrame): diff --git a/alphastats/DataSet_Plot.py b/alphastats/dataset/plotting.py similarity index 97% rename from alphastats/DataSet_Plot.py rename to alphastats/dataset/plotting.py index 5b110f1d..2a85e4b9 100644 --- a/alphastats/DataSet_Plot.py +++ b/alphastats/dataset/plotting.py @@ -7,9 +7,9 @@ import scipy import seaborn as sns -from alphastats.keys import Cols -from alphastats.plots.PlotUtils import PlotUtils -from alphastats.utils import check_for_missing_values +from alphastats.dataset.keys import Cols +from alphastats.dataset.utils import check_for_missing_values +from alphastats.plots.plot_utils import PlotUtils class plotly_object(plotly.graph_objs._figure.Figure): diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/dataset/preprocessing.py similarity index 99% rename from alphastats/DataSet_Preprocess.py rename to alphastats/dataset/preprocessing.py index 9d92a4b0..2d1c854c 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/dataset/preprocessing.py @@ -9,8 +9,8 @@ import streamlit as st from sklearn.experimental import enable_iterative_imputer # noqa -from alphastats.keys import Cols, ConstantsClass -from alphastats.utils import ignore_warning +from alphastats.dataset.keys import Cols, ConstantsClass +from alphastats.dataset.utils import ignore_warning class PreprocessingStateKeys(metaclass=ConstantsClass): diff --git a/alphastats/DataSet_Statistics.py b/alphastats/dataset/statistics.py similarity index 95% rename from alphastats/DataSet_Statistics.py rename to alphastats/dataset/statistics.py index a0e3f66d..8efc9f4e 100644 --- a/alphastats/DataSet_Statistics.py +++ b/alphastats/dataset/statistics.py @@ -4,13 +4,13 @@ import pandas as pd import pingouin -from alphastats.keys import Cols -from alphastats.statistics.Anova import Anova -from alphastats.statistics.DifferentialExpressionAnalysis import ( +from alphastats.dataset.keys import Cols +from alphastats.dataset.utils import ignore_warning +from alphastats.statistics.anova import Anova +from alphastats.statistics.differential_expression_analysis import ( DifferentialExpressionAnalysis, ) -from alphastats.statistics.MultiCovaAnalysis import MultiCovaAnalysis -from alphastats.utils import ignore_warning +from alphastats.statistics.multicova_analysis import MultiCovaAnalysis class Statistics: diff --git a/alphastats/utils.py 
b/alphastats/dataset/utils.py similarity index 100% rename from alphastats/utils.py rename to alphastats/dataset/utils.py diff --git a/alphastats/gui/pages/02_Import Data.py b/alphastats/gui/pages/02_Import Data.py index 97402f22..610af37b 100644 --- a/alphastats/gui/pages/02_Import Data.py +++ b/alphastats/gui/pages/02_Import Data.py @@ -1,6 +1,6 @@ import streamlit as st -from alphastats.DataSet import DataSet +from alphastats.dataset.dataset import DataSet from alphastats.gui.utils.import_helper import ( load_example_data, load_proteomics_data, diff --git a/alphastats/gui/pages/03_Preprocessing.py b/alphastats/gui/pages/04_Preprocessing.py similarity index 96% rename from alphastats/gui/pages/03_Preprocessing.py rename to alphastats/gui/pages/04_Preprocessing.py index 9b98f61c..314bdf18 100644 --- a/alphastats/gui/pages/03_Preprocessing.py +++ b/alphastats/gui/pages/04_Preprocessing.py @@ -1,6 +1,6 @@ import streamlit as st -from alphastats.DataSet_Preprocess import PreprocessingStateKeys +from alphastats.dataset.preprocessing import PreprocessingStateKeys from alphastats.gui.utils.preprocessing_helper import ( configure_preprocessing, display_preprocessing_info, diff --git a/alphastats/gui/pages/04_Analysis.py b/alphastats/gui/pages/05_Analysis.py similarity index 100% rename from alphastats/gui/pages/04_Analysis.py rename to alphastats/gui/pages/05_Analysis.py diff --git a/alphastats/gui/pages/05_LLM.py b/alphastats/gui/pages/06_LLM.py similarity index 100% rename from alphastats/gui/pages/05_LLM.py rename to alphastats/gui/pages/06_LLM.py diff --git a/alphastats/gui/pages/06_Results.py b/alphastats/gui/pages/07_Results.py similarity index 100% rename from alphastats/gui/pages/06_Results.py rename to alphastats/gui/pages/07_Results.py diff --git a/alphastats/gui/utils/analysis.py b/alphastats/gui/utils/analysis.py index b097c365..5f66a54e 100644 --- a/alphastats/gui/utils/analysis.py +++ b/alphastats/gui/utils/analysis.py @@ -7,10 +7,10 @@ import pandas as pd import streamlit as st -from alphastats.DataSet import DataSet -from alphastats.keys import Cols, ConstantsClass -from alphastats.plots.PlotUtils import PlotlyObject -from alphastats.plots.VolcanoPlot import VolcanoPlot +from alphastats.dataset.dataset import DataSet +from alphastats.dataset.keys import Cols, ConstantsClass +from alphastats.plots.plot_utils import PlotlyObject +from alphastats.plots.volcano_plot import VolcanoPlot class PlottingOptions(metaclass=ConstantsClass): diff --git a/alphastats/gui/utils/analysis_helper.py b/alphastats/gui/utils/analysis_helper.py index 46c5bcb6..00175675 100644 --- a/alphastats/gui/utils/analysis_helper.py +++ b/alphastats/gui/utils/analysis_helper.py @@ -13,7 +13,7 @@ StateKeys, show_button_download_df, ) -from alphastats.plots.PlotUtils import PlotlyObject +from alphastats.plots.plot_utils import PlotlyObject @st.fragment diff --git a/alphastats/gui/utils/import_helper.py b/alphastats/gui/utils/import_helper.py index e7b05f19..d1e8f327 100644 --- a/alphastats/gui/utils/import_helper.py +++ b/alphastats/gui/utils/import_helper.py @@ -7,9 +7,10 @@ import streamlit as st from streamlit.runtime.uploaded_file_manager import UploadedFile -from alphastats.DataSet import DataSet +from alphastats.dataset.dataset import DataSet from alphastats.gui.utils.options import SOFTWARE_OPTIONS -from alphastats.loader.MaxQuantLoader import BaseLoader, MaxQuantLoader +from alphastats.loader.base_loader import BaseLoader +from alphastats.loader.maxquant_loader import MaxQuantLoader def 
load_proteomics_data(uploaded_file, intensity_column, index_column, software): diff --git a/alphastats/gui/utils/options.py b/alphastats/gui/utils/options.py index 47150480..4bc9c04d 100644 --- a/alphastats/gui/utils/options.py +++ b/alphastats/gui/utils/options.py @@ -1,10 +1,10 @@ -from alphastats import SpectronautLoader -from alphastats.loader.AlphaPeptLoader import AlphaPeptLoader -from alphastats.loader.DIANNLoader import DIANNLoader -from alphastats.loader.FragPipeLoader import FragPipeLoader -from alphastats.loader.GenericLoader import GenericLoader -from alphastats.loader.MaxQuantLoader import MaxQuantLoader -from alphastats.loader.mzTabLoader import mzTabLoader +from alphastats.loader.alphapept_loader import AlphaPeptLoader +from alphastats.loader.diann_loader import DIANNLoader +from alphastats.loader.fragpipe_loader import FragPipeLoader +from alphastats.loader.generic_loader import GenericLoader +from alphastats.loader.maxquant_loader import MaxQuantLoader +from alphastats.loader.mztab_loader import mzTabLoader +from alphastats.loader.spectronaut_loader import SpectronautLoader SOFTWARE_OPTIONS = { "MaxQuant": { diff --git a/alphastats/gui/utils/overview_helper.py b/alphastats/gui/utils/overview_helper.py index 78c0b9d8..1c72a386 100644 --- a/alphastats/gui/utils/overview_helper.py +++ b/alphastats/gui/utils/overview_helper.py @@ -1,7 +1,7 @@ import pandas as pd import streamlit as st -from alphastats.DataSet import DataSet +from alphastats.dataset.dataset import DataSet from alphastats.gui.utils.ui_helper import StateKeys, show_button_download_df diff --git a/alphastats/gui/utils/preprocessing_helper.py b/alphastats/gui/utils/preprocessing_helper.py index 22ecedfe..67c76604 100644 --- a/alphastats/gui/utils/preprocessing_helper.py +++ b/alphastats/gui/utils/preprocessing_helper.py @@ -4,8 +4,8 @@ import streamlit as st from st_cytoscape import cytoscape -from alphastats.DataSet import DataSet -from alphastats.keys import Cols +from alphastats.dataset.dataset import DataSet +from alphastats.dataset.keys import Cols CYTOSCAPE_STYLESHEET = [ { diff --git a/alphastats/gui/utils/ui_helper.py b/alphastats/gui/utils/ui_helper.py index 5a09820b..d56f6e75 100644 --- a/alphastats/gui/utils/ui_helper.py +++ b/alphastats/gui/utils/ui_helper.py @@ -5,8 +5,8 @@ import streamlit as st from alphastats import __version__ +from alphastats.dataset.keys import ConstantsClass from alphastats.gui.utils.preprocessing_helper import PREPROCESSING_STEPS -from alphastats.keys import ConstantsClass # TODO add logo above the options when issue is closed # https://github.com/streamlit/streamlit/issues/4984 diff --git a/alphastats/llm/llm_functions.py b/alphastats/llm/llm_functions.py index d6d1977a..c4d2dff7 100644 --- a/alphastats/llm/llm_functions.py +++ b/alphastats/llm/llm_functions.py @@ -4,7 +4,7 @@ import pandas as pd -from alphastats.DataSet import DataSet +from alphastats.dataset.dataset import DataSet from alphastats.llm.enrichment_analysis import get_enrichment_data from alphastats.llm.uniprot_utils import get_gene_function diff --git a/alphastats/llm/llm_integration.py b/alphastats/llm/llm_integration.py index 47a3cfce..122c6771 100644 --- a/alphastats/llm/llm_integration.py +++ b/alphastats/llm/llm_integration.py @@ -10,7 +10,7 @@ from openai import OpenAI from openai.types.chat import ChatCompletion, ChatCompletionMessageToolCall -from alphastats.DataSet import DataSet +from alphastats.dataset.dataset import DataSet from alphastats.llm.llm_functions import ( GENERAL_FUNCTION_MAPPING, 
get_assistant_functions, diff --git a/alphastats/llm/prompts.py b/alphastats/llm/prompts.py index 9afdfe83..66443e60 100644 --- a/alphastats/llm/prompts.py +++ b/alphastats/llm/prompts.py @@ -5,7 +5,7 @@ from openai.types.chat import ChatCompletionMessageToolCall -from alphastats.DataSet import DataSet +from alphastats.dataset.dataset import DataSet from alphastats.llm.llm_utils import get_subgroups_for_each_group diff --git a/alphastats/load_data.py b/alphastats/load_data.py deleted file mode 100644 index 6afa01a4..00000000 --- a/alphastats/load_data.py +++ /dev/null @@ -1,22 +0,0 @@ -from alphastats import MaxQuantLoader -from alphastats.loader.AlphaPeptLoader import AlphaPeptLoader -from alphastats.loader.DIANNLoader import DIANNLoader -from alphastats.loader.FragPipeLoader import FragPipeLoader - - -# TODO: Currently only used by tests, but should maybe be used more widely -def load_data(file, type, **kwargs): - type = type.lower() - if type == "maxquant": - loader = MaxQuantLoader(file=file, **kwargs) - elif type == "alphapept": - loader = AlphaPeptLoader(file=file, **kwargs) - elif type == "diann": - loader = DIANNLoader(file=file, **kwargs) - elif type == "fragpipe": - loader = FragPipeLoader(file=file, **kwargs) - else: - raise ValueError( - f"type: {type} is invalid. Choose from maxquant, alphapept, diann, fragpipe" - ) - return loader diff --git a/alphastats/loader/AlphaPeptLoader.py b/alphastats/loader/alphapept_loader.py similarity index 98% rename from alphastats/loader/AlphaPeptLoader.py rename to alphastats/loader/alphapept_loader.py index 186a4cb4..b1da5b9e 100644 --- a/alphastats/loader/AlphaPeptLoader.py +++ b/alphastats/loader/alphapept_loader.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from alphastats.loader.BaseLoader import BaseLoader +from alphastats.loader.base_loader import BaseLoader class AlphaPeptLoader(BaseLoader): diff --git a/alphastats/loader/BaseLoader.py b/alphastats/loader/base_loader.py similarity index 98% rename from alphastats/loader/BaseLoader.py rename to alphastats/loader/base_loader.py index 69839964..5a6f390d 100644 --- a/alphastats/loader/BaseLoader.py +++ b/alphastats/loader/base_loader.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from alphastats.utils import find_duplicates_in_list +from alphastats.dataset.utils import find_duplicates_in_list if sys.version_info >= (3, 9): import importlib.resources as importlib_resources diff --git a/alphastats/loader/DIANNLoader.py b/alphastats/loader/diann_loader.py similarity index 98% rename from alphastats/loader/DIANNLoader.py rename to alphastats/loader/diann_loader.py index 676aa107..1407ab68 100644 --- a/alphastats/loader/DIANNLoader.py +++ b/alphastats/loader/diann_loader.py @@ -1,4 +1,4 @@ -from alphastats.loader.BaseLoader import BaseLoader +from alphastats.loader.base_loader import BaseLoader class DIANNLoader(BaseLoader): diff --git a/alphastats/loader/FragPipeLoader.py b/alphastats/loader/fragpipe_loader.py similarity index 96% rename from alphastats/loader/FragPipeLoader.py rename to alphastats/loader/fragpipe_loader.py index 862490b1..b583482d 100644 --- a/alphastats/loader/FragPipeLoader.py +++ b/alphastats/loader/fragpipe_loader.py @@ -2,7 +2,7 @@ import pandas as pd -from alphastats.loader.BaseLoader import BaseLoader +from alphastats.loader.base_loader import BaseLoader # Philosopher diff --git a/alphastats/loader/GenericLoader.py b/alphastats/loader/generic_loader.py similarity index 97% rename from alphastats/loader/GenericLoader.py rename to 
alphastats/loader/generic_loader.py index b622ec5a..87549832 100644 --- a/alphastats/loader/GenericLoader.py +++ b/alphastats/loader/generic_loader.py @@ -2,7 +2,7 @@ import pandas as pd -from alphastats.loader.BaseLoader import BaseLoader +from alphastats.loader.base_loader import BaseLoader class GenericLoader(BaseLoader): diff --git a/alphastats/loader/MaxQuantLoader.py b/alphastats/loader/maxquant_loader.py similarity index 98% rename from alphastats/loader/MaxQuantLoader.py rename to alphastats/loader/maxquant_loader.py index d85fb13a..86573a9e 100644 --- a/alphastats/loader/MaxQuantLoader.py +++ b/alphastats/loader/maxquant_loader.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from alphastats.loader.BaseLoader import BaseLoader +from alphastats.loader.base_loader import BaseLoader class MaxQuantLoader(BaseLoader): diff --git a/alphastats/loader/mzTabLoader.py b/alphastats/loader/mztab_loader.py similarity index 95% rename from alphastats/loader/mzTabLoader.py rename to alphastats/loader/mztab_loader.py index 8a5ce19b..b859e6ec 100644 --- a/alphastats/loader/mzTabLoader.py +++ b/alphastats/loader/mztab_loader.py @@ -1,6 +1,6 @@ from pyteomics import mztab -from alphastats.loader.BaseLoader import BaseLoader +from alphastats.loader.base_loader import BaseLoader class mzTabLoader(BaseLoader): diff --git a/alphastats/loader/SpectronautLoader.py b/alphastats/loader/spectronaut_loader.py similarity index 99% rename from alphastats/loader/SpectronautLoader.py rename to alphastats/loader/spectronaut_loader.py index 67112fdb..9840eae2 100644 --- a/alphastats/loader/SpectronautLoader.py +++ b/alphastats/loader/spectronaut_loader.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd -from alphastats.loader.BaseLoader import BaseLoader +from alphastats.loader.base_loader import BaseLoader SPECTRONAUT_COLUMN_DELIM = "." 
diff --git a/alphastats/multicova/multicova.py b/alphastats/multicova/multicova.py index 0bcfe20f..6d2daeb3 100644 --- a/alphastats/multicova/multicova.py +++ b/alphastats/multicova/multicova.py @@ -15,7 +15,7 @@ from sklearn.preprocessing import StandardScaler from statsmodels.stats.multitest import multipletests -from alphastats.keys import Cols +from alphastats.dataset.keys import Cols # code taken from Isabel Bludau - multicova diff --git a/alphastats/plots/ClusterMap.py b/alphastats/plots/clustermap.py similarity index 95% rename from alphastats/plots/ClusterMap.py rename to alphastats/plots/clustermap.py index 9d122f00..3bada8da 100644 --- a/alphastats/plots/ClusterMap.py +++ b/alphastats/plots/clustermap.py @@ -4,9 +4,9 @@ import pandas as pd import seaborn as sns -from alphastats.DataSet_Statistics import Statistics -from alphastats.keys import Cols -from alphastats.plots.PlotUtils import PlotUtils +from alphastats.dataset.keys import Cols +from alphastats.dataset.statistics import Statistics +from alphastats.plots.plot_utils import PlotUtils class ClusterMap(PlotUtils): diff --git a/alphastats/plots/DimensionalityReduction.py b/alphastats/plots/dimensionality_reduction.py similarity index 97% rename from alphastats/plots/DimensionalityReduction.py rename to alphastats/plots/dimensionality_reduction.py index aa4a3ae5..fd6c8eeb 100644 --- a/alphastats/plots/DimensionalityReduction.py +++ b/alphastats/plots/dimensionality_reduction.py @@ -6,9 +6,9 @@ import plotly.graph_objects as go import sklearn -from alphastats.DataSet_Preprocess import Preprocess -from alphastats.keys import Cols -from alphastats.plots.PlotUtils import PlotlyObject, PlotUtils +from alphastats.dataset.keys import Cols +from alphastats.dataset.preprocessing import Preprocess +from alphastats.plots.plot_utils import PlotlyObject, PlotUtils # make own alphastats theme plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template( diff --git a/alphastats/plots/IntensityPlot.py b/alphastats/plots/intensity_plot.py similarity index 98% rename from alphastats/plots/IntensityPlot.py rename to alphastats/plots/intensity_plot.py index 03d90f67..d3d0f717 100644 --- a/alphastats/plots/IntensityPlot.py +++ b/alphastats/plots/intensity_plot.py @@ -8,8 +8,8 @@ import plotly.graph_objects as go import scipy -from alphastats.keys import Cols -from alphastats.plots.PlotUtils import PlotlyObject, PlotUtils +from alphastats.dataset.keys import Cols +from alphastats.plots.plot_utils import PlotlyObject, PlotUtils plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template( layout=plotly.graph_objects.Layout( diff --git a/alphastats/plots/PlotUtils.py b/alphastats/plots/plot_utils.py similarity index 100% rename from alphastats/plots/PlotUtils.py rename to alphastats/plots/plot_utils.py diff --git a/alphastats/plots/SampleHistogram.py b/alphastats/plots/sample_histogram.py similarity index 100% rename from alphastats/plots/SampleHistogram.py rename to alphastats/plots/sample_histogram.py diff --git a/alphastats/plots/VolcanoPlot.py b/alphastats/plots/volcano_plot.py similarity index 96% rename from alphastats/plots/VolcanoPlot.py rename to alphastats/plots/volcano_plot.py index 14b15ad5..4249f21b 100644 --- a/alphastats/plots/VolcanoPlot.py +++ b/alphastats/plots/volcano_plot.py @@ -6,19 +6,19 @@ import plotly.express as px import plotly.graph_objects as go -from alphastats.DataSet_Preprocess import PreprocessingStateKeys -from alphastats.DataSet_Statistics import Statistics -from alphastats.keys 
import Cols +from alphastats.dataset.keys import Cols +from alphastats.dataset.preprocessing import PreprocessingStateKeys +from alphastats.dataset.statistics import Statistics +from alphastats.dataset.utils import ignore_warning from alphastats.multicova import multicova -from alphastats.plots.PlotUtils import PlotlyObject, PlotUtils -from alphastats.statistics.DifferentialExpressionAnalysis import ( +from alphastats.plots.plot_utils import PlotlyObject, PlotUtils +from alphastats.statistics.differential_expression_analysis import ( DifferentialExpressionAnalysis, ) -from alphastats.statistics.StatisticUtils import ( +from alphastats.statistics.statistic_utils import ( add_metadata_column, calculate_foldchange, ) -from alphastats.utils import ignore_warning # TODO this is repeated and needs to go elsewhere! plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template( diff --git a/alphastats/statistics/Anova.py b/alphastats/statistics/anova.py similarity index 98% rename from alphastats/statistics/Anova.py rename to alphastats/statistics/anova.py index 12e5f2a0..bae20058 100644 --- a/alphastats/statistics/Anova.py +++ b/alphastats/statistics/anova.py @@ -4,7 +4,7 @@ import scipy from tqdm import tqdm -from alphastats.keys import Cols +from alphastats.dataset.keys import Cols from alphastats.statistics.tukey_test import tukey_test diff --git a/alphastats/statistics/DifferentialExpressionAnalysis.py b/alphastats/statistics/differential_expression_analysis.py similarity index 97% rename from alphastats/statistics/DifferentialExpressionAnalysis.py rename to alphastats/statistics/differential_expression_analysis.py index aff27088..fac08197 100644 --- a/alphastats/statistics/DifferentialExpressionAnalysis.py +++ b/alphastats/statistics/differential_expression_analysis.py @@ -4,10 +4,10 @@ import pandas as pd import scipy -from alphastats.DataSet_Preprocess import PreprocessingStateKeys -from alphastats.keys import Cols +from alphastats.dataset.keys import Cols +from alphastats.dataset.preprocessing import PreprocessingStateKeys from alphastats.multicova import multicova -from alphastats.statistics.StatisticUtils import ( +from alphastats.statistics.statistic_utils import ( add_metadata_column, calculate_foldchange, ) diff --git a/alphastats/statistics/MultiCovaAnalysis.py b/alphastats/statistics/multicova_analysis.py similarity index 99% rename from alphastats/statistics/MultiCovaAnalysis.py rename to alphastats/statistics/multicova_analysis.py index e863ac87..07dc45ee 100644 --- a/alphastats/statistics/MultiCovaAnalysis.py +++ b/alphastats/statistics/multicova_analysis.py @@ -4,7 +4,7 @@ import pandas as pd import plotly.express as px -from alphastats.keys import Cols +from alphastats.dataset.keys import Cols # TODO unused diff --git a/alphastats/statistics/StatisticUtils.py b/alphastats/statistics/statistic_utils.py similarity index 96% rename from alphastats/statistics/StatisticUtils.py rename to alphastats/statistics/statistic_utils.py index 5cfa6729..814dc890 100644 --- a/alphastats/statistics/StatisticUtils.py +++ b/alphastats/statistics/statistic_utils.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from alphastats.keys import Cols +from alphastats.dataset.keys import Cols def calculate_foldchange( diff --git a/alphastats/statistics/tukey_test.py b/alphastats/statistics/tukey_test.py index 637a4a02..8a05ce42 100644 --- a/alphastats/statistics/tukey_test.py +++ b/alphastats/statistics/tukey_test.py @@ -1,7 +1,7 @@ import pandas as pd import pingouin -from 
alphastats.keys import Cols
+from alphastats.dataset.keys import Cols
 def tukey_test(
diff --git a/docs/api_reference/dataset.rst b/docs/api_reference/dataset.rst
index 529c6d3d..e8509c2b 100644
--- a/docs/api_reference/dataset.rst
+++ b/docs/api_reference/dataset.rst
@@ -5,7 +5,7 @@ DataSet
 DataSet
 ~~~~~~~~~~~
-.. automodule:: alphastats.DataSet
+.. automodule:: alphastats.dataset.dataset
    :members:
    :undoc-members:
    :inherited-members:
diff --git a/docs/import_data.md b/docs/import_data.md
index c5b6bb76..f92b767f 100644
--- a/docs/import_data.md
+++ b/docs/import_data.md
@@ -12,7 +12,7 @@ maxquant_data = alphastats.MaxQuantLoader(
     file="testfiles/maxquant_proteinGroups.txt"
 )
-dataset = alphastats.DataSet(
+dataset = alphastats.dataset.dataset.DataSet(
     loader = maxquant_data,
     metadata_path_or_df="../testfiles/maxquant/metadata.xlsx",
     sample_column="sample"
@@ -115,7 +115,7 @@ To compare samples across various conditions in the downstream analysis, a metad
 ## Creating a DataSet
-The whole downstream analysis can be performed on the alphastats.DataSet. To create the DataSet you need to provide the loader object as well as the metadata.
+The whole downstream analysis can be performed on the alphastats.dataset.dataset.DataSet. To create the DataSet you need to provide the loader object as well as the metadata.
 ```python
 import alphastats
@@ -124,7 +124,7 @@ maxquant_data = alphastats.MaxQuantLoader(
     file="testfiles/maxquant_proteinGroups.txt"
 )
-dataset = alphastats.DataSet(
+dataset = alphastats.dataset.dataset.DataSet(
     loader = maxquant_data,
     metadata_path_or_df="../testfiles/maxquant/metadata.xlsx",
     sample_column="sample"
diff --git a/nbs/getting_started.ipynb b/nbs/getting_started.ipynb
index 556911b9..b9cb9ad3 100644
--- a/nbs/getting_started.ipynb
+++ b/nbs/getting_started.ipynb
@@ -11,7 +11,7 @@
    "\n",
    "import pandas as pd\n",
    "\n",
-   "from alphastats.DataSet import DataSet\n",
+   "from alphastats.dataset.dataset import DataSet\n",
    "\n",
    "warnings.filterwarnings(\n",
    " \"ignore\", category=UserWarning, module=\"openpyxl\"\n",
diff --git a/nbs/liu_2019.ipynb b/nbs/liu_2019.ipynb
index 609ddcb7..88d861be 100644
--- a/nbs/liu_2019.ipynb
+++ b/nbs/liu_2019.ipynb
@@ -21,7 +21,7 @@
    "import plotly.io as pio\n",
    "\n",
    "import alphastats\n",
-   "from alphastats.DataSet import DataSet\n",
+   "from alphastats.dataset.dataset import DataSet\n",
    "\n",
    "pio.renderers.default = \"plotly_mimetype+notebook\""
   ]
@@ -59,7 +59,7 @@
   "metadata": {},
   "source": [
    "We are going to load the `proteinGroups.txt` and the corresponding metadatafile. You can find the data [here](https://github.com/MannLabs/alphapeptstats/tree/main/testfiles/maxquant) or on ProteomeXchange [PXD011839](http://proteomecentral.proteomexchange.org/cgi/GetDataset?ID=PXD011839).\n",
-   "To load the proteomis data you need to create a loader object using `alphastats.MaxQuantLoader`. The whole downstream analysis will be performed on a `alphastats.DataSet`. To create the DataSet you need to provide the loader object as well as the metadata."
+   "To load the proteomics data you need to create a loader object using `alphastats.MaxQuantLoader`. The whole downstream analysis will be performed on an `alphastats.dataset.dataset.DataSet`. To create the DataSet you need to provide the loader object as well as the metadata."
  ]
 },
 {
diff --git a/nbs/ramus_2016.ipynb b/nbs/ramus_2016.ipynb
index a0b3fdf7..98d4757c 100644
--- a/nbs/ramus_2016.ipynb
+++ b/nbs/ramus_2016.ipynb
@@ -87,7 +87,7 @@
    " filter_columns=[],\n",
    ")\n",
    "\n",
-   "ds = alphastats.DataSet(\n",
+   "ds = alphastats.dataset.dataset.DataSet(\n",
    " loader=loader, metadata_path_or_df=\"metadata.csv\", sample_column=\"sample\"\n",
    ")"
   ]
diff --git a/tests/gui/conftest.py b/tests/gui/conftest.py
index 43c95d13..1388f621 100644
--- a/tests/gui/conftest.py
+++ b/tests/gui/conftest.py
@@ -3,8 +3,11 @@
 from streamlit.testing.v1 import AppTest
-from alphastats.DataSet import DataSet
-from alphastats.load_data import load_data
+from alphastats.dataset.dataset import DataSet
+from alphastats.loader.alphapept_loader import AlphaPeptLoader
+from alphastats.loader.diann_loader import DIANNLoader
+from alphastats.loader.fragpipe_loader import FragPipeLoader
+from alphastats.loader.maxquant_loader import MaxQuantLoader
 # TODO: Turn the helpers into fixtures
@@ -21,9 +24,26 @@ def print_session_state(apptest: AppTest):
     )
+def _load_data(file, type, **kwargs):
+    type = type.lower()
+    if type == "maxquant":
+        loader = MaxQuantLoader(file=file, **kwargs)
+    elif type == "alphapept":
+        loader = AlphaPeptLoader(file=file, **kwargs)
+    elif type == "diann":
+        loader = DIANNLoader(file=file, **kwargs)
+    elif type == "fragpipe":
+        loader = FragPipeLoader(file=file, **kwargs)
+    else:
+        raise ValueError(
+            f"type: {type} is invalid. Choose from maxquant, alphapept, diann, fragpipe"
+        )
+    return loader
+
+
 def create_dataset_alphapept():
     """Creates a dataset object from the alphapept testfiles."""
-    loader = load_data(
+    loader = _load_data(
         file=str(TEST_INPUT_FILES_PATH / "alphapept/results_proteins.csv"),
         type="alphapept",
     )
diff --git a/tests/gui/test_02_import_data.py b/tests/gui/test_02_import_data.py
index 7e5915fa..1dfbc411 100644
--- a/tests/gui/test_02_import_data.py
+++ b/tests/gui/test_02_import_data.py
@@ -53,7 +53,7 @@ def test_page_02_loads_example_data(mock_page_link: MagicMock):
     ]
     assert (
         str(type(at.session_state[StateKeys.DATASET]))
-        == "<class 'alphastats.DataSet.DataSet'>"
+        == "<class 'alphastats.dataset.dataset.DataSet'>"
     )
diff --git a/tests/gui/test_04_preprocessing.py b/tests/gui/test_04_preprocessing.py
index 05bd4578..3ccdefc7 100644
--- a/tests/gui/test_04_preprocessing.py
+++ b/tests/gui/test_04_preprocessing.py
@@ -4,7 +4,7 @@
 from .conftest import APP_FOLDER, create_dataset_alphapept
-TESTED_PAGE = f"{APP_FOLDER}/pages/03_Preprocessing.py"
+TESTED_PAGE = f"{APP_FOLDER}/pages/04_Preprocessing.py"
 def test_page_04_loads_without_input():
diff --git a/tests/llm/test_llm_functions.py b/tests/llm/test_llm_functions.py
index c0424fa8..365b1d24 100644
--- a/tests/llm/test_llm_functions.py
+++ b/tests/llm/test_llm_functions.py
@@ -9,7 +9,7 @@
 import pandas as pd
-from alphastats.DataSet import DataSet
+from alphastats.dataset.dataset import DataSet
 from alphastats.llm.llm_functions import (
     GENERAL_FUNCTION_MAPPING,
     get_assistant_functions,
diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py
index f483fe84..d35e96bc 100644
--- a/tests/test_DataSet.py
+++ b/tests/test_DataSet.py
@@ -10,16 +10,16 @@
 import pandas as pd
 import plotly
-from alphastats.DataSet import DataSet
-from alphastats.dataset_factory import DataSetFactory
-from alphastats.DataSet_Preprocess import PreprocessingStateKeys
-from alphastats.loader.AlphaPeptLoader import AlphaPeptLoader
-from alphastats.loader.DIANNLoader import DIANNLoader
-from alphastats.loader.FragPipeLoader import FragPipeLoader
-from alphastats.loader.GenericLoader import GenericLoader
-from alphastats.loader.MaxQuantLoader import MaxQuantLoader -from alphastats.loader.SpectronautLoader import SpectronautLoader -from alphastats.utils import LoaderError +from alphastats.dataset.dataset import DataSet +from alphastats.dataset.factory import DataSetFactory +from alphastats.dataset.preprocessing import PreprocessingStateKeys +from alphastats.dataset.utils import LoaderError +from alphastats.loader.alphapept_loader import AlphaPeptLoader +from alphastats.loader.diann_loader import DIANNLoader +from alphastats.loader.fragpipe_loader import FragPipeLoader +from alphastats.loader.generic_loader import GenericLoader +from alphastats.loader.maxquant_loader import MaxQuantLoader +from alphastats.loader.spectronaut_loader import SpectronautLoader logger = logging.getLogger(__name__) @@ -491,7 +491,7 @@ def test_preprocess_subset(self): self.obj.preprocess(subset=True) self.assertEqual(self.obj.mat.shape[0], 48) - @patch("alphastats.DataSet.DataSet.tukey_test") + @patch("alphastats.dataset.dataset.DataSet.tukey_test") def test_anova_without_tukey(self, mock): # TODO: Check why 4 extra rows are generated here. This is not due to changes made to 0 and nan filtering. anova_results = self.obj.anova(column="disease", protein_ids="all", tukey=False) diff --git a/tests/test_DataSet_Pathway.py b/tests/test_DataSet_Pathway.py index 7aa441f4..f3fa7a90 100644 --- a/tests/test_DataSet_Pathway.py +++ b/tests/test_DataSet_Pathway.py @@ -3,9 +3,9 @@ import numpy as np import pandas as pd -from alphastats.DataSet import DataSet -from alphastats.DataSet_Pathway import Enrichment -from alphastats.loader.MaxQuantLoader import MaxQuantLoader +from alphastats.dataset.dataset import DataSet +from alphastats.dataset.pathway import Enrichment +from alphastats.loader.maxquant_loader import MaxQuantLoader class BaseTestDataSet: diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 00000000..d35e96bc --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,1062 @@ +import copy +import logging +import os +import shutil +import unittest +from unittest import skip +from unittest.mock import patch + +import numpy as np +import pandas as pd +import plotly + +from alphastats.dataset.dataset import DataSet +from alphastats.dataset.factory import DataSetFactory +from alphastats.dataset.preprocessing import PreprocessingStateKeys +from alphastats.dataset.utils import LoaderError +from alphastats.loader.alphapept_loader import AlphaPeptLoader +from alphastats.loader.diann_loader import DIANNLoader +from alphastats.loader.fragpipe_loader import FragPipeLoader +from alphastats.loader.generic_loader import GenericLoader +from alphastats.loader.maxquant_loader import MaxQuantLoader +from alphastats.loader.spectronaut_loader import SpectronautLoader + +logger = logging.getLogger(__name__) + + +class BaseTestDataSet: + # parent class of test loader for common tests among loaders + # this is wrapped in a nested class so it doesnt get called separatly when testing + # plus to avoid multiple inheritance + class BaseTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.loader = None + self.obj = None + self.metadata_path = None + self.matrix_dim = None + self.matrix_dim_filtered = None + self.comparison_column = None + + def test_check_loader_no_error(self): + self.obj._check_loader(loader=self.loader) + # nothing raised -> ok + + def test_check_loader_error_invalid_column(self): + # invalid index column + with self.assertRaises(ValueError): + 
self.loader.index_column = 100 + self.obj._check_loader(loader=self.loader) + + def test_check_loader_error_empty_df(self): + # empty dataframe + with self.assertRaises(ValueError): + self.loader.rawinput = pd.DataFrame() + self.obj._check_loader(loader=self.loader) + + def test_check_loader_error_invalid_loader(self): + # invalid loader, class + with self.assertRaises(LoaderError): + df = pd.DataFrame() + self.obj._check_loader(loader=df) + + def test_load_metadata(self): + # is dataframe loaded + self.assertIsInstance(self.obj.metadata, pd.DataFrame) + self.assertFalse(self.obj.metadata.empty) + + @patch("logging.Logger.warning") + def test_load_metadata_warning(self, mock): + # is dataframe None and is warning produced + file_path = "wrong/file.xxx" + self.obj._dataset_factory._load_metadata(file_path=file_path) + mock.assert_called_once() + + def test_create_matrix(self): + # matrix dimensions + self.assertEqual(self.obj.mat.shape, self.matrix_dim) + # does the matrix only contain floats/integers and NAs + is_dtype_numeric = list( + set(list(map(pd.api.types.is_numeric_dtype, self.obj.mat.dtypes))) + ) + self.assertEqual(is_dtype_numeric, [True]) + + @patch("logging.Logger.warning") + def test_check_values_warning(self, mock): + # is dataframe None and is warning produced + data = { + "A": [10, 11, 12, 13, 14], + "B": [23, 22, 24, 22, 25], + "C": [66, 72, np.inf, 68, -np.inf], + } + mat = pd.DataFrame(data) + DataSetFactory._check_matrix_values(mat) + mock.assert_called_once() + + @patch("logging.Logger.info") + def test_preprocess_filter(self, mock): + # is info printed if contamination columns get removed + # is the new matrix smaller than the older matrix + self.obj.preprocess(remove_contaminations=True) + self.assertEqual(self.obj.mat.shape, self.matrix_dim_filtered) + # info has been printed at least once + mock.assert_called_once() + + @patch("logging.Logger.info") + def test_preprocess_filter_already_filter(self, mock): + # is warning raised when filter columns are none + # is info printed if contamination columns get removed + # is the new matrix smaller than the older matrix + self.assertFalse( + self.obj.preprocessing_info.get( + PreprocessingStateKeys.CONTAMINATIONS_REMOVED + ) + ) + self.obj.preprocess(remove_contaminations=True) + self.assertEqual(self.obj.mat.shape, self.matrix_dim_filtered) + self.assertTrue( + self.obj.preprocessing_info.get( + PreprocessingStateKeys.CONTAMINATIONS_REMOVED + ) + ) + self.obj.preprocess(remove_contaminations=True) + self.assertEqual(self.obj.mat.shape, self.matrix_dim_filtered) + + @patch("logging.Logger.info") + def test_preprocess_filter_no_filter_columns(self, mock): + self.obj.filter_columns = [] + self.obj.preprocess(remove_contaminations=True) + mock.assert_called_once() + + def test_preprocess_normalization_invalid_method(self): + """ + Raises Error when method is not available for Normalization + """ + with self.assertRaises(ValueError): + self.obj.preprocess(normalization="wrong method") + + def test_preprocess_imputation_invalid_method(self): + with self.assertRaises(ValueError): + self.obj.preprocess(imputation="wrong method") + + def test_imputation_mean(self): + self.obj.preprocess(imputation="mean") + self.assertFalse(self.obj.mat.isna().values.any()) + + def test_imputation_median(self): + self.obj.preprocess(imputation="median") + self.assertFalse(self.obj.mat.isna().values.any()) + + def test_imputation_knn(self): + self.obj.preprocess(imputation="knn") + self.assertFalse(self.obj.mat.isna().values.any()) + + def 
test_plot_sampledistribution_wrong_method(self): + """ + Raises Error when method is not available for plotting Sampledistribution + """ + with self.assertRaises(ValueError): + self.obj.plot_sampledistribution(method="wrong_method") + + def test_plot_sampledistribution(self): + plot = self.obj.plot_sampledistribution(log_scale=True) + # check if it is a figure + self.assertIsInstance(plot, plotly.graph_objects.Figure) + # convert plotly objec to dict + plot_dict = plot.to_plotly_json() + # check if plotly object is not empty + self.assertEqual(len(plot_dict.get("data")), 1) + # check if it is logscale + self.assertEqual(plot_dict.get("layout").get("yaxis").get("type"), "log") + + def test_reset_preprocessing(self): + self.assertEqual(self.obj.mat.shape, self.matrix_dim) + + self.obj.preprocess(remove_contaminations=True) + self.assertEqual(self.obj.mat.shape, self.matrix_dim_filtered) + + self.obj.reset_preprocessing() + self.assertEqual(self.obj.mat.shape, self.matrix_dim) + + +class TestAlphaPeptDataSet(BaseTestDataSet.BaseTest): + # do testing which requires extra files only on TestAlphaPeptDataSet + # to reduce the amount of compariosn files required + def setUp(self): + self.loader = AlphaPeptLoader(file="testfiles/alphapept/results_proteins.csv") + self.metadata_path = "testfiles/alphapept/metadata.csv" + self.obj = DataSet( + loader=self.loader, + metadata_path_or_df=self.metadata_path, + sample_column="sample", + ) + # expected dimensions of matrix + self.matrix_dim = (2, 3781) + self.matrix_dim_filtered = (2, 3707) + # metadata column to compare for PCA, t-test, etc. + self.comparison_column = "disease" + + def test_dataset_without_metadata(self): + obj = DataSet(loader=self.loader) + self.assertEqual(obj.mat.shape[0], obj.metadata.shape[0]) + + def test_load_metadata_fileformats(self): + # test if different fileformats get loaded correctly + metadata_path = "testfiles/alphapept/metadata.txt" + self.obj._dataset_factory._load_metadata(file_path=metadata_path) + self.assertEqual(self.obj.metadata.shape, (2, 2)) + + metadata_path = "testfiles/alphapept/metadata.tsv" + self.obj._dataset_factory._load_metadata(file_path=metadata_path) + self.assertEqual(self.obj.metadata.shape, (2, 2)) + + metadata_path = "testfiles/alphapept/metadata.csv" + self.obj._dataset_factory._load_metadata(file_path=metadata_path) + self.assertEqual(self.obj.metadata.shape, (2, 2)) + + @patch("logging.Logger.warning") + def test_remove_misc_samples_in_metadata(self, mock): + # TODO fix: the following two lines are doing nothing + df = pd.DataFrame( + {"sample": ["A", "B", "C"], "b": ["disease", "health", "disease"]} + ) + _ = DataSet( + loader=self.loader, + metadata_path_or_df=df, + sample_column="sample", + ) + # is sample C removed + self.assertEqual(self.obj.metadata.shape, (2, 2)) + mock.assert_called_once() + + def test_load_metadata_df(self): + if self.metadata_path.endswith(".csv"): + df = pd.read_csv(self.metadata_path) + else: + df = pd.read_excel(self.metadata_path) + obj = DataSet( + loader=self.loader, + metadata_path_or_df=df, + sample_column="sample", + ) + self.assertIsInstance(obj.metadata, pd.DataFrame) + self.assertFalse(obj.metadata.empty) + + def test_preprocess_remove_samples(self): + sample_list = ["A"] + self.obj.preprocess(remove_samples=sample_list) + self.assertEqual(self.obj.mat.shape, (1, 3781)) + + def test_preprocess_normalize_zscore(self): + self.obj.mat = pd.DataFrame({"a": [2, 5, 4], "b": [5, 4, 4], "c": [0, 10, 8]}) + # zscore Normalization + 
self.obj.preprocess(log2_transform=False, normalization="zscore") + expected_mat = pd.DataFrame( + { + "a": [-0.162221, -0.508001, -0.707107], + "b": [1.297771, -0.889001, -0.707107], + "c": [-1.135550, 1.397001, 1.414214], + } + ) + pd._testing.assert_frame_equal(self.obj.mat, expected_mat) + + def test_preprocess_normalize_quantile(self): + self.obj.mat = pd.DataFrame({"a": [2, 5, 4], "b": [5, 4, 4], "c": [0, 10, 8]}) + # Quantile Normalization + self.obj.preprocess(log2_transform=False, normalization="quantile") + expected_mat = pd.DataFrame( + {"a": [0.5, 0.5, 0.0], "b": [1.0, 0.0, 0.0], "c": [0.0, 1.0, 1.0]} + ) + pd._testing.assert_frame_equal(self.obj.mat, expected_mat) + + def test_preprocess_normalize_linear(self): + # !!! normalizes by row and not by feature + self.obj.mat = pd.DataFrame({"a": [2, 5, 4], "b": [5, 4, 4], "c": [0, 10, 8]}) + # Linear Normalization + self.obj.preprocess(log2_transform=False, normalization="linear") + expected_mat = pd.DataFrame( + { + "a": [0.37139068, 0.42107596, 0.40824829], + "b": [0.92847669, 0.33686077, 0.40824829], + "c": [0.0, 0.84215192, 0.81649658], + } + ) + pd._testing.assert_frame_equal(self.obj.mat, expected_mat) + + def test_preprocess_normalize_vst(self): + self.obj.mat = pd.DataFrame({"a": [2, 5, 4], "b": [5, 4, 4], "c": [0, 10, 8]}) + # Linear Normalization + self.obj.preprocess(log2_transform=False, normalization="vst") + expected_mat = pd.DataFrame( + { + "a": [-0.009526, -0.236399, -0.707107], + "b": [1.229480, -1.089313, -0.707107], + "c": [-1.219954, 1.325712, 1.414214], + } + ) + pd._testing.assert_frame_equal(self.obj.mat.round(2), expected_mat.round(2)) + + def test_preprocess_imputation_mean_values(self): + self.obj.mat = pd.DataFrame( + {"a": [2, np.nan, 4], "b": [5, 4, 4], "c": [np.nan, 10, np.nan]} + ) + self.obj.preprocess(log2_transform=False, imputation="mean") + expected_mat = pd.DataFrame( + {"a": [2.0, 3.0, 4.0], "b": [5.0, 4.0, 4.0], "c": [10.0, 10.0, 10.0]} + ) + pd._testing.assert_frame_equal(self.obj.mat, expected_mat) + + def test_preprocess_imputation_median_values(self): + self.obj.mat = pd.DataFrame( + {"a": [2, np.nan, 4], "b": [5, 4, 4], "c": [np.nan, 10, np.nan]} + ) + self.obj.preprocess(log2_transform=False, imputation="median") + expected_mat = pd.DataFrame( + {"a": [2.0, 3.0, 4.0], "b": [5.0, 4.0, 4.0], "c": [10.0, 10.0, 10.0]} + ) + pd._testing.assert_frame_equal(self.obj.mat, expected_mat) + + def test_preprocess_imputation_knn_values(self): + self.obj.mat = pd.DataFrame( + {"a": [2, np.nan, 4], "b": [5, 4, 4], "c": [np.nan, 10, np.nan]} + ) + self.obj.preprocess(log2_transform=False, imputation="knn") + expected_mat = pd.DataFrame( + {"a": [2.0, 3.0, 4.0], "b": [5.0, 4.0, 4.0], "c": [10.0, 10.0, 10.0]} + ) + pd._testing.assert_frame_equal(self.obj.mat, expected_mat) + + def test_preprocess_imputation_randomforest_values(self): + self.obj.mat = pd.DataFrame( + {"a": [2, np.nan, 4], "b": [5, 4, 4], "c": [np.nan, 10, np.nan]} + ) + self.obj.preprocess(log2_transform=False, imputation="randomforest") + expected_mat = pd.DataFrame( + { + "a": [2.0, 3.0, 4.0], + "b": [5.0, 4.0, 4.0], + "c": [10.0, 10.0, 10.0], + } + ) + pd._testing.assert_frame_equal(self.obj.mat, expected_mat) + + def test_plot_sampledistribution_group(self): + plot = self.obj.plot_sampledistribution( + method="box", color="disease", log_scale=False + ) + # check if it is a figure + self.assertIsInstance(plot, plotly.graph_objects.Figure) + # convert plotly object to dict + plot_dict = plot.to_plotly_json() + # check if it doesnt get 
transformed to logscale + self.assertEqual(plot_dict.get("layout").get("yaxis").get("type"), None) + # check if there are two groups control and disease + self.assertEqual(plot_dict.get("data")[0].get("legendgroup"), "control") + # check that it is boxplot and not violinplot + is_boxplot = "boxmode" in plot_dict.get("layout") + self.assertTrue(is_boxplot) + + def test_plot_correlation_matrix(self): + plot = self.obj.plot_correlation_matrix() + plot_dict = plot.to_plotly_json() + correlation_calculations_expected = [1.0, 0.999410773629427] + self.assertEqual( + plot_dict.get("data")[0].get("z")[0].tolist(), + correlation_calculations_expected, + ) + + def test_plot_clustermap(self): + self.obj.preprocess(log2_transform=False, imputation="knn") + plot = self.obj.plot_clustermap() + first_row = plot.data2d.iloc[0].to_list() + expected = [487618.5371077078, 1293013.103298046] + self.assertEqual(first_row, expected) + + def test_plot_clustermap_with_label_bar(self): + self.obj.preprocess(log2_transform=False, imputation="knn") + plot = self.obj.plot_clustermap(label_bar=self.comparison_column) + first_row = plot.data2d.iloc[0].to_list() + expected = [487618.5371077078, 1293013.103298046] + self.assertEqual(first_row, expected) + + +class TestMaxQuantDataSet(BaseTestDataSet.BaseTest): + def setUp(self): + self.loader = MaxQuantLoader(file="testfiles/maxquant/proteinGroups.txt") + self.metadata_path = "testfiles/maxquant/metadata.xlsx" + self.obj: DataSet = DataSet( + loader=self.loader, + metadata_path_or_df=self.metadata_path, + sample_column="sample", + ) + # expected dimensions of matrix + self.matrix_dim = (312, 2611) + self.matrix_dim_filtered = (312, 2409) + self.comparison_column = "disease" + + def test_load_evidence_wrong_sample_names(self): + with self.assertRaises(ValueError): + loader = MaxQuantLoader( + file="testfiles/maxquant/proteinGroups.txt", + evidence_file="testfiles/maxquant_go/evidence.txt", + ) + DataSet( + loader=loader, + metadata_path_or_df=self.metadata_path, + sample_column="sample", + ) + + def test_plot_pca_group(self): + pca_plot = self.obj.plot_pca(group=self.comparison_column) + # 5 different disease + self.assertEqual(len(pca_plot.to_plotly_json().get("data")), 5) + + def test_data_completeness(self): + self.obj.preprocess( + log2_transform=False, replace_zeroes=True, data_completeness=0.7 + ) + self.assertEqual(self.obj.mat.shape[1], 159) + + def test_plot_pca_circles(self): + pca_plot = self.obj.plot_pca(group=self.comparison_column, circle=True) + # are there 5 circles test_preprocess_imputation_randomforest_values - for each group + number_of_groups = len(pca_plot.to_plotly_json().get("layout").get("shapes")) + self.assertEqual(number_of_groups, 5) + + def test_plot_umap_group(self): + umap_plot = self.obj.plot_umap(group=self.comparison_column) + # 5 different disease + self.assertEqual(len(umap_plot.to_plotly_json().get("data")), 5) + + def test_plot_umap_circles(self): + umap_plot = self.obj.plot_umap(group=self.comparison_column, circle=True) + # are there 5 circles drawn - for each group + number_of_groups = len(umap_plot.to_plotly_json().get("layout").get("shapes")) + self.assertEqual(number_of_groups, 5) + + def test_plot_volcano_with_grouplist(self): + self.obj.plot_volcano( + method="ttest", + group1=["1_31_C6", "1_32_C7", "1_57_E8"], + group2=["1_71_F10", "1_73_F12"], + ) + + def test_plot_volcano_with_grouplist_wrong_names(self): + with self.assertRaises(ValueError): + self.obj.plot_volcano( + method="ttest", + group1=["wrong_sample_name", 
"1_42_D9", "1_57_E8"], + group2=["1_71_F10", "1_73_F12"], + ) + + @skip # TODO reimplement compare_preprocessing_modes + def test_plot_volcano_compare_preprocessing_modes_no_randomforest(self): + obj_ut = DataSet( + loader=self.loader, + metadata_path_or_df=self.metadata_path, + sample_column="sample", + ) + + # 'randomforest' makes this test very costly + obj_ut.imputation_methods.remove("randomforest") + + result_list = obj_ut.plot_volcano( + method="ttest", + group1=["1_31_C6", "1_32_C7", "1_57_E8"], + group2=["1_71_F10", "1_73_F12"], + compare_preprocessing_modes=True, + ) + self.assertEqual(len(result_list), 3 * 3) + + @skip # TODO speed up this test (e.g. by reducing the number of samples) + def test_plot_volcano_compare_preprocessing_modes_randomforest(self): + obj_ut = DataSet( + loader=self.loader, + metadata_path_or_df=self.metadata_path, + sample_column="sample", + ) + + obj_ut.imputation_methods = ["randomforest"] + + result_list = obj_ut.plot_volcano( + method="ttest", + group1=["1_31_C6", "1_32_C7", "1_57_E8"], + group2=["1_71_F10", "1_73_F12"], + compare_preprocessing_modes=True, + ) + self.assertEqual(len(result_list), 3) + + def test_preprocess_subset(self): + self.obj.preprocess(subset=True) + self.assertEqual(self.obj.mat.shape[0], 48) + + @patch("alphastats.dataset.dataset.DataSet.tukey_test") + def test_anova_without_tukey(self, mock): + # TODO: Check why 4 extra rows are generated here. This is not due to changes made to 0 and nan filtering. + anova_results = self.obj.anova(column="disease", protein_ids="all", tukey=False) + self.assertEqual(anova_results["ANOVA_pvalue"][1], 0.4469688936240973) + self.assertEqual(anova_results.shape, (self.matrix_dim[1] + 4, 2)) + # check if tukey isnt called + mock.assert_not_called() + + def test_plot_intenstity_subgroup(self): + plot = self.obj.plot_intensity( + protein_id="K7ERI9;A0A024R0T8;P02654;K7EJI9;K7ELM9;K7EPF9;K7EKP1", + group="disease", + subgroups=["healthy", "liver cirrhosis"], + add_significance=True, + ) + plot_dict = plot.to_plotly_json() + self.assertEqual(len(plot_dict.get("data")), 3) + + def test_plot_intensity_subgroup_gracefully_handle_one_group(self): + plot = self.obj.plot_intensity( + protein_id="K7ERI9;A0A024R0T8;P02654;K7EJI9;K7ELM9;K7EPF9;K7EKP1", + group="disease", + add_significance=True, + ) + plot_dict = plot.to_plotly_json() + self.assertEqual(len(plot_dict.get("data")), 5) + + def test_anova_with_tukey(self): + # with first 100 protein ids + self.obj.preprocess(data_completeness=0.05, imputation="mean") + id_list = self.obj.mat.columns.tolist()[0:100] + results = self.obj.anova(column="disease", protein_ids=id_list, tukey=True) + self.assertEqual(results.shape, (100, 10)) + + # with one protein id + protein_id = "A0A024R4J8;Q92876" + results = self.obj.anova(column="disease", protein_ids=protein_id, tukey=True) + self.assertEqual(results.shape[1], 10) + + def test_tukey_test(self): + protein_id = "K7ERI9;A0A024R0T8;P02654;K7EJI9;K7ELM9;K7EPF9;K7EKP1" + tukey_df = self.obj.tukey_test(protein_id=protein_id, group="disease") + self.assertEqual(tukey_df["p-tukey"][0], 0.674989009816342) + + def test_ancova(self): + ancova_df = self.obj.ancova( + protein_id="K7ERI9;A0A024R0T8;P02654;K7EJI9;K7ELM9;K7EPF9;K7EKP1", + covar="Triglycerides measurement (14740000)", + between="disease", + ) + expected_value = 0.7375624497867097 + given_value = ancova_df["p-unc"][1] + decimal_places = 7 + self.assertAlmostEqual(expected_value, given_value, decimal_places) + + @skip + def test_plot_volcano_with_labels(self): + 
plot = self.obj.plot_volcano( + column="disease", + group1="healthy", + group2="liver cirrhosis", + method="ttest", + labels=True, + draw_line=False, + ) + n_labels = len(plot.to_plotly_json().get("layout").get("annotations")) + self.assertTrue(n_labels > 5) + + def test_plot_volcano_wald(self): + """ + Volcano Plot with wald test and list of samples + """ + self.obj.preprocess(imputation="knn") + self.obj.plot_volcano( + group1=["1_31_C6", "1_32_C7", "1_33_C8"], + group2=["1_78_G5", "1_77_G4", "1_76_G3"], + method="ttest", + ) + + column_added = "_comparison_column" in self.obj.metadata.columns.to_list() + self.assertTrue(column_added) + + def test_plot_volcano_sam(self): + self.obj.preprocess( + log2_transform=False, imputation="knn", normalization="zscore" + ) + plot = self.obj.plot_volcano( + column="disease", + group1="type 2 diabetes mellitus", + group2="type 2 diabetes mellitus|non-alcoholic fatty liver disease", + method="sam", + draw_line=True, + perm=10, + ) + + # fdr lines get drawn + line_1 = plot.to_plotly_json()["data"][-2].get("line").get("shape") + line_2 = plot.to_plotly_json()["data"][-1].get("line").get("shape") + + self.assertEqual(line_1, "spline") + self.assertEqual(line_2, "spline") + + def test_plot_volcano_list(self): + self.obj.preprocess(imputation="mean") + plot = self.obj.plot_volcano( + method="ttest", + group1=["1_31_C6", "1_32_C7", "1_57_E8"], + group2=["1_71_F10", "1_73_F12"], + color_list=self.obj.mat.columns.to_list()[0:20], + ) + self.assertEqual(len(plot.to_plotly_json()["data"][0]["x"]), 1) + + def test_plot_clustermap_significant(self): + import sys + + sys.setrecursionlimit(100000) + self.obj.preprocess(imputation="knn") + self.obj.plot_clustermap( + label_bar=self.comparison_column, + only_significant=True, + group=self.comparison_column, + subgroups=["healthy", "liver cirrhosis"], + ) + + def test_plot_volcano_with_labels_proteins(self): + # remove gene names + self.obj.gene_names = None + plot = self.obj.plot_volcano( + column="disease", + group1="healthy", + group2="liver cirrhosis", + method="ttest", + labels=True, + ) + n_labels = len(plot.to_plotly_json().get("layout").get("annotations")) + self.assertEqual(n_labels, 9) + + def test_plot_volcano_with_labels_proteins_welch_ttest(self): + # remove gene names + self.obj.gene_names = None + plot = self.obj.plot_volcano( + column="disease", + group1="healthy", + group2="liver cirrhosis", + method="welch-ttest", + labels=True, + ) + n_labels = len(plot.to_plotly_json().get("layout").get("annotations")) + self.assertTrue(n_labels > 20) + + def test_calculate_diff_exp_wrong(self): + # get groups from comparison column + with self.assertRaises(ValueError): + self.obj.preprocess(imputation="knn") + groups = list(set(self.obj.metadata[self.comparison_column].to_list())) + group1, group2 = groups[0], groups[1] + + self.obj.diff_expression_analysis( + column=self.comparison_column, + group1=group1, + group2=group2, + method="wrong_method", + ) # check if dataframe gets created + + def test_diff_expression_analysis_nocolumn(self): + with self.assertRaises(ValueError): + self.obj.diff_expression_analysis( + group1="healthy", group2="liver cirrhosis" + ) + + def test_diff_expression_analysis_list(self): + self.obj.diff_expression_analysis( + group1=["1_31_C6", "1_32_C7", "1_33_C8"], + group2=["1_78_G5", "1_77_G4", "1_76_G3"], + method="ttest", + ) + + column_added = "_comparison_column" in self.obj.metadata.columns.to_list() + self.assertTrue(column_added) + + def test_plot_intensity_non_sign(self): + """ + 
No significant label is added to intensity plot + """ + plot = self.obj.plot_intensity( + protein_id="S6BAR0", + group="disease", + subgroups=["liver cirrhosis", "healthy"], + add_significance=True, + ) + + annotation = ( + plot.to_plotly_json().get("layout").get("annotations")[1].get("text") + ) + self.assertEqual(annotation, "-") + + def test_plot_intensity_sign(self): + """ + Significant label * is added to intensity plot + """ + plot = self.obj.plot_intensity( + protein_id="Q9UL94", + group="disease", + subgroups=["liver cirrhosis", "healthy"], + add_significance=True, + ) + + annotation = ( + plot.to_plotly_json().get("layout").get("annotations")[1].get("text") + ) + self.assertEqual(annotation, "*") + + def test_plot_intensity_sign_01(self): + """ + Significant label ** is added to intensity plot + """ + plot = self.obj.plot_intensity( + protein_id="Q96JD0;Q96JD1;P01721", + group="disease", + subgroups=["liver cirrhosis", "healthy"], + add_significance=True, + ) + + annotation = ( + plot.to_plotly_json().get("layout").get("annotations")[1].get("text") + ) + self.assertEqual(annotation, "**") + + def test_plot_intensity_sign_001(self): + """ + Highly significant label is added to intensity plot + """ + plot = self.obj.plot_intensity( + protein_id="Q9BWP8", + group="disease", + subgroups=["liver cirrhosis", "healthy"], + add_significance=True, + ) + + annotation = ( + plot.to_plotly_json().get("layout").get("annotations")[1].get("text") + ) + self.assertEqual(annotation, "***") + + def test_plot_intensity_all(self): + plot = self.obj.plot_intensity( + protein_id="Q9BWP8", + group="disease", + subgroups=["liver cirrhosis", "healthy"], + method="all", + add_significance=True, + ) + self.assertEqual(plot.to_plotly_json()["data"][0]["points"], "all") + + def test_plot_samplehistograms(self): + fig = self.obj.plot_samplehistograms().to_plotly_json() + self.assertEqual(312, len(fig["data"])) + + def test_batch_correction(self): + self.obj.preprocess( + subset=True, replace_zeroes=True, data_completeness=0.1, imputation="knn" + ) + self.obj.batch_correction(batch="batch_artifical_added") + # TODO: check if batch correction worked, but not by np.isclose, as this will change whenever soemthing else about preprocessing is changed + first_value = self.obj.mat.values[0, 0] + self.assertTrue(np.isclose(150490495.32554176, first_value)) + + def test_multicova_analysis_invalid_covariates(self): + self.obj.preprocess(imputation="knn", normalization="zscore", subset=True) + res, _ = self.obj.multicova_analysis( + covariates=[ + "disease", + "Alkaline phosphatase measurement", + "Body mass index ", + "not here", + ], + subset={"disease": ["healthy", "liver cirrhosis"]}, + ) + self.assertEqual(res.shape[1], 45) + + def test_get_protein_id_for_gene_name(self): + self.assertEqual( + self.obj._get_protein_id_for_gene_name("MADE_UP_GENE"), "MADE_UP_GENE" + ) + self.assertEqual( + self.obj._get_protein_id_for_gene_name("ALDOC"), + "P09972;A0A024QZ64;A8MVZ9;B7Z3K9;B7Z1N6;B7Z3K7;J3KSV6;J3QKP5;C9J8F3;B7Z1Z9;J3QKK1;B7Z1H6;K7EKH5;B7Z1L5", + ) + self.assertEqual( + self.obj._get_protein_id_for_gene_name("FCGRT"), "P55899;M0R0A9;A0A024QZI2" + ) + + # def test_perform_gsea(self): + # df = self.obj.perform_gsea(column="disease", + # group1="healthy", + # group2="liver cirrhosis", + # gene_sets= 'KEGG_2019_Human') + + # cholersterol_enhanced = 'Cholesterol metabolism' in df.index.to_list() + # self.assertTrue(cholersterol_enhanced) + + +class TestDIANNDataSet(BaseTestDataSet.BaseTest): + def setUp(self): + self.loader 
= DIANNLoader(file="testfiles/diann/report_final.pg_matrix.tsv") + self.metadata_path = "testfiles/diann/metadata.xlsx" + self.obj = DataSet( + loader=self.loader, + metadata_path_or_df=self.metadata_path, + sample_column="analytical_sample external_id", + ) + # expected dimensions of matrix + self.matrix_dim = (20, 10) + self.matrix_dim_filtered = (20, 10) + self.comparison_column = "grouping1" + + def test_plot_intensity_violin(self): + # Violinplot + plot = self.obj.plot_intensity( + protein_id="A0A075B6H7", group="grouping1", method="violin" + ) + plot_dict = plot.to_plotly_json() + self.assertIsInstance(plot, plotly.graph_objects.Figure) + # are two groups plotted + self.assertEqual(len(plot_dict.get("data")), 2) + + def test_plot_intensity_box(self): + # Boxplot + plot = self.obj.plot_intensity( + protein_id="A0A075B6H7", group="grouping1", method="box", log_scale=True + ) + plot_dict = plot.to_plotly_json() + # log scale + self.assertEqual(plot_dict.get("layout").get("yaxis").get("type"), "log") + is_boxplot = "boxmode" in plot_dict.get("layout") + self.assertTrue(is_boxplot) + + def test_plot_intensity_scatter(self): + # Scatterplot + plot = self.obj.plot_intensity( + protein_id="A0A075B6H7", group="grouping1", method="scatter" + ) + plot_dict = plot.to_plotly_json() + self.assertIsInstance(plot, plotly.graph_objects.Figure) + # are two groups plotted + self.assertEqual(plot_dict.get("data")[0].get("type"), "scatter") + + def test_plot_intensity_wrong_method(self): + with self.assertRaises(ValueError): + self.obj.plot_intensity( + protein_id="A0A075B6H7", group="grouping1", method="wrong" + ) + + def test_plot_clustermap_noimputation(self): + # raises error when data is not imputed + with self.assertRaises(ValueError): + self.obj.plot_clustermap() + + def test_plot_dendrogram(self): + self.obj.preprocess(imputation="mean") + self.obj.plot_dendrogram() + + def test_plot_tsne(self): + plot_dict = self.obj.plot_tsne().to_plotly_json() + # check if everything get plotted + self.assertEqual(len(plot_dict.get("data")[0].get("x")), 20) + + def test_plot_dendrogram_navalues(self): + with self.assertRaises(ValueError): + self.obj.plot_dendrogram() + + def test_plot_dendrogram_not_imputed(self): + with self.assertRaises(ValueError): + self.obj.plot_dendrogram() + + def test_volcano_plot_anova(self): + self.obj.preprocess(imputation="knn", log2_transform=True) + plot = self.obj.plot_volcano( + column="grouping1", group1="Healthy", group2="Disease", method="anova" + ) + expected_y_value = 0.040890177695653236 + y_value = plot.to_plotly_json().get("data")[0].get("y")[1] + self.assertAlmostEqual(y_value, expected_y_value) + + def test_volcano_plot_ttest_no_column(self): + with self.assertRaises(ValueError): + self.obj.preprocess(imputation="knn") + self.obj.plot_volcano(group1="Healthy", group2="Disease", method="ttest") + + def test_volcano_plot_wrongmethod(self): + with self.assertRaises(ValueError): + self.obj.plot_volcano( + column="grouping1", + group1="Healthy", + group2="Disease", + method="wrongmethod", + ) + + # def test_diff_expression_analysis_with_list(self): + # self.obj.preprocess(imputation="knn") + # column="grouping1" + # group1="Healthy" + # group2="Disease" + # group1_samples = self.obj.metadata[self.obj.metadata[column] == group1][ + # "sample" + # ].tolist() + # group2_samples = self.obj.metadata[self.obj.metadata[column] == group2][ + # "sample" + # ].tolist() + # self.obj.diff_expression_analysis( + # group1=group1_samples, + # group2=group2_samples) + + +class 
TestFragPipeDataSet(BaseTestDataSet.BaseTest): + def setUp(self): + self.loader = FragPipeLoader( + file="testfiles/fragpipe/combined_proteins.tsv", + intensity_column="[sample] Razor Intensity", + ) + self.metadata_path = "testfiles/fragpipe/metadata.xlsx" + self.obj = DataSet( + loader=self.loader, + metadata_path_or_df=self.metadata_path, + sample_column="analytical_sample external_id", + ) + # expected dimensions of matrix + self.matrix_dim = (20, 10) + self.matrix_dim_filtered = (20, 10) + self.comparison_column = "grouping1" + + +class TestSpectronautDataSet(BaseTestDataSet.BaseTest): + @classmethod + def setUpClass(cls): + if not os.path.isfile("testfiles/spectronaut/results.tsv"): + shutil.unpack_archive( + "testfiles/spectronaut/results.tsv.zip", "testfiles/spectronaut/" + ) + + cls.cls_loader = SpectronautLoader(file="testfiles/spectronaut/results.tsv") + cls.cls_metadata_path = "testfiles/spectronaut/metadata.xlsx" + cls.cls_obj = DataSet( + loader=cls.cls_loader, + metadata_path_or_df=cls.cls_metadata_path, + sample_column="sample", + ) + + def setUp(self): + self.loader = copy.deepcopy(self.cls_loader) + self.metadata_path = copy.deepcopy(self.cls_metadata_path) + self.obj = copy.deepcopy(self.cls_obj) + self.matrix_dim = (9, 2458) + self.matrix_dim_filtered = (9, 2453) + self.comparison_column = "condition" + + @classmethod + def tearDownClass(cls): + if os.path.isdir("testfiles/spectronaut/__MACOSX"): + shutil.rmtree("testfiles/spectronaut/__MACOSX") + + os.remove("testfiles/spectronaut/results.tsv") + + +class TestGenericDataSet(BaseTestDataSet.BaseTest): + @classmethod + def setUpClass(cls): + if not os.path.isfile("testfiles/fragpipe/combined_proteins.tsv"): + shutil.unpack_archive( + "testfiles/fragpipe/combined_proteins.tsv.zip", "testfiles/fragpipe" + ) + + cls.cls_loader = GenericLoader( + file="testfiles/fragpipe/combined_proteins.tsv", + intensity_column=[ + "S1 Razor Intensity", + "S2 Razor Intensity", + "S3 Razor Intensity", + "S4 Razor Intensity", + "S5 Razor Intensity", + "S6 Razor Intensity", + "S7 Razor Intensity", + "S8 Razor Intensity", + ], + index_column="Protein", + sep="\t", + ) + cls.cls_metadata_path = "testfiles/fragpipe/metadata2.xlsx" + cls.cls_obj = DataSet( + loader=cls.cls_loader, + metadata_path_or_df=cls.cls_metadata_path, + sample_column="analytical_sample external_id", + ) + + def setUp(self): + self.loader = copy.deepcopy(self.cls_loader) + self.metadata_path = copy.deepcopy(self.cls_metadata_path) + self.obj = copy.deepcopy(self.cls_obj) + self.matrix_dim = (8, 10) + self.matrix_dim_filtered = (8, 10) + self.comparison_column = "grouping1" + + @classmethod + def tearDownClass(cls): + if os.path.isdir("testfiles/fragpipe/__MACOSX"): + shutil.rmtree("testfiles/fragpipe/__MACOSX") + + +class TestSyntheticDataSet(BaseTestDataSet.BaseTest): + @classmethod + def setUpClass(cls): + cls.cls_loader = GenericLoader( + file="testfiles/synthetic/preprocessing_pentests.csv", + intensity_column="Intensity [sample]", + index_column="Protein IDs", + ) + cls.cls_metadata_path = ( + "testfiles/synthetic/preprocessing_pentests_metadata.csv" + ) + cls.cls_obj = DataSet( + loader=cls.cls_loader, + metadata_path_or_df=cls.cls_metadata_path, + sample_column="sample", + ) + + def setUp(self): + self.loader = copy.deepcopy(self.cls_loader) + self.metadata_path = copy.deepcopy(self.cls_metadata_path) + self.obj = copy.deepcopy(self.cls_obj) + self.matrix_dim = (4, 20) + self.matrix_dim_filtered = (4, 20) + self.comparison_column = "groups" + + def 
test_preprocess_do_nothing(self): + """No preprocessing""" + self.obj.preprocess() + self.assertEqual(self.obj.mat.shape, self.matrix_dim) + self.assertEqual(np.isnan(self.obj.mat.values.flatten()).sum(), 8) + + def test_preprocess_drop_unmeasured_features(self): + """Remove one completely unmeasured feature""" + self.obj.preprocess(drop_unmeasured_features=True) + self.assertEqual(self.obj.mat.shape[1], 19) + self.assertEqual( + self.obj.preprocessing_info[ + PreprocessingStateKeys.DROP_UNMEASURED_FEATURES + ], + 1, + ) + + def test_preprocess_replace_zero(self): + """Replace zeros with NaNs, drop two unmeasured features, leave 8 NaNs""" + self.obj.preprocess(replace_zeroes=True, drop_unmeasured_features=True) + self.assertEqual(self.obj.mat.shape[1], 18) + self.assertEqual(np.isnan(self.obj.mat.values.flatten()).sum(), 8) + self.assertEqual( + self.obj.preprocessing_info[ + PreprocessingStateKeys.DROP_UNMEASURED_FEATURES + ], + 2, + ) + self.assertEqual( + self.obj.preprocessing_info[PreprocessingStateKeys.REPLACE_ZEROES], True + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dataset_pathway.py b/tests/test_dataset_pathway.py new file mode 100644 index 00000000..f3fa7a90 --- /dev/null +++ b/tests/test_dataset_pathway.py @@ -0,0 +1,122 @@ +import unittest + +import numpy as np +import pandas as pd + +from alphastats.dataset.dataset import DataSet +from alphastats.dataset.pathway import Enrichment +from alphastats.loader.maxquant_loader import MaxQuantLoader + + +class BaseTestDataSet: + # parent class of test loader for common tests among loaders + # this is wrapped in a nested class so it doesn't get called separately when testing + # plus to avoid multiple inheritance + class BaseTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.loader = None + self.obj = None + self.fg_sample = None + self.bg_sample = None + + @unittest.skip("TODO: decouple this from the GO API!") + def test_go_abundance_correction(self): + df = self.obj.go_abundance_correction( + fg_sample=self.fg_sample, bg_sample=self.bg_sample + ) + self.assertFalse(df.empty) + + @unittest.skip("TODO: decouple this from the GO API!") + def test_plot_scatter(self): + df = self.obj.go_abundance_correction( + fg_sample=self.fg_sample, bg_sample=self.bg_sample + ) + plot_dict = df.plot_scatter().to_plotly_json() + # colored in 4 different categories, but this could change when the DBs are updated + self.assertTrue(len(plot_dict.get("data")) > 4) + + @unittest.skip("TODO: decouple this from the GO API!") + def test_plot_bar(self): + df = self.obj.go_abundance_correction( + fg_sample=self.fg_sample, bg_sample=self.bg_sample + ) + df.plot_scatter() + + @unittest.skip("TODO: decouple this from the GO API!") + def test_go_characterize_foreground(self): + df = self.obj.go_characterize_foreground( + tax_id=9606, protein_list=self.obj.mat.columns.to_list()[600:700] + ) + self.assertFalse(df.empty) + + @unittest.skip("TODO: decouple this from the GO API!") + def test_go_compare_samples(self): + df = self.obj.go_compare_samples( + fg_sample=self.fg_sample, bg_sample=self.bg_sample + ) + self.assertTrue(df.empty) + + @unittest.skip("TODO: decouple this from the GO API!") + def test_raise_error_no_evidence(self): + with self.assertRaises(ValueError): + self.obj.evidence_df = None + self.obj.go_abundance_correction( + fg_sample=self.fg_sample, bg_sample=self.bg_sample + ) + + @unittest.skip("TODO: decouple this from the GO API!") + def test_go_abundance_correction_with_list(self): + df =
self.obj.go_abundance_correction( + bg_sample=self.bg_sample, + fg_protein_list=self.obj.mat.columns.to_list()[200:300], + ) + self.assertTrue(df.empty) + + @unittest.skip("TODO: decouple this from the GO API!") + def test_go_genome_list(self): + df = self.obj.go_genome( + protein_list=self.obj.mat.columns.to_list()[600:700] + ) + self.assertFalse(df.empty) + + @unittest.skip("TODO: decouple this from the GO API!") + def test_go_genome_sample(self): + df = self.obj.go_genome(fg_sample=self.fg_sample) + self.assertFalse(df.empty) + + def test_extract_protein_ids(self): + # test function with different entries + entry_one = "sp|P0DMV9|HS71B_HUMAN,sp|P0DMV8|HS71A_HUMAN" + entry_one_protein_id = Enrichment._extract_protein_ids(entry=entry_one) + self.assertEqual(entry_one_protein_id, "P0DMV9;P0DMV8") + + entry_two = "ENSEMBL:ENSBTAP00000007350" + entry_two_protein_id = Enrichment._extract_protein_ids(entry=entry_two) + self.assertEqual(entry_two_protein_id, "ENSBTAP00000007350") + + +class TestMaxQuantGODataSet(BaseTestDataSet.BaseTest): + def setUp(self): + self.loader = MaxQuantLoader( + file="testfiles/maxquant_go/proteinGroups.txt", + evidence_file="testfiles/maxquant_go/evidence.txt", + ) + evidence_df = pd.read_csv("testfiles/maxquant_go/evidence.txt", sep="\t") + metadata = pd.DataFrame({"sample": evidence_df["Raw file"].unique().tolist()}) + metadata["experiment"] = np.where( + metadata["sample"].str.startswith("AC"), "A", "U" + ) + + self.obj = DataSet( + loader=self.loader, + metadata_path_or_df=metadata, + sample_column="sample", + ) + self.fg_sample = "AC399" + self.bg_sample = "UT822" + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_gpt.py b/tests/test_gpt.py index 5d05d1fc..ca92c950 100644 --- a/tests/test_gpt.py +++ b/tests/test_gpt.py @@ -2,9 +2,9 @@ import unittest from unittest.mock import MagicMock, patch -from alphastats.DataSet import DataSet +from alphastats.dataset.dataset import DataSet from alphastats.llm.uniprot_utils import extract_data, get_uniprot_data -from alphastats.loader.MaxQuantLoader import MaxQuantLoader +from alphastats.loader.maxquant_loader import MaxQuantLoader logger = logging.getLogger(__name__) diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 36fe1138..ef0cd865 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -7,12 +7,12 @@ import pandas as pd -from alphastats.loader.AlphaPeptLoader import AlphaPeptLoader -from alphastats.loader.DIANNLoader import DIANNLoader -from alphastats.loader.FragPipeLoader import FragPipeLoader -from alphastats.loader.MaxQuantLoader import MaxQuantLoader -from alphastats.loader.mzTabLoader import mzTabLoader -from alphastats.loader.SpectronautLoader import SpectronautLoader +from alphastats.loader.alphapept_loader import AlphaPeptLoader +from alphastats.loader.diann_loader import DIANNLoader +from alphastats.loader.fragpipe_loader import FragPipeLoader +from alphastats.loader.maxquant_loader import MaxQuantLoader +from alphastats.loader.mztab_loader import mzTabLoader +from alphastats.loader.spectronaut_loader import SpectronautLoader logger = logging.getLogger(__name__)
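Taken together, the renamed imports in these test files reduce to one usage pattern against the reorganized package layout. The sketch below is assembled only from calls that already appear in TestDIANNDataSet above; the file paths, sample column, and grouping values are those test fixtures (assumptions outside that context), not a statement about the wider DataSet API.

# Minimal smoke test of the new module layout, mirroring TestDIANNDataSet.
from alphastats.dataset.dataset import DataSet
from alphastats.loader.diann_loader import DIANNLoader

loader = DIANNLoader(file="testfiles/diann/report_final.pg_matrix.tsv")
dataset = DataSet(
    loader=loader,
    metadata_path_or_df="testfiles/diann/metadata.xlsx",
    sample_column="analytical_sample external_id",
)

# The ANOVA volcano test imputes with knn and log2-transforms before plotting,
# so this sketch does the same.
dataset.preprocess(imputation="knn", log2_transform=True)

fig = dataset.plot_volcano(
    column="grouping1", group1="Healthy", group2="Disease", method="anova"
)
fig.to_plotly_json()  # serializable figure, as asserted throughout the tests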