-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
simplify-dataset-init #333
Merged
Merged
Changes from 4 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
842f418
reorder and simplify methods in DataSet constructor
mschwoer 3abea3e
introduce datasetfactory
mschwoer 5446d0e
use datasetfactory
mschwoer ac311c1
fix tests
mschwoer c5cce03
make dataset factory private
mschwoer 441002e
move dedicated "preprocessed" flag to dict
mschwoer 02b4da2
Merge branch 'development' into simplify-dataset-init
mschwoer File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
from typing import List, Union, Dict, Optional | ||
from typing import List, Union, Dict, Optional, Tuple | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
@@ -20,6 +20,7 @@ | |
from alphastats.plots.IntensityPlot import IntensityPlot | ||
from alphastats.plots.SampleHistogram import SampleHistogram | ||
from alphastats.plots.VolcanoPlot import VolcanoPlot | ||
from alphastats.dataset_factory import DataSetFactory | ||
|
||
plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template( | ||
layout=plotly.graph_objects.Layout( | ||
|
@@ -48,14 +49,14 @@ class DataSet: | |
def __init__( | ||
self, | ||
loader: BaseLoader, | ||
metadata_path: Optional[str] = None, | ||
metadata_path: Optional[Union[str, pd.DataFrame]] = None, | ||
sample_column: Optional[str] = None, | ||
): | ||
"""Create DataSet | ||
|
||
Args: | ||
loader (_type_): loader of class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader | ||
metadata_path (str, optional): path to metadata file. Defaults to None. | ||
metadata_path (str or pd.DataFrame, optional): path to metadata file or an actual df. Defaults to None. | ||
sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None. | ||
|
||
Attributes of a DataSet instance: | ||
|
@@ -66,47 +67,81 @@ def __init__( | |
""" | ||
self._check_loader(loader=loader) | ||
|
||
# fill data from loader | ||
self.rawinput: pd.DataFrame = loader.rawinput | ||
self.software: str = loader.software | ||
self.index_column: str = loader.index_column | ||
self.intensity_column: Union[str, list] = loader.intensity_column | ||
self.filter_columns: List[str] = loader.filter_columns | ||
self.evidence_df: pd.DataFrame = loader.evidence_df | ||
self.gene_names: str = loader.gene_names | ||
|
||
# include filtering before | ||
self._create_matrix() | ||
self._check_matrix_values() | ||
|
||
self.metadata: pd.DataFrame | ||
self.sample: str | ||
if metadata_path is not None: | ||
self.sample = sample_column | ||
self.metadata = self._load_metadata(file_path=metadata_path) | ||
self._remove_misc_samples_in_metadata() | ||
else: | ||
self.sample = "sample" | ||
self.metadata = pd.DataFrame({"sample": list(self.mat.index)}) | ||
|
||
if loader == "Generic": | ||
intensity_column = loader._extract_sample_names( | ||
self.index_column: str = loader.index_column | ||
self.software: str = loader.software | ||
self._gene_names: str = loader.gene_names | ||
|
||
self._intensity_column: Union[str, list] = ( | ||
loader._extract_sample_names( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. note that this was different before: |
||
metadata=self.metadata, sample_column=self.sample | ||
) | ||
self.intensity_column = intensity_column | ||
if loader == "Generic" | ||
else loader.intensity_column | ||
) | ||
|
||
# init preprocessing settings | ||
self.preprocessing_info: Dict = Preprocess.init_preprocessing_info( | ||
num_samples=self.mat.shape[0], | ||
num_protein_groups=self.mat.shape[1], | ||
intensity_column=self.intensity_column, | ||
filter_columns=self.filter_columns, | ||
# self.evidence_df: pd.DataFrame = loader.evidence_df # TODO unused | ||
|
||
self.dataset_factory = DataSetFactory( | ||
rawinput=self.rawinput, | ||
index_column=self.index_column, | ||
intensity_column=self._intensity_column, | ||
metadata_path=metadata_path, | ||
sample_column=sample_column, | ||
) | ||
|
||
self.preprocessed = False | ||
self.preprocessed: bool = False | ||
rawmat, mat, metadata, sample, preprocessing_info, preprocessed = ( | ||
self._init_dataset() | ||
) | ||
self.rawmat: pd.DataFrame = rawmat | ||
self.mat: pd.DataFrame = mat | ||
self.metadata: pd.DataFrame = metadata | ||
self.sample: str = sample | ||
self.preprocessing_info: Dict = preprocessing_info | ||
self._preprocessed: bool = preprocessed | ||
|
||
print("DataSet has been created.") | ||
|
||
def _init_dataset(self): | ||
rawmat, mat = self.dataset_factory.create_matrix_from_rawinput() | ||
|
||
metadata, sample = self.dataset_factory.create_metadata(mat) | ||
|
||
preprocessing_info = Preprocess.init_preprocessing_info( | ||
num_samples=mat.shape[0], | ||
num_protein_groups=mat.shape[1], | ||
intensity_column=self._intensity_column, | ||
filter_columns=self.filter_columns, | ||
) | ||
|
||
preprocessed = False # TODO could be moved to preprocessing_info dict | ||
|
||
return rawmat, mat, metadata, sample, preprocessing_info, preprocessed | ||
|
||
def _check_loader(self, loader): | ||
"""Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader | ||
|
||
Args: | ||
loader : loader | ||
""" | ||
if not isinstance(loader, BaseLoader): | ||
raise LoaderError( | ||
"loader must be a subclass of BaseLoader, " | ||
f"got {loader.__class__.__name__}" | ||
) | ||
|
||
if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty: | ||
raise ValueError( | ||
"Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" | ||
) | ||
|
||
if not isinstance(loader.index_column, str): | ||
raise ValueError( | ||
"Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" | ||
) | ||
|
||
def _get_preprocess(self) -> Preprocess: | ||
"""Return instance of the Preprocess object.""" | ||
return Preprocess( | ||
|
@@ -143,21 +178,18 @@ def preprocess( | |
**kwargs, | ||
) | ||
) | ||
self.preprocessed = True | ||
self._preprocessed = True | ||
|
||
def reset_preprocessing(self): | ||
"""Reset all preprocessing steps""" | ||
self._create_matrix() | ||
self.preprocessing_info = Preprocess.init_preprocessing_info( | ||
num_samples=self.mat.shape[0], | ||
num_protein_groups=self.mat.shape[1], | ||
intensity_column=self.intensity_column, | ||
filter_columns=self.filter_columns, | ||
) | ||
|
||
self.preprocessed = False | ||
# TODO fix bug: metadata is not reset/reloaded here | ||
print("All preprocessing steps are reset.") | ||
( | ||
self.rawmat, | ||
self.mat, | ||
self.metadata, | ||
self.sample, | ||
self.preprocessing_info, | ||
self._preprocessed, | ||
) = self._init_dataset() | ||
|
||
def batch_correction(self, batch: str) -> None: | ||
"""A wrapper for Preprocess.batch_correction(), see documentation there.""" | ||
|
@@ -340,7 +372,7 @@ def plot_volcano( | |
metadata=self.metadata, | ||
sample=self.sample, | ||
index_column=self.index_column, | ||
gene_names=self.gene_names, | ||
gene_names=self._gene_names, | ||
preprocessing_info=self.preprocessing_info, | ||
group1=group1, | ||
group2=group2, | ||
|
@@ -392,7 +424,7 @@ def plot_intensity( | |
mat=self.mat, | ||
metadata=self.metadata, | ||
sample=self.sample, | ||
intensity_column=self.intensity_column, | ||
intensity_column=self._intensity_column, | ||
preprocessing_info=self.preprocessing_info, | ||
protein_id=protein_id, | ||
group=group, | ||
|
@@ -477,110 +509,3 @@ def plot_dendrogram( | |
): | ||
"""A wrapper for Plot.plot_dendrogram(), see documentation there.""" | ||
return self._get_plot().plot_dendrogram(linkagefun) | ||
|
||
def _check_loader(self, loader): | ||
"""Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader | ||
|
||
Args: | ||
loader : loader | ||
""" | ||
if not isinstance(loader, BaseLoader): | ||
raise LoaderError( | ||
"loader must be a subclass of BaseLoader, " | ||
f"got {loader.__class__.__name__}" | ||
) | ||
|
||
if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty: | ||
raise ValueError( | ||
"Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" | ||
) | ||
|
||
if not isinstance(loader.index_column, str): | ||
raise ValueError( | ||
"Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" | ||
) | ||
|
||
def _check_matrix_values(self): | ||
if np.isinf(self.mat).values.sum() > 0: | ||
logging.warning("Data contains infinite values.") | ||
|
||
def _remove_misc_samples_in_metadata(self): | ||
samples_matrix = self.mat.index.to_list() | ||
samples_metadata = self.metadata[self.sample].to_list() | ||
misc_samples = list(set(samples_metadata) - set(samples_matrix)) | ||
if len(misc_samples) > 0: | ||
self.metadata = self.metadata[ | ||
~self.metadata[self.sample].isin(misc_samples) | ||
] | ||
logging.warning( | ||
f"{misc_samples} are not described in the protein data and" | ||
"are removed from the metadata." | ||
) | ||
|
||
def _create_matrix(self): | ||
""" | ||
Creates a matrix of the Outputfile, with columns displaying features (Proteins) and | ||
rows the samples. | ||
""" | ||
|
||
df = self.rawinput | ||
df = df.set_index(self.index_column) | ||
|
||
if isinstance(self.intensity_column, str): | ||
regex_find_intensity_columns = self.intensity_column.replace( | ||
"[sample]", ".*" | ||
) | ||
df = df.filter(regex=(regex_find_intensity_columns), axis=1) | ||
# remove Intensity so only sample names remain | ||
substring_to_remove = regex_find_intensity_columns.replace(".*", "") | ||
df.columns = df.columns.str.replace(substring_to_remove, "") | ||
|
||
else: | ||
df = df[self.intensity_column] | ||
|
||
# transpose dataframe | ||
mat = df.transpose() | ||
mat.replace([np.inf, -np.inf], np.nan, inplace=True) | ||
self.rawmat = mat | ||
|
||
# remove proteins with only zero # TODO this is re-done in preprocessing | ||
mat_no_zeros = mat.loc[:, (mat != 0).any(axis=0)] | ||
self.mat = mat_no_zeros.astype(float) | ||
|
||
def _load_metadata( | ||
self, file_path: Union[pd.DataFrame, str] | ||
) -> Optional[pd.DataFrame]: | ||
"""Load metadata either xlsx, txt, csv or txt file | ||
|
||
Args: | ||
file_path: path to metadata file or metadata DataFrame # TODO disentangle this | ||
""" | ||
if isinstance(file_path, pd.DataFrame): | ||
df = file_path | ||
elif file_path.endswith(".xlsx"): | ||
warnings.filterwarnings( | ||
"ignore", | ||
category=UserWarning, | ||
module="openpyxl", | ||
# message=r"/extension is not supported and will be removed/", # this somehow does not work here? | ||
) | ||
df = pd.read_excel(file_path) | ||
# find robust way to detect file format | ||
# else give file separation as variable | ||
elif file_path.endswith(".txt") or file_path.endswith(".tsv"): | ||
df = pd.read_csv(file_path, delimiter="\t") | ||
elif file_path.endswith(".csv"): | ||
df = pd.read_csv(file_path) | ||
else: | ||
logging.warning( | ||
"WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file" | ||
) | ||
return None | ||
|
||
if df is not None and self.sample not in df.columns: | ||
logging.error(f"sample_column: {self.sample} not found in {file_path}") | ||
|
||
# check whether sample labeling matches protein data | ||
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data") | ||
df.columns = df.columns.astype(str) | ||
return df |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sorry to be this person hahaha
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is indeed a good suggestion to make this api better .. in the long run, I'd implement two distinct parameters (`path` and `df`) .. did the whole change here: #336