Skip to content

Commit

Permalink
generic loader
Browse files Browse the repository at this point in the history
  • Loading branch information
elena-krismer committed May 19, 2023
1 parent c059aa6 commit 428f4fb
Show file tree
Hide file tree
Showing 9 changed files with 148 additions and 30 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* ENH download metadata template in the GUI
* ENH multicova analysis
* ENH filter data completeness `dataset.preprocess(data_completeness=0.7)`
* ADD `GenericLoader` for unsupported data formats

# 0.5.4
* FIX altair version - binning of streamlit version
Expand Down
7 changes: 7 additions & 0 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from alphastats.loader.FragPipeLoader import FragPipeLoader
from alphastats.loader.MaxQuantLoader import MaxQuantLoader
from alphastats.loader.SpectronautLoader import SpectronautLoader
from alphastats.loader.GenericLoader import GenericLoader

from alphastats.DataSet_Plot import Plot
from alphastats.DataSet_Preprocess import Preprocess
Expand Down Expand Up @@ -69,13 +70,18 @@ def __init__(self, loader, metadata_path=None, sample_column=None):
self.create_matrix()
self._check_matrix_values()
self.metadata = None

if metadata_path is not None:
self.sample = sample_column
self.load_metadata(file_path=metadata_path)
self._remove_misc_samples_in_metadata()

else:
self._create_metadata()

if self.loader == "Generic":
intensity_column = loader._extract_sample_names(metadata=self.metadata, sample_column=self.sample)
self.intensity_column = intensity_column

# save preprocessing settings
self.preprocessing_info = self._save_dataset_info()
Expand Down Expand Up @@ -103,6 +109,7 @@ def _check_loader(self, loader):
DIANNLoader,
FragPipeLoader,
SpectronautLoader,
GenericLoader
),
):
raise LoaderError(
Expand Down
1 change: 1 addition & 0 deletions alphastats/DataSet_Statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ def ancova(self, protein_id:str, covar: Union[str, list], between:str) -> pd.Dat
ancova_df = pingouin.ancova(df, dv=protein_id, covar=covar, between=between)
return ancova_df

@ignore_warning(RuntimeWarning)
def multicova_analysis(
self,
covariates: list,
Expand Down
52 changes: 38 additions & 14 deletions alphastats/gui/pages/02_Import Data.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,17 @@ def check_software_file(df, software):


def print_software_import_info(software):
    """Return the upload prompt shown to the user for the chosen software.

    Args:
        software (str): name of the selected software, or "Other" for a
            generic proteomics table.

    Returns:
        str: the message asking the user to upload the matching file.
    """
    # "Other" is handled first: the lookup below is only valid for software
    # that has an entry in `software_options`.
    if software == "Other":
        return "Please upload your proteomics file."

    import_file = software_options.get(software).get("import_file")
    return f"Please upload {import_file} file from {software}."


def select_columns_for_loaders(software):
def select_columns_for_loaders(software, software_df:None):
"""
select intensity and index column depending on software
will be saved in session state
Expand All @@ -98,19 +103,37 @@ def select_columns_for_loaders(software):
st.markdown("### 2. Select columns used for further analysis.")
st.markdown("Select intensity columns for further analysis")

st.selectbox(
"Intensity Column",
options=software_options.get(software).get("intensity_column"),
key="intensity_column",
)
if software != "Other":

st.selectbox(
"Intensity Column",
options=software_options.get(software).get("intensity_column"),
key="intensity_column",
)

st.markdown("Select index column (with ProteinGroups) for further analysis")
st.markdown("Select index column (with ProteinGroups) for further analysis")

st.selectbox(
"Index Column",
options=software_options.get(software).get("index_column"),

key="index_column",
)
else:
st.selectbox(
"Intensity Columns",
options=software_df.columns.to_list(),
key="intensity_column",
)

st.markdown("Select index column (with ProteinGroups) for further analysis")

st.selectbox(
"Index Column",
options=software_df.columns.to_list(),
key="index_column",
)

st.selectbox(
"Index Column",
options=software_options.get(software).get("index_column"),
key="index_column",
)


def load_proteomics_data(uploaded_file, intensity_column, index_column, software):
Expand Down Expand Up @@ -296,6 +319,7 @@ def import_data():
"DIANN",
"Fragpipe",
"Spectronaut",
"Other",
],

)
Expand Down
8 changes: 4 additions & 4 deletions alphastats/loader/BaseLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@
import numpy as np
from alphastats.utils import find_duplicates_in_list
import pkg_resources

from typing import Union

class BaseLoader:
"""Parent class of Loaders"""

def __init__(self, file, intensity_column, index_column, sep):
"""BaseLoader for AlphaPept, MaxQuant, Fragpipe and DIANNLoader
def __init__(self, file:Union[str, pd.DataFrame], intensity_column:str, index_column:str, sep:str):
"""BaseLoader for AlphaPept, MaxQuant, Fragpipe, Spectronau and DIANNLoader
Args:
file_path (str): path to file
sep (str, optional): file separation. Defaults to "\t".
"""

# self._check_if_file_exists(file=file)

if isinstance(file, pd.DataFrame):
self.rawinput = file
else:
Expand Down
13 changes: 7 additions & 6 deletions alphastats/loader/FragPipeLoader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from alphastats.loader.BaseLoader import BaseLoader
import pandas as pd
from typing import Union

# Philosopher
# class name needs to be discussed whether MSFragger/Fragpipe/Philospher
Expand All @@ -10,12 +11,12 @@ class FragPipeLoader(BaseLoader):

def __init__(
self,
file,
intensity_column="[sample] MaxLFQ Intensity ",
index_column="Protein",
gene_names_column="Gene Names",
confidence_column="Protein Probability",
sep="\t",
file:Union[str, pd.DataFrame],
intensity_column:str="[sample] MaxLFQ Intensity ",
index_column:str="Protein",
gene_names_column:str="Gene Names",
confidence_column:str="Protein Probability",
sep:str="\t",
**kwargs
):

Expand Down
44 changes: 44 additions & 0 deletions alphastats/loader/GenericLoader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@

from alphastats.loader.BaseLoader import BaseLoader

import pandas as pd
from typing import Union

class GenericLoader(BaseLoader):
    """Loader for proteomics tables that do not come from a supported software."""

    def __init__(
        self,
        file: Union[str, pd.DataFrame],
        intensity_column: list,
        index_column: str,
        sep: str = "\t",
    ):
        """Generic Loader for your proteomics data

        Args:
            file (Union[str, pd.DataFrame]): path to your proteomics file or pandas.DataFrame
            intensity_column (list): list of samples with intensity
            index_column (str): column with Protein IDs or Gene names, used for indexing
            sep (str, optional): file separation. Defaults to a tab character,
                matching the other loaders.
        """
        if isinstance(file, pd.DataFrame):
            self.rawinput = file
        else:
            self.rawinput = pd.read_csv(file, sep=sep, low_memory=False)

        # Placeholder pattern; the real pattern is derived from the metadata
        # later via `_extract_sample_names`.
        self.intensity_column = "[sample]"
        self.intensity_column_list = intensity_column
        self.index_column = index_column
        self.filter_columns = []
        self.confidence_column = None
        self.software = "Generic"
        self.evidence_df = None
        self.gene_names = None
        self.ptm_df = None

        # NOTE(review): these helpers are not defined in this class; presumably
        # they are inherited from BaseLoader - confirm.
        self._add_contamination_column()
        self._check_if_columns_are_present()
        self._read_all_columns_as_string()

    def _extract_sample_names(self, metadata: pd.DataFrame, sample_column: str):
        """Derive the intensity-column pattern from the sample names in the metadata.

        The first intensity column that contains one of the sample names is
        taken, and the sample name in it is replaced by "[sample]".

        Args:
            metadata (pd.DataFrame): metadata table containing `sample_column`.
            sample_column (str): metadata column holding the sample names.

        Returns:
            str: intensity-column pattern, e.g. "[sample] Razor Intensity".

        Raises:
            ValueError: if no sample name occurs in any intensity column.
        """
        sample_names = metadata[sample_column].to_list()

        # Longest names first, so that "S1" is not matched inside "S10 ...".
        # Non-string entries (e.g. NaN) and empty strings cannot identify a column.
        candidates = sorted(
            (name for name in sample_names if isinstance(name, str) and name),
            key=len,
            reverse=True,
        )

        for intensity_column in self.intensity_column_list:
            for sample in candidates:
                if sample in intensity_column:
                    sample_structure = intensity_column.replace(sample, "[sample]")
                    self.intensity_column = sample_structure
                    return sample_structure

        raise ValueError(
            f"None of the sample names in metadata column '{sample_column}' "
            "was found in the given intensity columns."
        )

12 changes: 6 additions & 6 deletions alphastats/loader/MaxQuantLoader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ class MaxQuantLoader(BaseLoader):
def __init__(
self,
file,
intensity_column="LFQ intensity [sample]",
index_column="Protein IDs",
gene_names_column="Gene names",
filter_columns=["Only identified by site", "Reverse", "Potential contaminant"],
confidence_column="Q-value",
intensity_column:str="LFQ intensity [sample]",
index_column:str="Protein IDs",
gene_names_column:str="Gene names",
filter_columns:list=["Only identified by site", "Reverse", "Potential contaminant"],
confidence_column:str="Q-value",
evidence_file=None,
sep="\t",
sep:str="\t",
**kwargs
):
"""Loader MaxQuant output
Expand Down
40 changes: 40 additions & 0 deletions tests/test_DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from alphastats.loader.AlphaPeptLoader import AlphaPeptLoader
from alphastats.loader.FragPipeLoader import FragPipeLoader
from alphastats.loader.SpectronautLoader import SpectronautLoader
from alphastats.loader.GenericLoader import GenericLoader
from alphastats.DataSet import DataSet

from alphastats.DataSet_Statistics import Statistics
Expand Down Expand Up @@ -899,6 +900,45 @@ def tearDownClass(cls):
shutil.rmtree("testfiles/spectronaut/__MACOSX")

os.remove("testfiles/spectronaut/results.tsv")

class TestGenericDataSet(BaseTestDataSet.BaseTest):
    """Run the shared DataSet tests on a GenericLoader fed with FragPipe output."""

    @classmethod
    def setUpClass(cls):
        # Unpack the FragPipe fixture only if it has not been extracted yet.
        if not os.path.isfile("testfiles/fragpipe/combined_proteins.tsv"):
            shutil.unpack_archive(
                "testfiles/fragpipe/combined_proteins.tsv.zip", "testfiles/fragpipe"
            )

        cls.cls_loader = GenericLoader(
            file="testfiles/fragpipe/combined_proteins.tsv",
            intensity_column=[
                "S1 Razor Intensity", "S2 Razor Intensity", "S3 Razor Intensity",
                "S4 Razor Intensity", "S5 Razor Intensity", "S6 Razor Intensity",
                "S7 Razor Intensity", "S8 Razor Intensity"
            ],
            index_column="Protein",
            # GenericLoader declares `sep` without a default, so it must be
            # passed explicitly; the fixture is tab-separated.
            sep="\t",
        )
        cls.cls_metadata_path = "testfiles/fragpipe/metadata.xlsx"
        cls.cls_obj = DataSet(
            loader=cls.cls_loader,
            metadata_path=cls.cls_metadata_path,
            sample_column="analytical_sample external_id",
        )

    def setUp(self):
        # Work on copies so that single tests cannot alter the class fixtures.
        self.loader = copy.deepcopy(self.cls_loader)
        self.metadata_path = copy.deepcopy(self.cls_metadata_path)
        self.obj = copy.deepcopy(self.cls_obj)
        # NOTE(review): presumably read by the inherited BaseTest cases - confirm.
        self.matrix_dim = (8, 6)
        self.matrix_dim_filtered = (8, 6)
        self.comparison_column = "grouping1"

    @classmethod
    def tearDownClass(cls):
        if os.path.isdir("testfiles/fragpipe/__MACOSX"):
            shutil.rmtree("testfiles/fragpipe/__MACOSX")

        # NOTE(review): setUpClass extracts combined_proteins.tsv, not
        # results.tsv - confirm which file is meant. The removal is guarded so
        # that a missing file does not fail the whole test class.
        if os.path.isfile("testfiles/fragpipe/results.tsv"):
            os.remove("testfiles/fragpipe/results.tsv")


if __name__ == "__main__":
Expand Down

0 comments on commit 428f4fb

Please sign in to comment.