Skip to content

Commit

Permalink
Merge pull request #232 from MannLabs/refactor_readers_I
Browse files Browse the repository at this point in the history
Refactor readers i
  • Loading branch information
mschwoer authored Nov 18, 2024
2 parents 42646da + a711aff commit 7cf2f55
Show file tree
Hide file tree
Showing 11 changed files with 282 additions and 143 deletions.
1 change: 1 addition & 0 deletions .bumpversion.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ serialize =
[bumpversion:file:./alphabase/__init__.py]

[bumpversion:file:./docs/conf.py]

search = {current_version}
replace = {new_version}
23 changes: 12 additions & 11 deletions alphabase/psm_reader/alphapept_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -79,31 +80,31 @@ def _load_file(self, filename):
with h5py.File(filename, "r") as _hdf:
dataset = _hdf[self.hdf_dataset]
df = pd.DataFrame({col: dataset[col] for col in dataset})
df["raw_name"] = os.path.basename(filename)[: -len(".ms_data.hdf")]
df[PsmDfCols.RAW_NAME] = os.path.basename(filename)[: -len(".ms_data.hdf")]
df["precursor"] = df["precursor"].str.decode("utf-8")
# df['naked_sequence'] = df['naked_sequence'].str.decode('utf-8')
if "scan_no" in df.columns:
df["scan_no"] = df["scan_no"].astype("int")
df["raw_idx"] = df["scan_no"] - 1 # if thermo, use scan-1 as spec_idx
df["charge"] = df["charge"].astype(int)
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(int)
return df

def _load_modifications(self, df: pd.DataFrame):
if len(df) == 0:
self._psm_df["sequence"] = ""
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df["decoy"] = 0
self._psm_df[PsmDfCols.SEQUENCE] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.DECOY] = 0
return

(
self._psm_df["sequence"],
self._psm_df["mods"],
self._psm_df["mod_sites"],
self._psm_df[PsmDfCols.SEQUENCE],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
_charges,
self._psm_df["decoy"],
self._psm_df[PsmDfCols.DECOY],
) = zip(*df["precursor"].apply(parse_ap))
self._psm_df.decoy = self._psm_df.decoy.astype(np.int8)
self._psm_df[PsmDfCols.DECOY] = self._psm_df[PsmDfCols.DECOY].astype(np.int8)


def register_readers():
Expand Down
13 changes: 8 additions & 5 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd

from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.maxquant_reader import MaxQuantReader
from alphabase.psm_reader.psm_reader import psm_reader_provider, psm_reader_yaml

Expand Down Expand Up @@ -127,7 +128,9 @@ def _load_file(self, filename):

def _post_process(self, origin_df: pd.DataFrame):
super()._post_process(origin_df)
self._psm_df.rename(columns={"spec_idx": "diann_spec_idx"}, inplace=True)
self._psm_df.rename(
columns={PsmDfCols.SPEC_IDX: PsmDfCols.DIANN_SPEC_INDEX}, inplace=True
)


class SpectronautReportReader(MaxQuantReader):
Expand Down Expand Up @@ -174,10 +177,10 @@ def _load_file(self, filename):
self.mod_seq_column = "ModifiedSequence"
self.csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=self.csv_sep, keep_default_na=False)
df[[self.mod_seq_column, "charge"]] = df[self.precursor_column].str.split(
".", expand=True, n=2
)
df["charge"] = df.charge.astype(np.int8)
df[[self.mod_seq_column, PsmDfCols.CHARGE]] = df[
self.precursor_column
].str.split(".", expand=True, n=2)
df[PsmDfCols.CHARGE] = df[PsmDfCols.CHARGE].astype(np.int8)
return df


Expand Down
71 changes: 71 additions & 0 deletions alphabase/psm_reader/keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
class ConstantsClass(type):
    """A metaclass for classes that should only contain string constants.

    Classes created with this metaclass are read-only once their body has
    been executed: both assigning to and deleting a class attribute raise
    ``TypeError``.
    """

    def __setattr__(cls, name, value):
        """Reject every attempt to (re)assign a class attribute.

        Raises
        ------
        TypeError
            Always.
        """
        raise TypeError("Constants class cannot be modified")

    def __delattr__(cls, name):
        """Reject every attempt to delete a class attribute.

        Without this hook ``del SomeConstants.NAME`` would silently remove a
        constant even though assignment is forbidden.

        Raises
        ------
        TypeError
            Always.
        """
        raise TypeError("Constants class cannot be modified")

    def get_values(cls):
        """Get all user-defined string values of the class.

        Only attributes defined directly on the class are considered
        (``cls.__dict__``); names starting with a double underscore and
        non-string values are skipped.

        Returns
        -------
        list of str
            The string values in definition order.
        """
        return [
            value
            for key, value in cls.__dict__.items()
            if not key.startswith("__") and isinstance(value, str)
        ]


class PsmDfCols(metaclass=ConstantsClass):
    """String constants naming the columns of a PSM dataframe.

    Each attribute value is the literal column name; use the attribute
    instead of repeating the string in reader code.
    """

    # TODO: so far these constants are only used in the psm_reader package
    #  and in the spectral_library.reader module
    MOD_SITES = "mod_sites"
    MODIFIED_SEQUENCE = "modified_sequence"
    SEQUENCE = "sequence"
    DECOY = "decoy"
    MODS = "mods"
    SCORE = "score"
    TO_REMOVE = "to_remove"
    AA_MASS_DIFFS = "aa_mass_diffs"
    AA_MASS_DIFF_SITES = "aa_mass_diff_sites"
    RT = "rt"
    RT_START = "rt_start"
    RT_STOP = "rt_stop"
    RT_NORM = "rt_norm"
    SPEC_IDX = "spec_idx"
    SCANNR = "scannr"
    FDR = "fdr"
    NAA = "nAA"
    CCS = "ccs"
    MOBILITY = "mobility"
    PEPTIDE_FDR = "peptide_fdr"
    PROTEIN_FDR = "protein_fdr"

    RAW_NAME = "raw_name"
    CHARGE = "charge"
    PROTEINS = "proteins"

    SCAN_NUM = "scan_num"
    PRECURSOR_MZ = "precursor_mz"
    DIANN_SPEC_INDEX = "diann_spec_idx"

    # Columns that appear in the output but are never referenced directly
    # (hence the leading underscore).
    _UNIPROT_IDS = "uniprot_ids"
    _GENES = "genes"
    _QUERY_ID = "query_id"

    # Column that is listed in psm_reader_yaml but never referenced directly.
    _INTENSITY = "intensity"


class LibPsmDfCols(metaclass=ConstantsClass):
    """String constants naming the columns of a Library PSM dataframe.

    Each attribute value is the literal column name.
    """

    FRAG_START_IDX = "frag_start_idx"
    FRAG_STOP_IDX = "frag_stop_idx"

    # The remaining names are not referenced by the reader classes.
    FRAGMENT_INTENSITY = "fragment_intensity"
    FRAGMENT_MZ = "fragment_mz"
    FRAGMENT_TYPE = "fragment_type"
    FRAGMENT_CHARGE = "fragment_charge"
    FRAGMENT_SERIES = "fragment_series"
    FRAGMENT_LOSS_TYPE = "fragment_loss_type"
16 changes: 10 additions & 6 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import copy
import warnings
from typing import Optional

import numba
import numpy as np
import pandas as pd

from alphabase.constants.modification import MOD_DF
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -195,7 +197,7 @@ def _init_modification_mapping(self):
psm_reader_yaml["maxquant"]["modification_mapping"]
)

def set_modification_mapping(self, modification_mapping: dict):
def set_modification_mapping(self, modification_mapping: Optional[dict] = None):
super().set_modification_mapping(modification_mapping)
self._add_all_unimod()
self._extend_mod_brackets()
Expand Down Expand Up @@ -237,8 +239,10 @@ def _extend_mod_brackets(self):
self.modification_mapping[key] = list(mod_set)

def _translate_decoy(self, origin_df=None):
if "decoy" in self._psm_df.columns:
self._psm_df.decoy = (self._psm_df.decoy == "-").astype(np.int8)
if PsmDfCols.DECOY in self._psm_df.columns:
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.DECOY] == "-"
).astype(np.int8)

def _init_column_mapping(self):
self.column_mapping = psm_reader_yaml["maxquant"]["column_mapping"]
Expand Down Expand Up @@ -278,15 +282,15 @@ def _load_modifications(self, origin_df: pd.DataFrame):
else:
mod_sep = "()"

(seqs, self._psm_df["mods"], self._psm_df["mod_sites"]) = zip(
(seqs, self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*origin_df[self.mod_seq_column].apply(
parse_mod_seq,
mod_sep=mod_sep,
fixed_C57=self.fixed_C57,
)
)
if "sequence" not in self._psm_df.columns:
self._psm_df["sequence"] = seqs
if PsmDfCols.SEQUENCE not in self._psm_df.columns:
self._psm_df[PsmDfCols.SEQUENCE] = seqs


def register_readers():
Expand Down
51 changes: 28 additions & 23 deletions alphabase/psm_reader/msfragger_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from alphabase.constants.aa import AA_ASCII_MASS
from alphabase.constants.atom import MASS_H, MASS_O
from alphabase.constants.modification import MOD_MASS
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -115,9 +116,6 @@ def __init__(
def _init_column_mapping(self):
self.column_mapping = psm_reader_yaml["msfragger_pepxml"]["column_mapping"]

def _init_modification_mapping(self):
self.modification_mapping = {}

def _translate_modifications(self):
pass

Expand All @@ -126,54 +124,61 @@ def _load_file(self, filename):
msf_df.fillna("", inplace=True)
if "ion_mobility" in msf_df.columns:
msf_df["ion_mobility"] = msf_df.ion_mobility.astype(float)
msf_df["raw_name"] = msf_df["spectrum"].str.split(".").apply(lambda x: x[0])
msf_df["to_remove"] = 0
self.column_mapping["to_remove"] = "to_remove"
msf_df[PsmDfCols.RAW_NAME] = (
msf_df["spectrum"].str.split(".").apply(lambda x: x[0])
)
msf_df["to_remove"] = 0 # TODO revisit
self.column_mapping[PsmDfCols.TO_REMOVE] = "to_remove"
return msf_df

def _translate_decoy(self, origin_df=None):
self._psm_df["decoy"] = self._psm_df.proteins.apply(_is_fragger_decoy).astype(
np.int8
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.PROTEINS].apply(_is_fragger_decoy).astype(np.int8)
)

self._psm_df.proteins = self._psm_df.proteins.apply(lambda x: ";".join(x))
self._psm_df[PsmDfCols.PROTEINS] = self._psm_df[PsmDfCols.PROTEINS].apply(
lambda x: ";".join(x)
)
if not self._keep_decoy:
self._psm_df["to_remove"] += self._psm_df.decoy > 0
self._psm_df[PsmDfCols.TO_REMOVE] += self._psm_df[PsmDfCols.DECOY] > 0

def _translate_score(self, origin_df=None):
# evalue score
self._psm_df["score"] = -np.log(self._psm_df["score"] + 1e-100)
self._psm_df[PsmDfCols.SCORE] = -np.log(self._psm_df[PsmDfCols.SCORE] + 1e-100)

def _load_modifications(self, msf_df):
if len(msf_df) == 0:
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df["aa_mass_diffs"] = ""
self._psm_df["aa_mass_diff_sites"] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFFS] = ""
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES] = ""
return

(
self._psm_df["mods"],
self._psm_df["mod_sites"],
self._psm_df["aa_mass_diffs"],
self._psm_df["aa_mass_diff_sites"],
self._psm_df[PsmDfCols.MODS],
self._psm_df[PsmDfCols.MOD_SITES],
self._psm_df[PsmDfCols.AA_MASS_DIFFS],
self._psm_df[PsmDfCols.AA_MASS_DIFF_SITES],
) = zip(
*msf_df[["peptide", "modifications"]].apply(
lambda x: _get_mods_from_masses(*x), axis=1
)
)

if not self.keep_unknown_aa_mass_diffs:
self._psm_df["to_remove"] += self._psm_df.aa_mass_diffs != ""
self._psm_df[PsmDfCols.TO_REMOVE] += (
self._psm_df[PsmDfCols.AA_MASS_DIFFS] != ""
)
self._psm_df.drop(
columns=["aa_mass_diffs", "aa_mass_diff_sites"], inplace=True
columns=[PsmDfCols.AA_MASS_DIFFS, PsmDfCols.AA_MASS_DIFF_SITES],
inplace=True,
)

def _post_process(self, origin_df: pd.DataFrame):
super()._post_process(origin_df)
self._psm_df = (
self._psm_df.query("to_remove==0")
.drop(columns="to_remove")
self._psm_df.query(f"{PsmDfCols.TO_REMOVE}==0")
.drop(columns=PsmDfCols.TO_REMOVE)
.reset_index(drop=True)
)

Expand Down
24 changes: 14 additions & 10 deletions alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd

import alphabase.constants.modification as ap_mod
from alphabase.psm_reader.keys import PsmDfCols
from alphabase.psm_reader.psm_reader import (
PSMReaderBase,
psm_reader_provider,
Expand Down Expand Up @@ -104,9 +105,6 @@ def __init__(
def _init_column_mapping(self):
self.column_mapping = psm_reader_yaml["pfind"]["column_mapping"]

def _init_modification_mapping(self):
self.modification_mapping = {}

def _translate_modifications(self):
pass

Expand All @@ -116,29 +114,35 @@ def _load_file(self, filename):
)
pfind_df.fillna("", inplace=True)
pfind_df = pfind_df[pfind_df.Sequence != ""]
pfind_df["raw_name"] = (
pfind_df[PsmDfCols.RAW_NAME] = (
pfind_df["File_Name"].str.split(".").apply(lambda x: x[0])
)
pfind_df["Proteins"] = pfind_df["Proteins"].apply(parse_pfind_protein)
return pfind_df

def _translate_decoy(self, origin_df=None):
self._psm_df.decoy = (self._psm_df.decoy == "decoy").astype(np.int8)
self._psm_df[PsmDfCols.DECOY] = (
self._psm_df[PsmDfCols.DECOY] == "decoy"
).astype(np.int8)

def _translate_score(self, origin_df=None):
self._psm_df.score = -np.log(self._psm_df.score.astype(float) + 1e-100)
self._psm_df[PsmDfCols.SCORE] = -np.log(
self._psm_df[PsmDfCols.SCORE].astype(float) + 1e-100
)

def _load_modifications(self, pfind_df):
if len(pfind_df) == 0:
self._psm_df["mods"] = ""
self._psm_df["mod_sites"] = ""
self._psm_df[PsmDfCols.MODS] = ""
self._psm_df[PsmDfCols.MOD_SITES] = ""
return

(self._psm_df["mods"], self._psm_df["mod_sites"]) = zip(
(self._psm_df[PsmDfCols.MODS], self._psm_df[PsmDfCols.MOD_SITES]) = zip(
*pfind_df["Modification"].apply(get_pFind_mods)
)

self._psm_df["mods"] = self._psm_df["mods"].apply(translate_pFind_mod)
self._psm_df[PsmDfCols.MODS] = self._psm_df[PsmDfCols.MODS].apply(
translate_pFind_mod
)


def register_readers():
Expand Down
Loading

0 comments on commit 7cf2f55

Please sign in to comment.