From bf480fffc9e44e15a493202f5e50bf9940c4dc4b Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 16 Jan 2025 13:19:58 +0100 Subject: [PATCH 01/24] add logreg and two step classifer --- alphadia/fdrexperimental.py | 483 ++++++++++++++++++++++++++++++++++++ 1 file changed, 483 insertions(+) diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index c35459a9..e6f72d59 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -7,12 +7,135 @@ # alpha family imports # third party imports import numpy as np +import pandas as pd +import sklearn import torch from sklearn import model_selection +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler from torch import nn, optim from torchmetrics.classification import BinaryAUROC from tqdm import tqdm +# from alphadia.fdr import get_q_values, keep_best + + + +def fdr_to_q_values(fdr_values: np.ndarray): + """Converts FDR values to q-values. + Takes a ascending sorted array of FDR values and converts them to q-values. + for every element the lowest FDR where it would be accepted is used as q-value. + + Parameters + ---------- + fdr_values : np.ndarray + The FDR values to convert. + + Returns + ------- + np.ndarray + The q-values. + """ + fdr_values_flipped = np.flip(fdr_values) + q_values_flipped = np.minimum.accumulate(fdr_values_flipped) + q_vals = np.flip(q_values_flipped) + return q_vals + + +def q_values( + scores: np.ndarray, + decoy_labels: np.ndarray, + # score_column : str = 'proba', + # decoy_column : str = '_decoy', + # qval_column : str = 'qval' +): + """Calculates q-values for a dataframe containing PSMs. + + Parameters + ---------- + + _df : pd.DataFrame + The dataframe containing the PSMs. + + score_column : str, default='proba' + The name of the column containing the score to use for the selection. + Ascending sorted values are expected. + + decoy_column : str, default='_decoy' + The name of the column containing the decoy information. + Decoys are expected to be 1 and targets 0. + + qval_column : str, default='qval' + The name of the column to store the q-values in. + + Returns + ------- + + pd.DataFrame + The dataframe containing the q-values in column qval. + + """ + + decoy_labels = decoy_labels[scores.argsort()] + target_values = 1 - decoy_labels + decoy_cumsum = np.cumsum(decoy_labels) + target_cumsum = np.cumsum(target_values) + fdr_values = decoy_cumsum / target_cumsum + return fdr_to_q_values(fdr_values) + + +def get_q_values( + _df: pd.DataFrame, + score_column: str = "proba", + decoy_column: str = "_decoy", + qval_column: str = "qval", +): + """Calculates q-values for a dataframe containing PSMs. + + Parameters + ---------- + + _df : pd.DataFrame + The dataframe containing the PSMs. + + score_column : str, default='proba' + The name of the column containing the score to use for the selection. + Ascending sorted values are expected. + + decoy_column : str, default='_decoy' + The name of the column containing the decoy information. + Decoys are expected to be 1 and targets 0. + + qval_column : str, default='qval' + The name of the column to store the q-values in. + + Returns + ------- + + pd.DataFrame + The dataframe containing the q-values in column qval. 
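+
+    Examples
+    --------
+    Illustrative toy frame (hypothetical values; the returned frame is in
+    ascending score order):
+
+    >>> df = pd.DataFrame({"proba": [0.1, 0.2, 0.3, 0.4],
+    ...                    "_decoy": [0, 0, 1, 0]})
+    >>> get_q_values(df)["qval"].round(3).tolist()
+    [0.0, 0.0, 0.333, 0.333]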
+ + """ + _df = _df.sort_values([score_column, score_column], ascending=True) + target_values = 1 - _df[decoy_column].values + decoy_cumsum = np.cumsum(_df[decoy_column].values) + target_cumsum = np.cumsum(target_values) + fdr_values = decoy_cumsum / target_cumsum + _df[qval_column] = fdr_to_q_values(fdr_values) + return _df + + + + +def apply_absolute_transformations(df: pd.DataFrame) -> pd.DataFrame: + df_transformed = df.copy() + df_transformed["delta_rt"] = np.abs(df_transformed["delta_rt"]) + df_transformed["top_3_ms2_mass_error"] = np.abs(df_transformed["top_3_ms2_mass_error"]) + df_transformed["mean_ms2_mass_error"] = np.abs(df_transformed["mean_ms2_mass_error"]) + + return df_transformed + + class Classifier(ABC): """Abstract base class for classifiers. @@ -107,6 +230,365 @@ def from_state_dict(self, state_dict: dict): """ +class TwoStepClassifier(Classifier): + def __init__( + self, + first_classifier: Classifier, + second_classifier: Classifier, + train_on_top_n: int = 1, + first_fdr: float = 0.6, + second_fdr: float = 0.01, + **kwargs, + ): + super().__init__() + self.first_classifier = first_classifier + self.second_classifier = second_classifier + self.first_fdr = first_fdr + self.second_fdr = second_fdr + + self.train_on_top_n = train_on_top_n + + def fit_predict( + self, + df_: pd.DataFrame, + x_cols: list[str], + y_col: str, + group_columns: list[str] | None = None + ) -> pd.DataFrame: + """ + Return dataframe including only the found precursors. + """ + df = df_.copy() + df = apply_absolute_transformations(df) + + if self.first_classifier.fitted: + df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1] + df_subset = get_entries_below_fdr(df, self.first_fdr, group_columns, remove_decoys=False) + print(f"After q-val extraction, after LinClassifier (first_fdr={self.first_fdr}): {df_subset.shape}") + + self.second_classifier.batch_size = 500 + self.second_classifier.epochs = 50 + + self.second_classifier.fit(df_subset[x_cols].to_numpy(), df_subset[y_col].to_numpy()) + df_subset["proba"] = self.second_classifier.predict_proba(df_subset[x_cols].to_numpy())[:, 1] + + else: + df_train = df[df["rank"] < self.train_on_top_n] + self.second_classifier.fit( + df_train[x_cols].to_numpy(), + df_train[y_col].to_numpy(), + ) + + df_subset = df + x_subset = df_subset[x_cols].to_numpy() + df_subset["proba"] = self.second_classifier.predict_proba(x_subset)[:, 1] + + df_subset = get_entries_below_fdr(df_subset, self.second_fdr, group_columns) # , remove_decoys=True) + print(f"After q-val extraction, after NN (second_fdr={self.second_fdr}): {df_subset.shape}") + + df_subset_2 = get_target_decoy_partners(df_subset, df_) + df_targets = df_subset_2[df_subset_2["decoy"] == 0] + + self._update_classifier( + self.first_classifier, + df_subset_2, + x_cols, + y_col, + self.first_fdr, + group_columns, + ) + + return df_targets + + @classmethod + def _update_classifier(cls, classifier, df_, x_cols, y_col, fdr, group_columns) -> None: + + X = df_[x_cols] + y = df_[y_col] + df = df_.copy() + if hasattr(classifier, "fitted") and classifier.fitted: + df["proba"] = classifier.predict_proba(df[x_cols].to_numpy())[:, 1] + df_targets = get_entries_below_fdr(df, fdr, group_columns) + previous_n_precursors = len(df_targets) + saved_state_dict = classifier.to_state_dict() + else: + previous_n_precursors = -1 + + classifier.fit(X, y) + classifier._fitted = True + + df["proba"] = classifier.predict_proba(df[x_cols].to_numpy())[:, 1] + df_targets = get_entries_below_fdr(df, fdr, group_columns) 
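+        # Note: `saved_state_dict` is only bound when the classifier was
+        # already fitted; the `previous_n_precursors = -1` else-branch keeps
+        # the revert below unreachable on a first fit, so this is safe.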
+ + current_n_precursors = len(df_targets) + if previous_n_precursors > current_n_precursors: + print("Reverting linear classifier to the previous version, as the new one performed worse.") + classifier.from_state_dict(saved_state_dict) + + + @property + def fitted(self) -> bool: + """Return whether both classifiers have been fitted.""" + return self.second_classifier.fitted + + def fit(self, x: np.ndarray, y: np.ndarray) -> None: + """Fit both classifiers sequentially. + + Parameters + ---------- + x : np.ndarray + Training data + y : np.ndarray + Target values + """ + # First classifier fit + # self.first_classifier.fit(x, y) + + # # Get predictions from first classifier + # probs = self.first_classifier.predict_proba(x)[:, 1] + + # # Filter data based on first FDR threshold + # mask = probs >= (1 - self.first_fdr) + # x_filtered = x[mask] + # y_filtered = y[mask] + + # # Fit second classifier on filtered data + # self.second_classifier.fit(x_filtered, y_filtered) + pass + + def predict(self, x: np.ndarray) -> np.ndarray: + """Predict class labels using both classifiers. + + Parameters + ---------- + x : np.ndarray + Input data + + Returns + ------- + np.ndarray + Predicted class labels + """ + return np.argmax(self.predict_proba(x), axis=1) + + def predict_proba(self, x: np.ndarray) -> np.ndarray: + """Predict class probabilities using both classifiers. + + Parameters + ---------- + x : np.ndarray + Input data + + Returns + ------- + np.ndarray + Predicted class probabilities + """ + pass + + + def to_state_dict(self) -> dict: + """Save classifier state. + + Returns + ------- + dict + State dictionary containing both classifiers + """ + return { + "first_classifier": self.first_classifier.to_state_dict(), + "second_classifier": self.second_classifier.to_state_dict(), + "first_fdr": self.first_fdr, + "second_fdr": self.second_fdr, + "train_on_top_n": self.train_on_top_n + } + + def from_state_dict(self, state_dict: dict) -> None: + """Load classifier state. + + Parameters + ---------- + state_dict : dict + State dictionary containing both classifiers + """ + self.first_classifier.from_state_dict(state_dict["first_classifier"]) + self.second_classifier.from_state_dict(state_dict["second_classifier"]) + self.first_fdr = state_dict["first_fdr"] + self.second_fdr = state_dict["second_fdr"] + self.train_on_top_n = state_dict["train_on_top_n"] + +def get_entries_below_fdr(df, fdr, group_columns, remove_decoys: bool = True): + df.sort_values("proba", ascending=True, inplace=True) + df = keep_best(df, group_columns=group_columns) + df = get_q_values(df, "proba", "decoy") + + df_subset = df[df["qval"] < fdr] + if remove_decoys: + df_subset = df_subset[df_subset["decoy"] == 0] + return df_subset + + +def get_target_decoy_partners(df_sub, df_all): + group_by = ["rank", "elution_group_idx"] + valid_tuples = df_sub[group_by] + matching_rows = df_all.merge(valid_tuples, on=group_by, how="inner") + + return matching_rows + + +def keep_best( + df: pd.DataFrame, + score_column: str = "proba", + group_columns: list[str] | None = None, +): + """Keep the best PSM for each group of PSMs with the same precursor_idx. + This function is used to select the best candidate PSM for each precursor. + if the group_columns is set to ['channel', 'elution_group_idx'] then its used for target decoy competition. + + Parameters + ---------- + + df : pd.DataFrame + The dataframe containing the PSMs. + + score_column : str + The name of the column containing the score to use for the selection. 
+ + group_columns : list[str], default=['channel', 'precursor_idx'] + The columns to use for the grouping. + + Returns + ------- + + pd.DataFrame + The dataframe containing the best PSM for each group. + """ + if group_columns is None: + group_columns = ["channel", "precursor_idx"] + temp_df = df.reset_index(drop=True) + temp_df = temp_df.sort_values(score_column, ascending=True) + temp_df = temp_df.groupby(group_columns).head(1) + temp_df = temp_df.sort_index().reset_index(drop=True) + return temp_df + +class LogisticRegressionClassifier(Classifier): + def __init__(self) -> None: + """Linear classifier using a logistic regression model.""" + self.scaler = StandardScaler() + self.model = LogisticRegression() + self._fitted = False + + @property + def fitted(self) -> bool: + return self._fitted + + def fit(self, x: np.ndarray, y: np.ndarray) -> None: + """Fit the classifier to the data. + + Parameters + ---------- + + x : np.array, dtype=float + Training data of shape (n_samples, n_features). + + y : np.array, dtype=int + Target values of shape (n_samples,) or (n_samples, n_classes). + + """ + x_scaled = self.scaler.fit_transform(x) + self.model.fit(x_scaled, y) + self._fitted = True + + def predict(self, x: np.ndarray) -> np.ndarray: + """Predict the class probabilities of the data. + + Parameters + ---------- + + x : np.array, dtype=float + Data of shape (n_samples, n_features). + + Returns + ------- + + y : np.array, dtype=float + Predicted class probabilities of shape (n_samples, n_classes). + + """ + x_scaled = self.scaler.transform(x) + return self.model.predict(x_scaled) + + def predict_proba(self, x: np.ndarray) -> np.ndarray: + """Predict the class probabilities of the data. + + Parameters + ---------- + + x : np.array, dtype=float + Data of shape (n_samples, n_features). + + Returns + ------- + + y : np.array, dtype=float + Predicted class probabilities of shape (n_samples, n_classes). + + """ + x_scaled = self.scaler.transform(x) + return self.model.predict_proba(x_scaled) + + def to_state_dict(self) -> dict: + """Save the state of the classifier as a dictionary. + + Returns + ------- + + dict : dict + Dictionary containing the state of the classifier. + + """ + state_dict = {"_fitted": self._fitted} + + if self._fitted: + state_dict.update({ + 'scaler_mean': self.scaler.mean_, + 'scaler_var': self.scaler.var_, + 'scaler_scale': self.scaler.scale_, + 'scaler_n_samples_seen': self.scaler.n_samples_seen_, + 'model_coef': self.model.coef_, + 'model_intercept': self.model.intercept_, + 'model_classes': self.model.classes_, + 'is_fitted': self._fitted + }) + + return state_dict + + def from_state_dict(self, state_dict: dict): + """Load the state of the classifier from a dictionary. + + Parameters + ---------- + + dict : dict + Dictionary containing the state of the classifier. 
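+
+        Examples
+        --------
+        Illustrative round-trip on toy data (hypothetical usage, not from
+        the test suite):
+
+        >>> clf = LogisticRegressionClassifier()
+        >>> clf.fit(np.array([[0.0], [1.0], [0.0], [1.0]]), np.array([0, 1, 0, 1]))
+        >>> clone = LogisticRegressionClassifier()
+        >>> clone.from_state_dict(clf.to_state_dict())
+        >>> bool(clone.fitted)
+        True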
+ + """ + self._fitted = state_dict["_fitted"] + + if self.fitted: + self.scaler = StandardScaler() + self.scaler.mean_ = np.array(state_dict['scaler_mean']) + self.scaler.var_ = np.array(state_dict['scaler_var']) + self.scaler.scale_ = np.array(state_dict['scaler_scale']) + self.scaler.n_samples_seen_ = np.array(state_dict['scaler_n_samples_seen']) + + self.model = LogisticRegression() + self.model.coef_ = np.array(state_dict['model_coef']) + self.model.intercept_ = np.array(state_dict['model_intercept']) + self.model.classes_ = np.array(state_dict['model_classes']) + + + class BinaryClassifier(Classifier): def __init__( self, @@ -1230,6 +1712,7 @@ def predict_proba(self, x: np.ndarray): return self.network(torch.Tensor(x)).detach().numpy() + class FeedForwardNN(nn.Module): def __init__( self, From e6d3e3b548b1a45cac8b7ba179603da2523e25ca Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 16 Jan 2025 14:24:17 +0100 Subject: [PATCH 02/24] add config param to enable 2-step-classifier --- alphadia/constants/default.yaml | 1 + alphadia/fdr.py | 19 +++ alphadia/fdrexperimental.py | 180 +++++++++++++++------------- alphadia/workflow/manager.py | 44 +++++-- alphadia/workflow/peptidecentric.py | 26 +++- 5 files changed, 172 insertions(+), 98 deletions(-) diff --git a/alphadia/constants/default.yaml b/alphadia/constants/default.yaml index 3a174b5a..0dbbca9c 100644 --- a/alphadia/constants/default.yaml +++ b/alphadia/constants/default.yaml @@ -225,6 +225,7 @@ fdr: keep_decoys: false channel_wise_fdr: false inference_strategy: "heuristic" + enable_two_step_classifier: false search_output: peptide_level_lfq: false diff --git a/alphadia/fdr.py b/alphadia/fdr.py index 5c9911cb..1c3ccf6d 100644 --- a/alphadia/fdr.py +++ b/alphadia/fdr.py @@ -10,6 +10,8 @@ import pandas as pd import sklearn +import alphadia.fdrexperimental as fdrx + # alphadia imports # alpha family imports from alphadia import fragcomp @@ -168,6 +170,23 @@ def perform_fdr( return psm_df +def perform_fdr_new( + classifier: fdrx.Classifier, + available_columns: list[str], + df: pd.DataFrame, + group_columns, + **kwargs, +): + df.dropna(subset=available_columns, inplace=True) + psm_df = classifier.fit_predict( + df, + available_columns + ["score"], + "decoy", + group_columns, + ) + return psm_df + + def keep_best( df: pd.DataFrame, score_column: str = "proba", diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index e6f72d59..4e7304f8 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -8,7 +8,6 @@ # third party imports import numpy as np import pandas as pd -import sklearn import torch from sklearn import model_selection from sklearn.linear_model import LogisticRegression @@ -17,9 +16,6 @@ from torchmetrics.classification import BinaryAUROC from tqdm import tqdm -# from alphadia.fdr import get_q_values, keep_best - - def fdr_to_q_values(fdr_values: np.ndarray): """Converts FDR values to q-values. 
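Aside, since the following hunks only reformat this logic: the q-value machinery computes FDR as cumulative decoys over cumulative targets along ascending scores, then monotonizes it with a reversed cumulative minimum. A standalone sketch with toy numbers (not part of the patch):

    import numpy as np

    # candidates already sorted by score, lower = more target-like
    scores = np.array([0.05, 0.10, 0.20, 0.30])
    decoys = np.array([0, 1, 0, 0])  # 1 = decoy, 0 = target

    fdr = np.cumsum(decoys) / np.cumsum(1 - decoys)       # [0, 1, 0.5, 1/3]
    qvals = np.flip(np.minimum.accumulate(np.flip(fdr)))  # [0, 1/3, 1/3, 1/3]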
@@ -125,18 +121,19 @@ def get_q_values( return _df - - def apply_absolute_transformations(df: pd.DataFrame) -> pd.DataFrame: df_transformed = df.copy() df_transformed["delta_rt"] = np.abs(df_transformed["delta_rt"]) - df_transformed["top_3_ms2_mass_error"] = np.abs(df_transformed["top_3_ms2_mass_error"]) - df_transformed["mean_ms2_mass_error"] = np.abs(df_transformed["mean_ms2_mass_error"]) + df_transformed["top_3_ms2_mass_error"] = np.abs( + df_transformed["top_3_ms2_mass_error"] + ) + df_transformed["mean_ms2_mass_error"] = np.abs( + df_transformed["mean_ms2_mass_error"] + ) return df_transformed - class Classifier(ABC): """Abstract base class for classifiers. @@ -245,50 +242,64 @@ def __init__( self.second_classifier = second_classifier self.first_fdr = first_fdr self.second_fdr = second_fdr - + self.train_on_top_n = train_on_top_n - + def fit_predict( - self, - df_: pd.DataFrame, - x_cols: list[str], - y_col: str, - group_columns: list[str] | None = None + self, + df_: pd.DataFrame, + x_cols: list[str], + y_col: str, + group_columns: list[str] | None = None, ) -> pd.DataFrame: """ Return dataframe including only the found precursors. """ df = df_.copy() df = apply_absolute_transformations(df) - + if self.first_classifier.fitted: - df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1] - df_subset = get_entries_below_fdr(df, self.first_fdr, group_columns, remove_decoys=False) - print(f"After q-val extraction, after LinClassifier (first_fdr={self.first_fdr}): {df_subset.shape}") - + df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[ + :, 1 + ] + df_subset = get_entries_below_fdr( + df, self.first_fdr, group_columns, remove_decoys=False + ) + print( + f"After q-val extraction, after LinClassifier (first_fdr={self.first_fdr}): {df_subset.shape}" + ) + self.second_classifier.batch_size = 500 self.second_classifier.epochs = 50 - - self.second_classifier.fit(df_subset[x_cols].to_numpy(), df_subset[y_col].to_numpy()) - df_subset["proba"] = self.second_classifier.predict_proba(df_subset[x_cols].to_numpy())[:, 1] - + + self.second_classifier.fit( + df_subset[x_cols].to_numpy(), df_subset[y_col].to_numpy() + ) + df_subset["proba"] = self.second_classifier.predict_proba( + df_subset[x_cols].to_numpy() + )[:, 1] + else: df_train = df[df["rank"] < self.train_on_top_n] self.second_classifier.fit( df_train[x_cols].to_numpy(), df_train[y_col].to_numpy(), ) - + df_subset = df x_subset = df_subset[x_cols].to_numpy() df_subset["proba"] = self.second_classifier.predict_proba(x_subset)[:, 1] - - df_subset = get_entries_below_fdr(df_subset, self.second_fdr, group_columns) # , remove_decoys=True) - print(f"After q-val extraction, after NN (second_fdr={self.second_fdr}): {df_subset.shape}") - + + df_subset = get_entries_below_fdr( + df_subset, self.second_fdr, group_columns + ) # , remove_decoys=True) + print( + f"After q-val extraction, after NN (second_fdr={self.second_fdr}): {df_subset.shape}" + ) + df_subset_2 = get_target_decoy_partners(df_subset, df_) df_targets = df_subset_2[df_subset_2["decoy"] == 0] - + self._update_classifier( self.first_classifier, df_subset_2, @@ -297,12 +308,13 @@ def fit_predict( self.first_fdr, group_columns, ) - + return df_targets - + @classmethod - def _update_classifier(cls, classifier, df_, x_cols, y_col, fdr, group_columns) -> None: - + def _update_classifier( + cls, classifier, df_, x_cols, y_col, fdr, group_columns + ) -> None: X = df_[x_cols] y = df_[y_col] df = df_.copy() @@ -316,54 +328,55 @@ def _update_classifier(cls, 
classifier, df_, x_cols, y_col, fdr, group_columns) classifier.fit(X, y) classifier._fitted = True - + df["proba"] = classifier.predict_proba(df[x_cols].to_numpy())[:, 1] df_targets = get_entries_below_fdr(df, fdr, group_columns) - + current_n_precursors = len(df_targets) if previous_n_precursors > current_n_precursors: - print("Reverting linear classifier to the previous version, as the new one performed worse.") - classifier.from_state_dict(saved_state_dict) - + print( + "Reverting linear classifier to the previous version, as the new one performed worse." + ) + classifier.from_state_dict(saved_state_dict) - @property + @property def fitted(self) -> bool: """Return whether both classifiers have been fitted.""" return self.second_classifier.fitted def fit(self, x: np.ndarray, y: np.ndarray) -> None: """Fit both classifiers sequentially. - + Parameters ---------- x : np.ndarray Training data - y : np.ndarray + y : np.ndarray Target values """ # First classifier fit # self.first_classifier.fit(x, y) - + # # Get predictions from first classifier # probs = self.first_classifier.predict_proba(x)[:, 1] - + # # Filter data based on first FDR threshold # mask = probs >= (1 - self.first_fdr) # x_filtered = x[mask] # y_filtered = y[mask] - + # # Fit second classifier on filtered data # self.second_classifier.fit(x_filtered, y_filtered) pass def predict(self, x: np.ndarray) -> np.ndarray: """Predict class labels using both classifiers. - + Parameters ---------- x : np.ndarray Input data - + Returns ------- np.ndarray @@ -373,12 +386,12 @@ def predict(self, x: np.ndarray) -> np.ndarray: def predict_proba(self, x: np.ndarray) -> np.ndarray: """Predict class probabilities using both classifiers. - + Parameters ---------- x : np.ndarray Input data - + Returns ------- np.ndarray @@ -386,10 +399,9 @@ def predict_proba(self, x: np.ndarray) -> np.ndarray: """ pass - def to_state_dict(self) -> dict: """Save classifier state. - + Returns ------- dict @@ -400,12 +412,12 @@ def to_state_dict(self) -> dict: "second_classifier": self.second_classifier.to_state_dict(), "first_fdr": self.first_fdr, "second_fdr": self.second_fdr, - "train_on_top_n": self.train_on_top_n + "train_on_top_n": self.train_on_top_n, } def from_state_dict(self, state_dict: dict) -> None: """Load classifier state. 
- + Parameters ---------- state_dict : dict @@ -414,12 +426,13 @@ def from_state_dict(self, state_dict: dict) -> None: self.first_classifier.from_state_dict(state_dict["first_classifier"]) self.second_classifier.from_state_dict(state_dict["second_classifier"]) self.first_fdr = state_dict["first_fdr"] - self.second_fdr = state_dict["second_fdr"] + self.second_fdr = state_dict["second_fdr"] self.train_on_top_n = state_dict["train_on_top_n"] - + + def get_entries_below_fdr(df, fdr, group_columns, remove_decoys: bool = True): df.sort_values("proba", ascending=True, inplace=True) - df = keep_best(df, group_columns=group_columns) + df = keep_best(df, group_columns=group_columns) df = get_q_values(df, "proba", "decoy") df_subset = df[df["qval"] < fdr] @@ -427,12 +440,12 @@ def get_entries_below_fdr(df, fdr, group_columns, remove_decoys: bool = True): df_subset = df_subset[df_subset["decoy"] == 0] return df_subset - + def get_target_decoy_partners(df_sub, df_all): group_by = ["rank", "elution_group_idx"] valid_tuples = df_sub[group_by] matching_rows = df_all.merge(valid_tuples, on=group_by, how="inner") - + return matching_rows @@ -471,6 +484,7 @@ def keep_best( temp_df = temp_df.sort_index().reset_index(drop=True) return temp_df + class LogisticRegressionClassifier(Classifier): def __init__(self) -> None: """Linear classifier using a logistic regression model.""" @@ -481,7 +495,7 @@ def __init__(self) -> None: @property def fitted(self) -> bool: return self._fitted - + def fit(self, x: np.ndarray, y: np.ndarray) -> None: """Fit the classifier to the data. @@ -517,7 +531,7 @@ def predict(self, x: np.ndarray) -> np.ndarray: """ x_scaled = self.scaler.transform(x) return self.model.predict(x_scaled) - + def predict_proba(self, x: np.ndarray) -> np.ndarray: """Predict the class probabilities of the data. @@ -536,7 +550,7 @@ def predict_proba(self, x: np.ndarray) -> np.ndarray: """ x_scaled = self.scaler.transform(x) return self.model.predict_proba(x_scaled) - + def to_state_dict(self) -> dict: """Save the state of the classifier as a dictionary. 
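Aside: to_state_dict/from_state_dict round-trip the fitted sklearn estimators by copying their learned attributes rather than pickling the objects. A standalone sketch of that mechanism (toy numbers, not part of the patch):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    fitted = StandardScaler().fit(np.array([[1.0], [3.0]]))  # mean_=2.0, scale_=1.0
    state = {"mean": fitted.mean_, "var": fitted.var_,
             "scale": fitted.scale_, "n_seen": fitted.n_samples_seen_}

    restored = StandardScaler()  # never fit directly
    restored.mean_ = np.array(state["mean"])
    restored.var_ = np.array(state["var"])
    restored.scale_ = np.array(state["scale"])
    restored.n_samples_seen_ = np.array(state["n_seen"])
    print(restored.transform(np.array([[2.0]])))  # [[0.]]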
@@ -550,16 +564,18 @@ def to_state_dict(self) -> dict: state_dict = {"_fitted": self._fitted} if self._fitted: - state_dict.update({ - 'scaler_mean': self.scaler.mean_, - 'scaler_var': self.scaler.var_, - 'scaler_scale': self.scaler.scale_, - 'scaler_n_samples_seen': self.scaler.n_samples_seen_, - 'model_coef': self.model.coef_, - 'model_intercept': self.model.intercept_, - 'model_classes': self.model.classes_, - 'is_fitted': self._fitted - }) + state_dict.update( + { + "scaler_mean": self.scaler.mean_, + "scaler_var": self.scaler.var_, + "scaler_scale": self.scaler.scale_, + "scaler_n_samples_seen": self.scaler.n_samples_seen_, + "model_coef": self.model.coef_, + "model_intercept": self.model.intercept_, + "model_classes": self.model.classes_, + "is_fitted": self._fitted, + } + ) return state_dict @@ -574,19 +590,18 @@ def from_state_dict(self, state_dict: dict): """ self._fitted = state_dict["_fitted"] - + if self.fitted: self.scaler = StandardScaler() - self.scaler.mean_ = np.array(state_dict['scaler_mean']) - self.scaler.var_ = np.array(state_dict['scaler_var']) - self.scaler.scale_ = np.array(state_dict['scaler_scale']) - self.scaler.n_samples_seen_ = np.array(state_dict['scaler_n_samples_seen']) - - self.model = LogisticRegression() - self.model.coef_ = np.array(state_dict['model_coef']) - self.model.intercept_ = np.array(state_dict['model_intercept']) - self.model.classes_ = np.array(state_dict['model_classes']) + self.scaler.mean_ = np.array(state_dict["scaler_mean"]) + self.scaler.var_ = np.array(state_dict["scaler_var"]) + self.scaler.scale_ = np.array(state_dict["scaler_scale"]) + self.scaler.n_samples_seen_ = np.array(state_dict["scaler_n_samples_seen"]) + self.model = LogisticRegression() + self.model.coef_ = np.array(state_dict["model_coef"]) + self.model.intercept_ = np.array(state_dict["model_intercept"]) + self.model.classes_ = np.array(state_dict["model_classes"]) class BinaryClassifier(Classifier): @@ -1712,7 +1727,6 @@ def predict_proba(self, x: np.ndarray): return self.network(torch.Tensor(x)).detach().numpy() - class FeedForwardNN(nn.Module): def __init__( self, diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 04c12bbf..8d89ac74 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -514,6 +514,18 @@ def fit_predict(self, update_dict): return self.predict() +def get_group_columns(competetive, group_channels): + if competetive: + group_columns = ( + ["elution_group_idx", "channel"] + if group_channels + else ["elution_group_idx"] + ) + else: + group_columns = ["precursor_idx"] + return group_columns + + class FDRManager(BaseManager): def __init__( self, @@ -521,6 +533,7 @@ def __init__( classifier_base, path: None | str = None, load_from_file: bool = True, + enable_two_step_classifier: bool = False, **kwargs, ): """Contains, updates and applies classifiers for target-decoy competitio-based false discovery rate (FDR) estimation. 
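For reference, the grouping helper added above resolves to the following keys (assert-style sketch, not part of the patch; `competetive` keeps the codebase's existing spelling):

    assert get_group_columns(competetive=True, group_channels=True) == ["elution_group_idx", "channel"]
    assert get_group_columns(competetive=True, group_channels=False) == ["elution_group_idx"]
    assert get_group_columns(competetive=False, group_channels=True) == ["precursor_idx"]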
@@ -541,6 +554,8 @@ def __init__( self.feature_columns = feature_columns self.classifier_store = defaultdict(list) self.classifier_base = classifier_base + self.enable_two_step_classifier = enable_two_step_classifier + self._current_version = -1 self.load_classifier_store() @@ -609,17 +624,24 @@ def fit_predict( classifier = self.get_classifier(available_columns, version) if decoy_strategy == "precursor": - psm_df = fdr.perform_fdr( - classifier, - available_columns, - features_df[features_df["decoy"] == 0].copy(), - features_df[features_df["decoy"] == 1].copy(), - competetive=competetive, - group_channels=True, - df_fragments=df_fragments, - dia_cycle=dia_cycle, - figure_path=self.figure_path, - ) + if not self.two_step_classifier: + psm_df = fdr.perform_fdr( + classifier, + available_columns, + features_df[features_df["decoy"] == 0].copy(), + features_df[features_df["decoy"] == 1].copy(), + competetive=competetive, + group_channels=True, + df_fragments=df_fragments, + dia_cycle=dia_cycle, + figure_path=self.figure_path, + ) + else: + group_columns = get_group_columns(competetive, group_channels=True) + psm_df = fdr.perform_fdr_new( + classifier, available_columns, features_df, group_columns + ) + elif decoy_strategy == "precursor_channel_wise": channels = features_df["channel"].unique() psm_df_list = [] diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index b03c05c0..d6d00b6a 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -94,9 +94,25 @@ "mean_overlapping_mass_error", ] -classifier_base = fdrx.BinaryClassifierLegacyNewBatching( - test_size=0.001, batch_size=5000, learning_rate=0.001, epochs=10 -) + +def get_classifier_base(enable_two_step_classifier: bool = False): + if enable_two_step_classifier: + first_classifier = fdrx.LogisticRegressionClassifier( + apply_abs_transformations=True + ) + second_classifier = fdrx.BinaryClassifierLegacyNewBatching( + test_size=0.001, batch_size=5000, learning_rate=0.001, epochs=10 + ) + + classifier_base = fdrx.TwoStepClassifier( + first_classifier=first_classifier, + second_classifier=second_classifier, + ) + else: + classifier_base = fdrx.BinaryClassifierLegacyNewBatching( + test_size=0.001, batch_size=5000, learning_rate=0.001, epochs=10 + ) + return classifier_base class PeptideCentricWorkflow(base.WorkflowBase): @@ -133,7 +149,9 @@ def load( def init_fdr_manager(self): self.fdr_manager = manager.FDRManager( feature_columns=feature_columns, - classifier_base=classifier_base, + classifier_base=get_classifier_base( + self.config["fdr"]["enable_two_step_classifier"] + ), ) def init_spectral_library(self): From 1c2065cc2284e4cff27c67e2d123994f9b759ece Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 16 Jan 2025 14:42:02 +0100 Subject: [PATCH 03/24] fix FDRManger two-step-classifier parameter --- alphadia/workflow/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 8d89ac74..3ff37540 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -624,7 +624,7 @@ def fit_predict( classifier = self.get_classifier(available_columns, version) if decoy_strategy == "precursor": - if not self.two_step_classifier: + if not self.enable_two_step_classifier: psm_df = fdr.perform_fdr( classifier, available_columns, From 29168e4076dd8ba20187c60a233b067a4d87254c Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 16 Jan 2025 15:30:43 +0100 Subject: [PATCH 04/24] 
extract fdr utility functions due to circular import --- alphadia/fdr.py | 251 +--------------------------------- alphadia/fdr_utils.py | 260 ++++++++++++++++++++++++++++++++++++ alphadia/fdrexperimental.py | 140 +------------------ 3 files changed, 262 insertions(+), 389 deletions(-) create mode 100644 alphadia/fdr_utils.py diff --git a/alphadia/fdr.py b/alphadia/fdr.py index 1c3ccf6d..85a78de7 100644 --- a/alphadia/fdr.py +++ b/alphadia/fdr.py @@ -1,9 +1,6 @@ # native imports import logging -import os -import matplotlib as mpl -import matplotlib.pyplot as plt import numpy as np # third party imports @@ -15,6 +12,7 @@ # alphadia imports # alpha family imports from alphadia import fragcomp +from alphadia.fdr_utils import get_q_values, keep_best, plot_fdr logger = logging.getLogger() @@ -185,250 +183,3 @@ def perform_fdr_new( group_columns, ) return psm_df - - -def keep_best( - df: pd.DataFrame, - score_column: str = "proba", - group_columns: list[str] | None = None, -): - """Keep the best PSM for each group of PSMs with the same precursor_idx. - This function is used to select the best candidate PSM for each precursor. - if the group_columns is set to ['channel', 'elution_group_idx'] then its used for target decoy competition. - - Parameters - ---------- - - df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str - The name of the column containing the score to use for the selection. - - group_columns : list[str], default=['channel', 'precursor_idx'] - The columns to use for the grouping. - - Returns - ------- - - pd.DataFrame - The dataframe containing the best PSM for each group. - """ - if group_columns is None: - group_columns = ["channel", "precursor_idx"] - temp_df = df.reset_index(drop=True) - temp_df = temp_df.sort_values(score_column, ascending=True) - temp_df = temp_df.groupby(group_columns).head(1) - temp_df = temp_df.sort_index().reset_index(drop=True) - return temp_df - - -def fdr_to_q_values(fdr_values: np.ndarray): - """Converts FDR values to q-values. - Takes a ascending sorted array of FDR values and converts them to q-values. - for every element the lowest FDR where it would be accepted is used as q-value. - - Parameters - ---------- - fdr_values : np.ndarray - The FDR values to convert. - - Returns - ------- - np.ndarray - The q-values. - """ - fdr_values_flipped = np.flip(fdr_values) - q_values_flipped = np.minimum.accumulate(fdr_values_flipped) - q_vals = np.flip(q_values_flipped) - return q_vals - - -def q_values( - scores: np.ndarray, - decoy_labels: np.ndarray, - # score_column : str = 'proba', - # decoy_column : str = '_decoy', - # qval_column : str = 'qval' -): - """Calculates q-values for a dataframe containing PSMs. - - Parameters - ---------- - - _df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str, default='proba' - The name of the column containing the score to use for the selection. - Ascending sorted values are expected. - - decoy_column : str, default='_decoy' - The name of the column containing the decoy information. - Decoys are expected to be 1 and targets 0. - - qval_column : str, default='qval' - The name of the column to store the q-values in. - - Returns - ------- - - pd.DataFrame - The dataframe containing the q-values in column qval. 
- - """ - - decoy_labels = decoy_labels[scores.argsort()] - target_values = 1 - decoy_labels - decoy_cumsum = np.cumsum(decoy_labels) - target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum / target_cumsum - return fdr_to_q_values(fdr_values) - - -def get_q_values( - _df: pd.DataFrame, - score_column: str = "proba", - decoy_column: str = "_decoy", - qval_column: str = "qval", -): - """Calculates q-values for a dataframe containing PSMs. - - Parameters - ---------- - - _df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str, default='proba' - The name of the column containing the score to use for the selection. - Ascending sorted values are expected. - - decoy_column : str, default='_decoy' - The name of the column containing the decoy information. - Decoys are expected to be 1 and targets 0. - - qval_column : str, default='qval' - The name of the column to store the q-values in. - - Returns - ------- - - pd.DataFrame - The dataframe containing the q-values in column qval. - - """ - _df = _df.sort_values([score_column, score_column], ascending=True) - target_values = 1 - _df[decoy_column].values - decoy_cumsum = np.cumsum(_df[decoy_column].values) - target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum / target_cumsum - _df[qval_column] = fdr_to_q_values(fdr_values) - return _df - - -def plot_fdr( - X_train: np.ndarray, - X_test: np.ndarray, - y_train: np.ndarray, - y_test: np.ndarray, - classifier: sklearn.base.BaseEstimator, - qval: np.ndarray, - figure_path: str | None = None, - neptune_run=None, -): - """Plots statistics on the fdr corrected PSMs. - - Parameters - ---------- - - X_train : np.ndarray - The training data. - - X_test : np.ndarray - The test data. - - y_train : np.ndarray - The training labels. - - y_test : np.ndarray - The test labels. - - classifier : sklearn.base.BaseEstimator - The classifier used for the prediction. - - qval : np.ndarray - The q-values of the PSMs. - """ - - y_test_proba = classifier.predict_proba(X_test)[:, 1] - - y_train_proba = classifier.predict_proba(X_train)[:, 1] - - fpr_test, tpr_test, thresholds_test = sklearn.metrics.roc_curve( - y_test, y_test_proba - ) - fpr_train, tpr_train, thresholds_train = sklearn.metrics.roc_curve( - y_train, y_train_proba - ) - - auc_test = sklearn.metrics.auc(fpr_test, tpr_test) - auc_train = sklearn.metrics.auc(fpr_train, tpr_train) - - logger.info(f"Test AUC: {auc_test:.3f}") - logger.info(f"Train AUC: {auc_train:.3f}") - - auc_difference_percent = np.abs((auc_test - auc_train) / auc_train * 100) - logger.info(f"AUC difference: {auc_difference_percent:.2f}%") - if auc_difference_percent > 5: - logger.warning("AUC difference > 5%. 
This may indicate overfitting.") - - fig, ax = plt.subplots(1, 3, figsize=(12, 4)) - ax[0].plot(fpr_test, tpr_test, label=f"Test AUC: {auc_test:.3f}") - ax[0].plot(fpr_train, tpr_train, label=f"Train AUC: {auc_train:.3f}") - ax[0].set_xlabel("false positive rate") - ax[0].set_ylabel("true positive rate") - ax[0].legend() - - ax[1].set_xlim(0, 1) - ax[1].hist( - np.concatenate([y_test_proba[y_test == 0], y_train_proba[y_train == 0]]), - bins=50, - alpha=0.5, - label="target", - ) - ax[1].hist( - np.concatenate([y_test_proba[y_test == 1], y_train_proba[y_train == 1]]), - bins=50, - alpha=0.5, - label="decoy", - ) - ax[1].set_xlabel("decoy score") - ax[1].set_ylabel("precursor count") - ax[1].legend() - - qval_plot = qval[qval < 0.05] - ids = np.arange(0, len(qval_plot), 1) - ax[2].plot(qval_plot, ids) - ax[2].set_xlim(-0.001, 0.05) - ax[2].set_xlabel("q-value") - ax[2].set_ylabel("number of precursors") - - for axs in ax: - # remove top and right spines - axs.spines["top"].set_visible(False) - axs.spines["right"].set_visible(False) - axs.get_yaxis().set_major_formatter( - mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ",")) - ) - - fig.tight_layout() - plt.show() - - if figure_path is not None: - fig.savefig(os.path.join(figure_path, "fdr.pdf")) - - if neptune_run is not None: - neptune_run["eval/fdr"].log(fig) - - plt.close() diff --git a/alphadia/fdr_utils.py b/alphadia/fdr_utils.py new file mode 100644 index 00000000..0c677bd1 --- /dev/null +++ b/alphadia/fdr_utils.py @@ -0,0 +1,260 @@ +# native imports +import logging +import os + +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np + +# third party imports +import pandas as pd +import sklearn + +logger = logging.getLogger() + + +def keep_best( + df: pd.DataFrame, + score_column: str = "proba", + group_columns: list[str] | None = None, +): + """Keep the best PSM for each group of PSMs with the same precursor_idx. + This function is used to select the best candidate PSM for each precursor. + if the group_columns is set to ['channel', 'elution_group_idx'] then its used for target decoy competition. + + Parameters + ---------- + + df : pd.DataFrame + The dataframe containing the PSMs. + + score_column : str + The name of the column containing the score to use for the selection. + + group_columns : list[str], default=['channel', 'precursor_idx'] + The columns to use for the grouping. + + Returns + ------- + + pd.DataFrame + The dataframe containing the best PSM for each group. + """ + if group_columns is None: + group_columns = ["channel", "precursor_idx"] + temp_df = df.reset_index(drop=True) + temp_df = temp_df.sort_values(score_column, ascending=True) + temp_df = temp_df.groupby(group_columns).head(1) + temp_df = temp_df.sort_index().reset_index(drop=True) + return temp_df + + +def fdr_to_q_values(fdr_values: np.ndarray): + """Converts FDR values to q-values. + Takes a ascending sorted array of FDR values and converts them to q-values. + for every element the lowest FDR where it would be accepted is used as q-value. + + Parameters + ---------- + fdr_values : np.ndarray + The FDR values to convert. + + Returns + ------- + np.ndarray + The q-values. 
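+
+    Examples
+    --------
+    A non-monotone FDR series becomes monotone non-decreasing:
+
+    >>> fdr_to_q_values(np.array([0.0, 0.1, 0.05, 0.2])).tolist()
+    [0.0, 0.05, 0.05, 0.2]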
+ """ + fdr_values_flipped = np.flip(fdr_values) + q_values_flipped = np.minimum.accumulate(fdr_values_flipped) + q_vals = np.flip(q_values_flipped) + return q_vals + + +def q_values( + scores: np.ndarray, + decoy_labels: np.ndarray, + # score_column : str = 'proba', + # decoy_column : str = '_decoy', + # qval_column : str = 'qval' +): + """Calculates q-values for a dataframe containing PSMs. + + Parameters + ---------- + + _df : pd.DataFrame + The dataframe containing the PSMs. + + score_column : str, default='proba' + The name of the column containing the score to use for the selection. + Ascending sorted values are expected. + + decoy_column : str, default='_decoy' + The name of the column containing the decoy information. + Decoys are expected to be 1 and targets 0. + + qval_column : str, default='qval' + The name of the column to store the q-values in. + + Returns + ------- + + pd.DataFrame + The dataframe containing the q-values in column qval. + + """ + + decoy_labels = decoy_labels[scores.argsort()] + target_values = 1 - decoy_labels + decoy_cumsum = np.cumsum(decoy_labels) + target_cumsum = np.cumsum(target_values) + fdr_values = decoy_cumsum / target_cumsum + return fdr_to_q_values(fdr_values) + + +def get_q_values( + _df: pd.DataFrame, + score_column: str = "proba", + decoy_column: str = "_decoy", + qval_column: str = "qval", +): + """Calculates q-values for a dataframe containing PSMs. + + Parameters + ---------- + + _df : pd.DataFrame + The dataframe containing the PSMs. + + score_column : str, default='proba' + The name of the column containing the score to use for the selection. + Ascending sorted values are expected. + + decoy_column : str, default='_decoy' + The name of the column containing the decoy information. + Decoys are expected to be 1 and targets 0. + + qval_column : str, default='qval' + The name of the column to store the q-values in. + + Returns + ------- + + pd.DataFrame + The dataframe containing the q-values in column qval. + + """ + _df = _df.sort_values([score_column, score_column], ascending=True) + target_values = 1 - _df[decoy_column].values + decoy_cumsum = np.cumsum(_df[decoy_column].values) + target_cumsum = np.cumsum(target_values) + fdr_values = decoy_cumsum / target_cumsum + _df[qval_column] = fdr_to_q_values(fdr_values) + return _df + + +def plot_fdr( + X_train: np.ndarray, + X_test: np.ndarray, + y_train: np.ndarray, + y_test: np.ndarray, + classifier: sklearn.base.BaseEstimator, + qval: np.ndarray, + figure_path: str | None = None, + neptune_run=None, +): + """Plots statistics on the fdr corrected PSMs. + + Parameters + ---------- + + X_train : np.ndarray + The training data. + + X_test : np.ndarray + The test data. + + y_train : np.ndarray + The training labels. + + y_test : np.ndarray + The test labels. + + classifier : sklearn.base.BaseEstimator + The classifier used for the prediction. + + qval : np.ndarray + The q-values of the PSMs. 
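+
+    figure_path : str, default=None
+        Optional output directory. If given, the figure is additionally
+        saved there as fdr.pdf.
+
+    neptune_run : optional
+        Optional Neptune run. If given, the figure is logged to the run
+        under eval/fdr.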
+ """ + + y_test_proba = classifier.predict_proba(X_test)[:, 1] + + y_train_proba = classifier.predict_proba(X_train)[:, 1] + + fpr_test, tpr_test, thresholds_test = sklearn.metrics.roc_curve( + y_test, y_test_proba + ) + fpr_train, tpr_train, thresholds_train = sklearn.metrics.roc_curve( + y_train, y_train_proba + ) + + auc_test = sklearn.metrics.auc(fpr_test, tpr_test) + auc_train = sklearn.metrics.auc(fpr_train, tpr_train) + + logger.info(f"Test AUC: {auc_test:.3f}") + logger.info(f"Train AUC: {auc_train:.3f}") + + auc_difference_percent = np.abs((auc_test - auc_train) / auc_train * 100) + logger.info(f"AUC difference: {auc_difference_percent:.2f}%") + if auc_difference_percent > 5: + logger.warning("AUC difference > 5%. This may indicate overfitting.") + + fig, ax = plt.subplots(1, 3, figsize=(12, 4)) + ax[0].plot(fpr_test, tpr_test, label=f"Test AUC: {auc_test:.3f}") + ax[0].plot(fpr_train, tpr_train, label=f"Train AUC: {auc_train:.3f}") + ax[0].set_xlabel("false positive rate") + ax[0].set_ylabel("true positive rate") + ax[0].legend() + + ax[1].set_xlim(0, 1) + ax[1].hist( + np.concatenate([y_test_proba[y_test == 0], y_train_proba[y_train == 0]]), + bins=50, + alpha=0.5, + label="target", + ) + ax[1].hist( + np.concatenate([y_test_proba[y_test == 1], y_train_proba[y_train == 1]]), + bins=50, + alpha=0.5, + label="decoy", + ) + ax[1].set_xlabel("decoy score") + ax[1].set_ylabel("precursor count") + ax[1].legend() + + qval_plot = qval[qval < 0.05] + ids = np.arange(0, len(qval_plot), 1) + ax[2].plot(qval_plot, ids) + ax[2].set_xlim(-0.001, 0.05) + ax[2].set_xlabel("q-value") + ax[2].set_ylabel("number of precursors") + + for axs in ax: + # remove top and right spines + axs.spines["top"].set_visible(False) + axs.spines["right"].set_visible(False) + axs.get_yaxis().set_major_formatter( + mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ",")) + ) + + fig.tight_layout() + plt.show() + + if figure_path is not None: + fig.savefig(os.path.join(figure_path, "fdr.pdf")) + + if neptune_run is not None: + neptune_run["eval/fdr"].log(fig) + + plt.close() diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index 4e7304f8..0a46993a 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -16,109 +16,7 @@ from torchmetrics.classification import BinaryAUROC from tqdm import tqdm - -def fdr_to_q_values(fdr_values: np.ndarray): - """Converts FDR values to q-values. - Takes a ascending sorted array of FDR values and converts them to q-values. - for every element the lowest FDR where it would be accepted is used as q-value. - - Parameters - ---------- - fdr_values : np.ndarray - The FDR values to convert. - - Returns - ------- - np.ndarray - The q-values. - """ - fdr_values_flipped = np.flip(fdr_values) - q_values_flipped = np.minimum.accumulate(fdr_values_flipped) - q_vals = np.flip(q_values_flipped) - return q_vals - - -def q_values( - scores: np.ndarray, - decoy_labels: np.ndarray, - # score_column : str = 'proba', - # decoy_column : str = '_decoy', - # qval_column : str = 'qval' -): - """Calculates q-values for a dataframe containing PSMs. - - Parameters - ---------- - - _df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str, default='proba' - The name of the column containing the score to use for the selection. - Ascending sorted values are expected. - - decoy_column : str, default='_decoy' - The name of the column containing the decoy information. - Decoys are expected to be 1 and targets 0. 
- - qval_column : str, default='qval' - The name of the column to store the q-values in. - - Returns - ------- - - pd.DataFrame - The dataframe containing the q-values in column qval. - - """ - - decoy_labels = decoy_labels[scores.argsort()] - target_values = 1 - decoy_labels - decoy_cumsum = np.cumsum(decoy_labels) - target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum / target_cumsum - return fdr_to_q_values(fdr_values) - - -def get_q_values( - _df: pd.DataFrame, - score_column: str = "proba", - decoy_column: str = "_decoy", - qval_column: str = "qval", -): - """Calculates q-values for a dataframe containing PSMs. - - Parameters - ---------- - - _df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str, default='proba' - The name of the column containing the score to use for the selection. - Ascending sorted values are expected. - - decoy_column : str, default='_decoy' - The name of the column containing the decoy information. - Decoys are expected to be 1 and targets 0. - - qval_column : str, default='qval' - The name of the column to store the q-values in. - - Returns - ------- - - pd.DataFrame - The dataframe containing the q-values in column qval. - - """ - _df = _df.sort_values([score_column, score_column], ascending=True) - target_values = 1 - _df[decoy_column].values - decoy_cumsum = np.cumsum(_df[decoy_column].values) - target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum / target_cumsum - _df[qval_column] = fdr_to_q_values(fdr_values) - return _df +from alphadia.fdr_utils import get_q_values, keep_best def apply_absolute_transformations(df: pd.DataFrame) -> pd.DataFrame: @@ -449,42 +347,6 @@ def get_target_decoy_partners(df_sub, df_all): return matching_rows -def keep_best( - df: pd.DataFrame, - score_column: str = "proba", - group_columns: list[str] | None = None, -): - """Keep the best PSM for each group of PSMs with the same precursor_idx. - This function is used to select the best candidate PSM for each precursor. - if the group_columns is set to ['channel', 'elution_group_idx'] then its used for target decoy competition. - - Parameters - ---------- - - df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str - The name of the column containing the score to use for the selection. - - group_columns : list[str], default=['channel', 'precursor_idx'] - The columns to use for the grouping. - - Returns - ------- - - pd.DataFrame - The dataframe containing the best PSM for each group. 
- """ - if group_columns is None: - group_columns = ["channel", "precursor_idx"] - temp_df = df.reset_index(drop=True) - temp_df = temp_df.sort_values(score_column, ascending=True) - temp_df = temp_df.groupby(group_columns).head(1) - temp_df = temp_df.sort_index().reset_index(drop=True) - return temp_df - - class LogisticRegressionClassifier(Classifier): def __init__(self) -> None: """Linear classifier using a logistic regression model.""" From 124260fa88136bce11fe763b51e9d59db90cf0f8 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 16 Jan 2025 15:33:22 +0100 Subject: [PATCH 05/24] fix logreg initialization --- alphadia/workflow/peptidecentric.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index d6d00b6a..d23f61ad 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -97,9 +97,7 @@ def get_classifier_base(enable_two_step_classifier: bool = False): if enable_two_step_classifier: - first_classifier = fdrx.LogisticRegressionClassifier( - apply_abs_transformations=True - ) + first_classifier = fdrx.LogisticRegressionClassifier() second_classifier = fdrx.BinaryClassifierLegacyNewBatching( test_size=0.001, batch_size=5000, learning_rate=0.001, epochs=10 ) From 6480c6a358ee9306c666b6c426c6d079bf5c0995 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 16 Jan 2025 15:58:42 +0100 Subject: [PATCH 06/24] fix fdr_utils test --- tests/unit_tests/test_fdr.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/unit_tests/test_fdr.py b/tests/unit_tests/test_fdr.py index d2aa30db..8a18650a 100644 --- a/tests/unit_tests/test_fdr.py +++ b/tests/unit_tests/test_fdr.py @@ -8,7 +8,7 @@ import pytest import torch -from alphadia import fdr +from alphadia import fdr_utils from alphadia import fdrexperimental as fdrx from alphadia.workflow import manager @@ -68,14 +68,14 @@ def test_keep_best(): } ) - best_df = fdr.keep_best( + best_df = fdr_utils.keep_best( test_df, score_column="proba", group_columns=["precursor_idx"] ) assert best_df.shape[0] == 3 assert np.allclose(best_df["proba"].values, np.array([0.1, 0.4, 0.7])) - best_df = fdr.keep_best( + best_df = fdr_utils.keep_best( test_df, score_column="proba", group_columns=["channel", "precursor_idx"] ) @@ -94,7 +94,7 @@ def test_keep_best_2(): } ) - result_df = fdr.keep_best(test_df, group_columns=["channel", "elution_group_idx"]) + result_df = fdr_utils.keep_best(test_df, group_columns=["channel", "elution_group_idx"]) pd.testing.assert_frame_equal(result_df, test_df) test_df = pd.DataFrame( @@ -104,7 +104,7 @@ def test_keep_best_2(): "proba": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.1, 0.2, 0.3], } ) - result_df = fdr.keep_best(test_df, group_columns=["channel", "elution_group_idx"]) + result_df = fdr_utils.keep_best(test_df, group_columns=["channel", "elution_group_idx"]) result_expected = pd.DataFrame( { "channel": [0, 0, 4, 4, 8, 8], @@ -121,7 +121,7 @@ def test_keep_best_2(): "proba": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.1, 0.2, 0.3], } ) - result_df = fdr.keep_best(test_df, group_columns=["channel", "precursor_idx"]) + result_df = fdr_utils.keep_best(test_df, group_columns=["channel", "precursor_idx"]) result_expected = pd.DataFrame( { "channel": [0, 0, 4, 4, 8, 8], @@ -135,7 +135,7 @@ def test_keep_best_2(): def test_fdr_to_q_values(): test_fdr = np.array([0.2, 0.1, 0.05, 0.3, 0.26, 0.25, 0.5]) - test_q_values = fdr.fdr_to_q_values(test_fdr) + test_q_values = 
fdr_utils.fdr_to_q_values(test_fdr) assert np.allclose( test_q_values, np.array([0.05, 0.05, 0.05, 0.25, 0.25, 0.25, 0.5]) @@ -151,7 +151,7 @@ def test_get_q_values(): } ) - test_df = fdr.get_q_values(test_df, "proba", "_decoy") + test_df = fdr_utils.get_q_values(test_df, "proba", "_decoy") assert np.allclose( test_df["qval"].values, From 5850360e20b35fa3ba513292e3d5466e3e2d1532 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 16 Jan 2025 16:29:09 +0100 Subject: [PATCH 07/24] fix fdr_utils refactoring --- alphadia/outputtransform.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/alphadia/outputtransform.py b/alphadia/outputtransform.py index d98ad9cd..1f2c6ce6 100644 --- a/alphadia/outputtransform.py +++ b/alphadia/outputtransform.py @@ -17,7 +17,7 @@ from sklearn.neural_network import MLPClassifier from sklearn.preprocessing import StandardScaler -from alphadia import fdr, grouping, libtransform, utils +from alphadia import fdr_utils, grouping, libtransform, utils from alphadia.consensus.utils import read_df, write_df from alphadia.exceptions import NoPsmFoundError from alphadia.outputaccumulator import ( @@ -1162,7 +1162,7 @@ def perform_protein_fdr(psm_df): protein_features["proba"] = clf.predict_proba(scaler.transform(X))[:, 1] protein_features = pd.DataFrame(protein_features) - protein_features = fdr.get_q_values( + protein_features = fdr_utils.get_q_values( protein_features, score_column="proba", decoy_column="decoy", @@ -1178,7 +1178,9 @@ def perform_protein_fdr(psm_df): protein_features["pg_qval"] = protein_features["pg_qval"] * n_targets / n_decoys - fdr.plot_fdr(X_train, X_test, y_train, y_test, clf, protein_features["pg_qval"]) + fdr_utils.plot_fdr( + X_train, X_test, y_train, y_test, clf, protein_features["pg_qval"] + ) return pd.concat( [ From e8fcc3fd18d176391bbd53ac888956cc29bce757 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 17 Jan 2025 08:38:24 +0100 Subject: [PATCH 08/24] remove redundant perform_fdr_new function --- alphadia/fdr.py | 16 ------ alphadia/fdrexperimental.py | 103 +++++++++-------------------------- alphadia/workflow/manager.py | 16 ++++-- 3 files changed, 39 insertions(+), 96 deletions(-) diff --git a/alphadia/fdr.py b/alphadia/fdr.py index 85a78de7..76b36f31 100644 --- a/alphadia/fdr.py +++ b/alphadia/fdr.py @@ -167,19 +167,3 @@ def perform_fdr( return psm_df - -def perform_fdr_new( - classifier: fdrx.Classifier, - available_columns: list[str], - df: pd.DataFrame, - group_columns, - **kwargs, -): - df.dropna(subset=available_columns, inplace=True) - psm_df = classifier.fit_predict( - df, - available_columns + ["score"], - "decoy", - group_columns, - ) - return psm_df diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index 0a46993a..61993193 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -16,7 +16,7 @@ from torchmetrics.classification import BinaryAUROC from tqdm import tqdm -from alphadia.fdr_utils import get_q_values, keep_best +from alphadia.fdr import get_q_values, keep_best def apply_absolute_transformations(df: pd.DataFrame) -> pd.DataFrame: @@ -125,7 +125,7 @@ def from_state_dict(self, state_dict: dict): """ -class TwoStepClassifier(Classifier): +class TwoStepClassifier: def __init__( self, first_classifier: Classifier, @@ -147,22 +147,22 @@ def fit_predict( self, df_: pd.DataFrame, x_cols: list[str], - y_col: str, + y_col: str = "decoy", group_columns: list[str] | None = None, ) -> pd.DataFrame: """ Return dataframe including only the found precursors. 
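+
+        A minimal call sketch ("two_step", "features_df" and
+        "feature_columns" are placeholder names; manager.py passes the
+        available feature columns plus "score"):
+
+        >>> targets = two_step.fit_predict(
+        ...     features_df,
+        ...     x_cols=feature_columns + ["score"],
+        ...     group_columns=["elution_group_idx", "channel"],
+        ... )  # doctest: +SKIP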
""" df = df_.copy() + df.dropna(subset=x_cols, inplace=True) df = apply_absolute_transformations(df) - if self.first_classifier.fitted: df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[ :, 1 ] df_subset = get_entries_below_fdr( df, self.first_fdr, group_columns, remove_decoys=False - ) + ) # TODO print( f"After q-val extraction, after LinClassifier (first_fdr={self.first_fdr}): {df_subset.shape}" ) @@ -180,8 +180,8 @@ def fit_predict( else: df_train = df[df["rank"] < self.train_on_top_n] self.second_classifier.fit( - df_train[x_cols].to_numpy(), - df_train[y_col].to_numpy(), + df_train[x_cols].to_numpy().astype(np.float32), + df_train[y_col].to_numpy().astype(np.float32), ) df_subset = df @@ -189,15 +189,16 @@ def fit_predict( df_subset["proba"] = self.second_classifier.predict_proba(x_subset)[:, 1] df_subset = get_entries_below_fdr( - df_subset, self.second_fdr, group_columns - ) # , remove_decoys=True) - print( - f"After q-val extraction, after NN (second_fdr={self.second_fdr}): {df_subset.shape}" + df_subset, self.second_fdr, group_columns, remove_decoys=False ) - - df_subset_2 = get_target_decoy_partners(df_subset, df_) - df_targets = df_subset_2[df_subset_2["decoy"] == 0] - + # print( + # f"After q-val extraction, after NN (second_fdr={self.second_fdr}): {df_subset.shape}" + # ) + df_targets = df_subset[ + df_subset["decoy"] == 0 + ] # TODO df_targets does not have q values now. + + df_subset_2 = get_target_decoy_partners(df_subset, df) self._update_classifier( self.first_classifier, df_subset_2, @@ -211,13 +212,12 @@ def fit_predict( @classmethod def _update_classifier( - cls, classifier, df_, x_cols, y_col, fdr, group_columns + cls, classifier, df, x_cols, y_col, fdr, group_columns ) -> None: - X = df_[x_cols] - y = df_[y_col] - df = df_.copy() + X = df[x_cols].to_numpy() + y = df[y_col].to_numpy() if hasattr(classifier, "fitted") and classifier.fitted: - df["proba"] = classifier.predict_proba(df[x_cols].to_numpy())[:, 1] + df["proba"] = classifier.predict_proba(X)[:, 1] df_targets = get_entries_below_fdr(df, fdr, group_columns) previous_n_precursors = len(df_targets) saved_state_dict = classifier.to_state_dict() @@ -227,7 +227,7 @@ def _update_classifier( classifier.fit(X, y) classifier._fitted = True - df["proba"] = classifier.predict_proba(df[x_cols].to_numpy())[:, 1] + df["proba"] = classifier.predict_proba(X)[:, 1] df_targets = get_entries_below_fdr(df, fdr, group_columns) current_n_precursors = len(df_targets) @@ -242,61 +242,6 @@ def fitted(self) -> bool: """Return whether both classifiers have been fitted.""" return self.second_classifier.fitted - def fit(self, x: np.ndarray, y: np.ndarray) -> None: - """Fit both classifiers sequentially. - - Parameters - ---------- - x : np.ndarray - Training data - y : np.ndarray - Target values - """ - # First classifier fit - # self.first_classifier.fit(x, y) - - # # Get predictions from first classifier - # probs = self.first_classifier.predict_proba(x)[:, 1] - - # # Filter data based on first FDR threshold - # mask = probs >= (1 - self.first_fdr) - # x_filtered = x[mask] - # y_filtered = y[mask] - - # # Fit second classifier on filtered data - # self.second_classifier.fit(x_filtered, y_filtered) - pass - - def predict(self, x: np.ndarray) -> np.ndarray: - """Predict class labels using both classifiers. 
- - Parameters - ---------- - x : np.ndarray - Input data - - Returns - ------- - np.ndarray - Predicted class labels - """ - return np.argmax(self.predict_proba(x), axis=1) - - def predict_proba(self, x: np.ndarray) -> np.ndarray: - """Predict class probabilities using both classifiers. - - Parameters - ---------- - x : np.ndarray - Input data - - Returns - ------- - np.ndarray - Predicted class probabilities - """ - pass - def to_state_dict(self) -> dict: """Save classifier state. @@ -336,6 +281,11 @@ def get_entries_below_fdr(df, fdr, group_columns, remove_decoys: bool = True): df_subset = df[df["qval"] < fdr] if remove_decoys: df_subset = df_subset[df_subset["decoy"] == 0] + + if len(df_subset) == 0: + df = df[df["decoy"] == 0] + df_subset = df.loc[df["qval"].idxmin()] + return df_subset @@ -1486,6 +1436,7 @@ def fit(self, x: np.ndarray, y: np.ndarray): x_train_batch = x_train[batch_start:batch_stop] y_train_batch = y_train[batch_start:batch_stop] y_pred = self.network(x_train_batch) + loss_value = loss(y_pred, y_train_batch) self.network.zero_grad() diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 3ff37540..606d0408 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -1,4 +1,5 @@ # native imports +import contextlib import logging import os import pickle @@ -17,6 +18,7 @@ # alphadia imports import alphadia +import alphadia.fdrexperimental as fdrx from alphadia import fdr from alphadia.calibration.property import Calibration, calibration_model_provider from alphadia.workflow import reporting @@ -554,7 +556,9 @@ def __init__( self.feature_columns = feature_columns self.classifier_store = defaultdict(list) self.classifier_base = classifier_base - self.enable_two_step_classifier = enable_two_step_classifier + self.enable_two_step_classifier = isinstance( + classifier_base, fdrx.TwoStepClassifier + ) self._current_version = -1 self.load_classifier_store() @@ -638,8 +642,11 @@ def fit_predict( ) else: group_columns = get_group_columns(competetive, group_channels=True) - psm_df = fdr.perform_fdr_new( - classifier, available_columns, features_df, group_columns + + psm_df = classifier.fit_predict( + features_df, + available_columns + ["score"], + group_columns=group_columns, ) elif decoy_strategy == "precursor_channel_wise": @@ -743,7 +750,8 @@ def load_classifier_store(self, path: None | str = None): if classifier_hash not in self.classifier_store: classifier = deepcopy(self.classifier_base) - classifier.from_state_dict(torch.load(os.path.join(path, file))) + with contextlib.suppress(Exception): + classifier.from_state_dict(torch.load(os.path.join(path, file))) self.classifier_store[classifier_hash].append(classifier) def get_classifier(self, available_columns: list, version: int = -1): From 052c1096ece2bf19ccc394dc7714287fbe55b704 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 17 Jan 2025 08:39:43 +0100 Subject: [PATCH 09/24] revert fdr_utils changes --- alphadia/fdr.py | 252 ++++++++++++++++++++++++++++++++- alphadia/fdr_utils.py | 260 ----------------------------------- alphadia/outputtransform.py | 8 +- tests/unit_tests/test_fdr.py | 16 +-- 4 files changed, 260 insertions(+), 276 deletions(-) delete mode 100644 alphadia/fdr_utils.py diff --git a/alphadia/fdr.py b/alphadia/fdr.py index 76b36f31..5c9911cb 100644 --- a/alphadia/fdr.py +++ b/alphadia/fdr.py @@ -1,18 +1,18 @@ # native imports import logging +import os +import matplotlib as mpl +import matplotlib.pyplot as plt import numpy as np # third party imports import 
pandas as pd
 import sklearn
 
-import alphadia.fdrexperimental as fdrx
-
 # alphadia imports
 # alpha family imports
 from alphadia import fragcomp
-from alphadia.fdr_utils import get_q_values, keep_best, plot_fdr
 
 logger = logging.getLogger()
 
@@ -167,3 +167,249 @@ def perform_fdr(
 
     return psm_df
+
+
+def keep_best(
+    df: pd.DataFrame,
+    score_column: str = "proba",
+    group_columns: list[str] | None = None,
+):
+    """Keep the best PSM for each group of PSMs with the same precursor_idx.
+    This function is used to select the best candidate PSM for each precursor.
+    If group_columns is set to ['channel', 'elution_group_idx'], it is used for target-decoy competition.
+
+    Parameters
+    ----------
+
+    df : pd.DataFrame
+        The dataframe containing the PSMs.
+
+    score_column : str
+        The name of the column containing the score to use for the selection.
+
+    group_columns : list[str], default=['channel', 'precursor_idx']
+        The columns to use for the grouping.
+
+    Returns
+    -------
+
+    pd.DataFrame
+        The dataframe containing the best PSM for each group.
+    """
+    if group_columns is None:
+        group_columns = ["channel", "precursor_idx"]
+    temp_df = df.reset_index(drop=True)
+    temp_df = temp_df.sort_values(score_column, ascending=True)
+    temp_df = temp_df.groupby(group_columns).head(1)
+    temp_df = temp_df.sort_index().reset_index(drop=True)
+    return temp_df
+
+
+def fdr_to_q_values(fdr_values: np.ndarray):
+    """Converts FDR values to q-values.
+    Takes an ascending sorted array of FDR values and converts them to q-values.
+    For every element, the lowest FDR at which it would be accepted is used as its q-value.
+
+    Parameters
+    ----------
+    fdr_values : np.ndarray
+        The FDR values to convert.
+
+    Returns
+    -------
+    np.ndarray
+        The q-values.
+    """
+    fdr_values_flipped = np.flip(fdr_values)
+    q_values_flipped = np.minimum.accumulate(fdr_values_flipped)
+    q_vals = np.flip(q_values_flipped)
+    return q_vals
+
+
+def q_values(
+    scores: np.ndarray,
+    decoy_labels: np.ndarray,
+):
+    """Calculates q-values from PSM scores and decoy labels.
+
+    Parameters
+    ----------
+
+    scores : np.ndarray
+        The scores of the PSMs. Lower scores are considered better.
+
+    decoy_labels : np.ndarray
+        The decoy labels of the PSMs.
+        Decoys are expected to be 1 and targets 0.
+
+    Returns
+    -------
+
+    np.ndarray
+        The q-values, in the order of the ascending sorted scores.
+
+    """
+
+    decoy_labels = decoy_labels[scores.argsort()]
+    target_values = 1 - decoy_labels
+    decoy_cumsum = np.cumsum(decoy_labels)
+    target_cumsum = np.cumsum(target_values)
+    fdr_values = decoy_cumsum / target_cumsum
+    return fdr_to_q_values(fdr_values)
+
+
+def get_q_values(
+    _df: pd.DataFrame,
+    score_column: str = "proba",
+    decoy_column: str = "_decoy",
+    qval_column: str = "qval",
+):
+    """Calculates q-values for a dataframe containing PSMs.
+
+    Parameters
+    ----------
+
+    _df : pd.DataFrame
+        The dataframe containing the PSMs.
+
+    score_column : str, default='proba'
+        The name of the column containing the score to use for the selection.
+        Ascending sorted values are expected.
+
+    decoy_column : str, default='_decoy'
+        The name of the column containing the decoy information.
+        Decoys are expected to be 1 and targets 0.
+
+    qval_column : str, default='qval'
+        The name of the column to store the q-values in.
+
+    Returns
+    -------
+
+    pd.DataFrame
+        The dataframe containing the q-values in column qval.
+
+    """
+    _df = _df.sort_values(score_column, ascending=True)
+    target_values = 1 - _df[decoy_column].values
+    decoy_cumsum = np.cumsum(_df[decoy_column].values)
+    target_cumsum = np.cumsum(target_values)
+    fdr_values = decoy_cumsum / target_cumsum
+    _df[qval_column] = fdr_to_q_values(fdr_values)
+    return _df
+
+
+def plot_fdr(
+    X_train: np.ndarray,
+    X_test: np.ndarray,
+    y_train: np.ndarray,
+    y_test: np.ndarray,
+    classifier: sklearn.base.BaseEstimator,
+    qval: np.ndarray,
+    figure_path: str | None = None,
+    neptune_run=None,
+):
+    """Plots statistics on the FDR-corrected PSMs.
+
+    Parameters
+    ----------
+
+    X_train : np.ndarray
+        The training data.
+
+    X_test : np.ndarray
+        The test data.
+
+    y_train : np.ndarray
+        The training labels.
+
+    y_test : np.ndarray
+        The test labels.
+
+    classifier : sklearn.base.BaseEstimator
+        The classifier used for the prediction.
+
+    qval : np.ndarray
+        The q-values of the PSMs.
+    """
+
+    y_test_proba = classifier.predict_proba(X_test)[:, 1]
+
+    y_train_proba = classifier.predict_proba(X_train)[:, 1]
+
+    fpr_test, tpr_test, thresholds_test = sklearn.metrics.roc_curve(
+        y_test, y_test_proba
+    )
+    fpr_train, tpr_train, thresholds_train = sklearn.metrics.roc_curve(
+        y_train, y_train_proba
+    )
+
+    auc_test = sklearn.metrics.auc(fpr_test, tpr_test)
+    auc_train = sklearn.metrics.auc(fpr_train, tpr_train)
+
+    logger.info(f"Test AUC: {auc_test:.3f}")
+    logger.info(f"Train AUC: {auc_train:.3f}")
+
+    auc_difference_percent = np.abs((auc_test - auc_train) / auc_train * 100)
+    logger.info(f"AUC difference: {auc_difference_percent:.2f}%")
+    if auc_difference_percent > 5:
+        logger.warning("AUC difference > 5%.
This may indicate overfitting.") + + fig, ax = plt.subplots(1, 3, figsize=(12, 4)) + ax[0].plot(fpr_test, tpr_test, label=f"Test AUC: {auc_test:.3f}") + ax[0].plot(fpr_train, tpr_train, label=f"Train AUC: {auc_train:.3f}") + ax[0].set_xlabel("false positive rate") + ax[0].set_ylabel("true positive rate") + ax[0].legend() + + ax[1].set_xlim(0, 1) + ax[1].hist( + np.concatenate([y_test_proba[y_test == 0], y_train_proba[y_train == 0]]), + bins=50, + alpha=0.5, + label="target", + ) + ax[1].hist( + np.concatenate([y_test_proba[y_test == 1], y_train_proba[y_train == 1]]), + bins=50, + alpha=0.5, + label="decoy", + ) + ax[1].set_xlabel("decoy score") + ax[1].set_ylabel("precursor count") + ax[1].legend() + + qval_plot = qval[qval < 0.05] + ids = np.arange(0, len(qval_plot), 1) + ax[2].plot(qval_plot, ids) + ax[2].set_xlim(-0.001, 0.05) + ax[2].set_xlabel("q-value") + ax[2].set_ylabel("number of precursors") + + for axs in ax: + # remove top and right spines + axs.spines["top"].set_visible(False) + axs.spines["right"].set_visible(False) + axs.get_yaxis().set_major_formatter( + mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ",")) + ) + + fig.tight_layout() + plt.show() + + if figure_path is not None: + fig.savefig(os.path.join(figure_path, "fdr.pdf")) + + if neptune_run is not None: + neptune_run["eval/fdr"].log(fig) + + plt.close() diff --git a/alphadia/fdr_utils.py b/alphadia/fdr_utils.py deleted file mode 100644 index 0c677bd1..00000000 --- a/alphadia/fdr_utils.py +++ /dev/null @@ -1,260 +0,0 @@ -# native imports -import logging -import os - -import matplotlib as mpl -import matplotlib.pyplot as plt -import numpy as np - -# third party imports -import pandas as pd -import sklearn - -logger = logging.getLogger() - - -def keep_best( - df: pd.DataFrame, - score_column: str = "proba", - group_columns: list[str] | None = None, -): - """Keep the best PSM for each group of PSMs with the same precursor_idx. - This function is used to select the best candidate PSM for each precursor. - if the group_columns is set to ['channel', 'elution_group_idx'] then its used for target decoy competition. - - Parameters - ---------- - - df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str - The name of the column containing the score to use for the selection. - - group_columns : list[str], default=['channel', 'precursor_idx'] - The columns to use for the grouping. - - Returns - ------- - - pd.DataFrame - The dataframe containing the best PSM for each group. - """ - if group_columns is None: - group_columns = ["channel", "precursor_idx"] - temp_df = df.reset_index(drop=True) - temp_df = temp_df.sort_values(score_column, ascending=True) - temp_df = temp_df.groupby(group_columns).head(1) - temp_df = temp_df.sort_index().reset_index(drop=True) - return temp_df - - -def fdr_to_q_values(fdr_values: np.ndarray): - """Converts FDR values to q-values. - Takes a ascending sorted array of FDR values and converts them to q-values. - for every element the lowest FDR where it would be accepted is used as q-value. - - Parameters - ---------- - fdr_values : np.ndarray - The FDR values to convert. - - Returns - ------- - np.ndarray - The q-values. 
- """ - fdr_values_flipped = np.flip(fdr_values) - q_values_flipped = np.minimum.accumulate(fdr_values_flipped) - q_vals = np.flip(q_values_flipped) - return q_vals - - -def q_values( - scores: np.ndarray, - decoy_labels: np.ndarray, - # score_column : str = 'proba', - # decoy_column : str = '_decoy', - # qval_column : str = 'qval' -): - """Calculates q-values for a dataframe containing PSMs. - - Parameters - ---------- - - _df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str, default='proba' - The name of the column containing the score to use for the selection. - Ascending sorted values are expected. - - decoy_column : str, default='_decoy' - The name of the column containing the decoy information. - Decoys are expected to be 1 and targets 0. - - qval_column : str, default='qval' - The name of the column to store the q-values in. - - Returns - ------- - - pd.DataFrame - The dataframe containing the q-values in column qval. - - """ - - decoy_labels = decoy_labels[scores.argsort()] - target_values = 1 - decoy_labels - decoy_cumsum = np.cumsum(decoy_labels) - target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum / target_cumsum - return fdr_to_q_values(fdr_values) - - -def get_q_values( - _df: pd.DataFrame, - score_column: str = "proba", - decoy_column: str = "_decoy", - qval_column: str = "qval", -): - """Calculates q-values for a dataframe containing PSMs. - - Parameters - ---------- - - _df : pd.DataFrame - The dataframe containing the PSMs. - - score_column : str, default='proba' - The name of the column containing the score to use for the selection. - Ascending sorted values are expected. - - decoy_column : str, default='_decoy' - The name of the column containing the decoy information. - Decoys are expected to be 1 and targets 0. - - qval_column : str, default='qval' - The name of the column to store the q-values in. - - Returns - ------- - - pd.DataFrame - The dataframe containing the q-values in column qval. - - """ - _df = _df.sort_values([score_column, score_column], ascending=True) - target_values = 1 - _df[decoy_column].values - decoy_cumsum = np.cumsum(_df[decoy_column].values) - target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum / target_cumsum - _df[qval_column] = fdr_to_q_values(fdr_values) - return _df - - -def plot_fdr( - X_train: np.ndarray, - X_test: np.ndarray, - y_train: np.ndarray, - y_test: np.ndarray, - classifier: sklearn.base.BaseEstimator, - qval: np.ndarray, - figure_path: str | None = None, - neptune_run=None, -): - """Plots statistics on the fdr corrected PSMs. - - Parameters - ---------- - - X_train : np.ndarray - The training data. - - X_test : np.ndarray - The test data. - - y_train : np.ndarray - The training labels. - - y_test : np.ndarray - The test labels. - - classifier : sklearn.base.BaseEstimator - The classifier used for the prediction. - - qval : np.ndarray - The q-values of the PSMs. 
- """ - - y_test_proba = classifier.predict_proba(X_test)[:, 1] - - y_train_proba = classifier.predict_proba(X_train)[:, 1] - - fpr_test, tpr_test, thresholds_test = sklearn.metrics.roc_curve( - y_test, y_test_proba - ) - fpr_train, tpr_train, thresholds_train = sklearn.metrics.roc_curve( - y_train, y_train_proba - ) - - auc_test = sklearn.metrics.auc(fpr_test, tpr_test) - auc_train = sklearn.metrics.auc(fpr_train, tpr_train) - - logger.info(f"Test AUC: {auc_test:.3f}") - logger.info(f"Train AUC: {auc_train:.3f}") - - auc_difference_percent = np.abs((auc_test - auc_train) / auc_train * 100) - logger.info(f"AUC difference: {auc_difference_percent:.2f}%") - if auc_difference_percent > 5: - logger.warning("AUC difference > 5%. This may indicate overfitting.") - - fig, ax = plt.subplots(1, 3, figsize=(12, 4)) - ax[0].plot(fpr_test, tpr_test, label=f"Test AUC: {auc_test:.3f}") - ax[0].plot(fpr_train, tpr_train, label=f"Train AUC: {auc_train:.3f}") - ax[0].set_xlabel("false positive rate") - ax[0].set_ylabel("true positive rate") - ax[0].legend() - - ax[1].set_xlim(0, 1) - ax[1].hist( - np.concatenate([y_test_proba[y_test == 0], y_train_proba[y_train == 0]]), - bins=50, - alpha=0.5, - label="target", - ) - ax[1].hist( - np.concatenate([y_test_proba[y_test == 1], y_train_proba[y_train == 1]]), - bins=50, - alpha=0.5, - label="decoy", - ) - ax[1].set_xlabel("decoy score") - ax[1].set_ylabel("precursor count") - ax[1].legend() - - qval_plot = qval[qval < 0.05] - ids = np.arange(0, len(qval_plot), 1) - ax[2].plot(qval_plot, ids) - ax[2].set_xlim(-0.001, 0.05) - ax[2].set_xlabel("q-value") - ax[2].set_ylabel("number of precursors") - - for axs in ax: - # remove top and right spines - axs.spines["top"].set_visible(False) - axs.spines["right"].set_visible(False) - axs.get_yaxis().set_major_formatter( - mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ",")) - ) - - fig.tight_layout() - plt.show() - - if figure_path is not None: - fig.savefig(os.path.join(figure_path, "fdr.pdf")) - - if neptune_run is not None: - neptune_run["eval/fdr"].log(fig) - - plt.close() diff --git a/alphadia/outputtransform.py b/alphadia/outputtransform.py index 1f2c6ce6..d98ad9cd 100644 --- a/alphadia/outputtransform.py +++ b/alphadia/outputtransform.py @@ -17,7 +17,7 @@ from sklearn.neural_network import MLPClassifier from sklearn.preprocessing import StandardScaler -from alphadia import fdr_utils, grouping, libtransform, utils +from alphadia import fdr, grouping, libtransform, utils from alphadia.consensus.utils import read_df, write_df from alphadia.exceptions import NoPsmFoundError from alphadia.outputaccumulator import ( @@ -1162,7 +1162,7 @@ def perform_protein_fdr(psm_df): protein_features["proba"] = clf.predict_proba(scaler.transform(X))[:, 1] protein_features = pd.DataFrame(protein_features) - protein_features = fdr_utils.get_q_values( + protein_features = fdr.get_q_values( protein_features, score_column="proba", decoy_column="decoy", @@ -1178,9 +1178,7 @@ def perform_protein_fdr(psm_df): protein_features["pg_qval"] = protein_features["pg_qval"] * n_targets / n_decoys - fdr_utils.plot_fdr( - X_train, X_test, y_train, y_test, clf, protein_features["pg_qval"] - ) + fdr.plot_fdr(X_train, X_test, y_train, y_test, clf, protein_features["pg_qval"]) return pd.concat( [ diff --git a/tests/unit_tests/test_fdr.py b/tests/unit_tests/test_fdr.py index 8a18650a..d2aa30db 100644 --- a/tests/unit_tests/test_fdr.py +++ b/tests/unit_tests/test_fdr.py @@ -8,7 +8,7 @@ import pytest import torch -from alphadia import fdr_utils +from 
alphadia import fdr from alphadia import fdrexperimental as fdrx from alphadia.workflow import manager @@ -68,14 +68,14 @@ def test_keep_best(): } ) - best_df = fdr_utils.keep_best( + best_df = fdr.keep_best( test_df, score_column="proba", group_columns=["precursor_idx"] ) assert best_df.shape[0] == 3 assert np.allclose(best_df["proba"].values, np.array([0.1, 0.4, 0.7])) - best_df = fdr_utils.keep_best( + best_df = fdr.keep_best( test_df, score_column="proba", group_columns=["channel", "precursor_idx"] ) @@ -94,7 +94,7 @@ def test_keep_best_2(): } ) - result_df = fdr_utils.keep_best(test_df, group_columns=["channel", "elution_group_idx"]) + result_df = fdr.keep_best(test_df, group_columns=["channel", "elution_group_idx"]) pd.testing.assert_frame_equal(result_df, test_df) test_df = pd.DataFrame( @@ -104,7 +104,7 @@ def test_keep_best_2(): "proba": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.1, 0.2, 0.3], } ) - result_df = fdr_utils.keep_best(test_df, group_columns=["channel", "elution_group_idx"]) + result_df = fdr.keep_best(test_df, group_columns=["channel", "elution_group_idx"]) result_expected = pd.DataFrame( { "channel": [0, 0, 4, 4, 8, 8], @@ -121,7 +121,7 @@ def test_keep_best_2(): "proba": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.1, 0.2, 0.3], } ) - result_df = fdr_utils.keep_best(test_df, group_columns=["channel", "precursor_idx"]) + result_df = fdr.keep_best(test_df, group_columns=["channel", "precursor_idx"]) result_expected = pd.DataFrame( { "channel": [0, 0, 4, 4, 8, 8], @@ -135,7 +135,7 @@ def test_keep_best_2(): def test_fdr_to_q_values(): test_fdr = np.array([0.2, 0.1, 0.05, 0.3, 0.26, 0.25, 0.5]) - test_q_values = fdr_utils.fdr_to_q_values(test_fdr) + test_q_values = fdr.fdr_to_q_values(test_fdr) assert np.allclose( test_q_values, np.array([0.05, 0.05, 0.05, 0.25, 0.25, 0.25, 0.5]) @@ -151,7 +151,7 @@ def test_get_q_values(): } ) - test_df = fdr_utils.get_q_values(test_df, "proba", "_decoy") + test_df = fdr.get_q_values(test_df, "proba", "_decoy") assert np.allclose( test_df["qval"].values, From 9df38a8984dae70643af0504a84840829b8155ea Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 17 Jan 2025 10:13:45 +0100 Subject: [PATCH 10/24] clean up and add docstrings --- alphadia/fdrexperimental.py | 179 +++++++++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 41 deletions(-) diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index 61993193..b45bc298 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -1,4 +1,5 @@ # native imports +import logging import warnings from abc import ABC, abstractmethod from copy import deepcopy @@ -18,18 +19,39 @@ from alphadia.fdr import get_q_values, keep_best +logger = logging.getLogger() -def apply_absolute_transformations(df: pd.DataFrame) -> pd.DataFrame: - df_transformed = df.copy() - df_transformed["delta_rt"] = np.abs(df_transformed["delta_rt"]) - df_transformed["top_3_ms2_mass_error"] = np.abs( - df_transformed["top_3_ms2_mass_error"] - ) - df_transformed["mean_ms2_mass_error"] = np.abs( - df_transformed["mean_ms2_mass_error"] - ) - return df_transformed +def apply_absolute_transformations( + df: pd.DataFrame, columns: list[str] | None = None +) -> pd.DataFrame: + """ + Applies absolute value transformations to predefined columns in a DataFrame inplace. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame containing the data to be transformed. + columns : list of str, optional + List of column names to transform. 
Defaults to ['delta_rt', 'top_3_ms2_mass_error', 'mean_ms2_mass_error']. + + Returns + ------- + pd.DataFrame + The transformed DataFrame. + """ + if columns is None: + columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"] + + for col in columns: + if col in df.columns: + df[col] = np.abs(df[col]) + else: + logger.warning( + f"column '{col}' is not present in df, therefore abs() was not applied." + ) + + return df class Classifier(ABC): @@ -131,15 +153,30 @@ def __init__( first_classifier: Classifier, second_classifier: Classifier, train_on_top_n: int = 1, - first_fdr: float = 0.6, - second_fdr: float = 0.01, - **kwargs, + first_fdr_cutoff: float = 0.6, + second_fdr_cutoff: float = 0.01, ): - super().__init__() + """ + A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage. + + Parameters + ---------- + first_classifier : Classifier + The first classifier used to initially filter the data. + second_classifier : Classifier + The second classifier used to further refine or confirm the classification based on the output from the first classifier. + train_on_top_n : int, default=1 + The number of top candidates that are considered for training of the second classifier. + first_fdr_cutoff : float, default=0.6 + The fdr threshold for the first classifier, determining how selective the first classification step is. + second_fdr_cutoff : float, default=0.01 + The fdr threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results. + + """ self.first_classifier = first_classifier self.second_classifier = second_classifier - self.first_fdr = first_fdr - self.second_fdr = second_fdr + self.first_fdr_cutoff = first_fdr_cutoff + self.second_fdr_cutoff = second_fdr_cutoff self.train_on_top_n = train_on_top_n @@ -151,20 +188,35 @@ def fit_predict( group_columns: list[str] | None = None, ) -> pd.DataFrame: """ - Return dataframe including only the found precursors. + Train the two-step classifier and predict resulting precursors, returning a DataFrame of only the predicted precursors. + + Parameters + ---------- + df_ : pd.DataFrame + The input DataFrame from which predictions are to be made. + x_cols : list[str] + List of column names representing the features to be used for prediction. + y_col : str, optional + The name of the column that denotes the target variable, by default 'decoy'. + group_columns : list[str] | None, optional + List of column names to group by for fdr calculations;. If None, fdr calculations will not be grouped. + + Returns + ------- + pd.DataFrame + A DataFrame containing only the predicted precursors. 
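+
+        Examples
+        --------
+        A minimal, illustrative call; the feature and grouping column names
+        used here are assumptions, and the input frame must also contain the
+        ``rank`` and ``decoy`` columns referenced above::
+
+            df_targets = classifier.fit_predict(
+                psm_df,
+                x_cols=["score", "delta_rt"],
+                group_columns=["elution_group_idx"],
+            )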
+ """ df = df_.copy() df.dropna(subset=x_cols, inplace=True) df = apply_absolute_transformations(df) + if self.first_classifier.fitted: - df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[ - :, 1 - ] + X = df[x_cols].to_numpy() + df["proba"] = self.first_classifier.predict_proba(X)[:, 1] + df_subset = get_entries_below_fdr( - df, self.first_fdr, group_columns, remove_decoys=False - ) # TODO - print( - f"After q-val extraction, after LinClassifier (first_fdr={self.first_fdr}): {df_subset.shape}" + df, self.first_fdr_cutoff, group_columns, remove_decoys=False ) self.second_classifier.batch_size = 500 @@ -189,10 +241,10 @@ def fit_predict( df_subset["proba"] = self.second_classifier.predict_proba(x_subset)[:, 1] df_subset = get_entries_below_fdr( - df_subset, self.second_fdr, group_columns, remove_decoys=False + df_subset, self.second_fdr_cutoff, group_columns, remove_decoys=False ) # print( - # f"After q-val extraction, after NN (second_fdr={self.second_fdr}): {df_subset.shape}" + # f"After q-val extraction, after NN (second_fdr={self.second_fdr_cutoff}): {df_subset.shape}" # ) df_targets = df_subset[ df_subset["decoy"] == 0 @@ -204,7 +256,7 @@ def fit_predict( df_subset_2, x_cols, y_col, - self.first_fdr, + self.first_fdr_cutoff, group_columns, ) @@ -253,8 +305,8 @@ def to_state_dict(self) -> dict: return { "first_classifier": self.first_classifier.to_state_dict(), "second_classifier": self.second_classifier.to_state_dict(), - "first_fdr": self.first_fdr, - "second_fdr": self.second_fdr, + "first_fdr_cutoff": self.first_fdr_cutoff, + "second_fdr_cutoff": self.second_fdr_cutoff, "train_on_top_n": self.train_on_top_n, } @@ -268,12 +320,34 @@ def from_state_dict(self, state_dict: dict) -> None: """ self.first_classifier.from_state_dict(state_dict["first_classifier"]) self.second_classifier.from_state_dict(state_dict["second_classifier"]) - self.first_fdr = state_dict["first_fdr"] - self.second_fdr = state_dict["second_fdr"] + self.first_fdr_cutoff = state_dict["first_fdr_cutoff"] + self.second_fdr_cutoff = state_dict["second_fdr_cutoff"] self.train_on_top_n = state_dict["train_on_top_n"] -def get_entries_below_fdr(df, fdr, group_columns, remove_decoys: bool = True): +def get_entries_below_fdr( + df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True +) -> pd.DataFrame: + """ + Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries. + If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame containing the columns 'proba', 'decoy', and any specified group columns. + fdr : float + The false discovery rate threshold for filtering entries. + group_columns : list + List of columns to group by when determining the best entries per group. + remove_decoys : bool, optional + Specifies whether decoy entries should be removed from the final result. Defaults to True. + + Returns + ------- + pd.DataFrame + A DataFrame containing entries below the specified FDR threshold, optionally excluding decoys. 
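+
+    Examples
+    --------
+    Toy, illustrative values: the three confident targets pass a 1% FDR
+    (their q-values are 0), while the two trailing decoys are removed::
+
+        df = pd.DataFrame(
+            {
+                "proba": [0.01, 0.02, 0.03, 0.90, 0.95],
+                "decoy": [0, 0, 0, 1, 1],
+                "elution_group_idx": [0, 1, 2, 3, 4],
+            }
+        )
+        passing = get_entries_below_fdr(df, 0.01, ["elution_group_idx"])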
+ """ df.sort_values("proba", ascending=True, inplace=True) df = keep_best(df, group_columns=group_columns) df = get_q_values(df, "proba", "decoy") @@ -282,24 +356,47 @@ def get_entries_below_fdr(df, fdr, group_columns, remove_decoys: bool = True): if remove_decoys: df_subset = df_subset[df_subset["decoy"] == 0] + # Handle case where no entries are below the FDR threshold if len(df_subset) == 0: df = df[df["decoy"] == 0] - df_subset = df.loc[df["qval"].idxmin()] + df_subset = df.loc[[df["qval"].idxmin()]] return df_subset -def get_target_decoy_partners(df_sub, df_all): - group_by = ["rank", "elution_group_idx"] - valid_tuples = df_sub[group_by] - matching_rows = df_all.merge(valid_tuples, on=group_by, how="inner") +def get_target_decoy_partners( + reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None +) -> pd.DataFrame: + """ + Identifies and returns the corresponding target and decoy wartner rows in full_df given the subset reference_df/ + This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index. + + Parameters + ---------- + reference_df : pd.DataFrame + A subset DataFrame that contains reference values for matching. + full_df : pd.DataFrame + The main DataFrame from which rows will be matched against reference_df. + group_by : list[str] | None, optional + The columns to group by when performing the match. Defaults to ['rank', 'elution_group_idx'] if None is provided. + + Returns + ------- + pd.DataFrame + A DataFrame containing rows from full_df that match the grouping criteria. + + """ + if group_by is None: + group_by = ["rank", "elution_group_idx"] + valid_tuples = reference_df[group_by] + matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner") return matching_rows class LogisticRegressionClassifier(Classifier): def __init__(self) -> None: - """Linear classifier using a logistic regression model.""" + """Binary classifier using a logistic regression model.""" self.scaler = StandardScaler() self.model = LogisticRegression() self._fitted = False @@ -326,7 +423,7 @@ def fit(self, x: np.ndarray, y: np.ndarray) -> None: self._fitted = True def predict(self, x: np.ndarray) -> np.ndarray: - """Predict the class probabilities of the data. + """Predict the class of the data. Parameters ---------- @@ -364,7 +461,7 @@ def predict_proba(self, x: np.ndarray) -> np.ndarray: return self.model.predict_proba(x_scaled) def to_state_dict(self) -> dict: - """Save the state of the classifier as a dictionary. + """Return the state of the classifier as a dictionary. Returns ------- @@ -391,7 +488,7 @@ def to_state_dict(self) -> dict: return state_dict - def from_state_dict(self, state_dict: dict): + def from_state_dict(self, state_dict: dict) -> None: """Load the state of the classifier from a dictionary. 
Parameters From 9e8c0c813e0265cdb909fd81d278668ee4e4b58a Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 17 Jan 2025 10:45:08 +0100 Subject: [PATCH 11/24] add missing docstring --- alphadia/workflow/manager.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 606d0408..265d080a 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -516,7 +516,23 @@ def fit_predict(self, update_dict): return self.predict() -def get_group_columns(competetive, group_channels): +def get_group_columns(competetive: bool, group_channels: bool) -> list[str]: + """ + Determine the group columns based on competitiveness and channel grouping. + + competitive : bool + If True, group candidates eluting at the same time by grouping them under the same 'elution_group_idx'. + group_channels : bool + If True and 'competitive' is also True, further groups candidates by 'channel'. + + Returns + ------- + list + A list of column names to be used for grouping in the analysis. If competitive, this could be either + ['elution_group_idx', 'channel'] or ['elution_group_idx'] depending on the `group_channels` flag. + If not competitive, the list will always be ['precursor_idx']. + + """ if competetive: group_columns = ( ["elution_group_idx", "channel"] From 1605cfa5d6e4e7ae4365ccb3a74e050a3683b833 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 17 Jan 2025 12:24:14 +0100 Subject: [PATCH 12/24] clean up fdrexperimental.py --- alphadia/fdrexperimental.py | 106 ++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index b45bc298..978d5e47 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -182,7 +182,7 @@ def __init__( def fit_predict( self, - df_: pd.DataFrame, + df: pd.DataFrame, x_cols: list[str], y_col: str = "decoy", group_columns: list[str] | None = None, @@ -192,7 +192,7 @@ def fit_predict( Parameters ---------- - df_ : pd.DataFrame + df : pd.DataFrame The input DataFrame from which predictions are to be made. x_cols : list[str] List of column names representing the features to be used for prediction. @@ -207,14 +207,12 @@ def fit_predict( A DataFrame containing only the predicted precursors. 
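+
+        Notes
+        -----
+        Before the first classifier has been fitted, the second classifier is
+        trained on only the top ``train_on_top_n`` candidates per precursor,
+        i.e.::
+
+            df_train = df[df["rank"] < self.train_on_top_n]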
""" - df = df_.copy() df.dropna(subset=x_cols, inplace=True) df = apply_absolute_transformations(df) if self.first_classifier.fitted: X = df[x_cols].to_numpy() df["proba"] = self.first_classifier.predict_proba(X)[:, 1] - df_subset = get_entries_below_fdr( df, self.first_fdr_cutoff, group_columns, remove_decoys=False ) @@ -222,72 +220,76 @@ def fit_predict( self.second_classifier.batch_size = 500 self.second_classifier.epochs = 50 - self.second_classifier.fit( - df_subset[x_cols].to_numpy(), df_subset[y_col].to_numpy() - ) - df_subset["proba"] = self.second_classifier.predict_proba( - df_subset[x_cols].to_numpy() - )[:, 1] + df_train = df_subset + df_predict = df_subset else: df_train = df[df["rank"] < self.train_on_top_n] - self.second_classifier.fit( - df_train[x_cols].to_numpy().astype(np.float32), - df_train[y_col].to_numpy().astype(np.float32), - ) + df_predict = df - df_subset = df - x_subset = df_subset[x_cols].to_numpy() - df_subset["proba"] = self.second_classifier.predict_proba(x_subset)[:, 1] + self.second_classifier.fit( + df_train[x_cols].to_numpy().astype(np.float32), + df_train[y_col].to_numpy().astype(np.float32), + ) - df_subset = get_entries_below_fdr( - df_subset, self.second_fdr_cutoff, group_columns, remove_decoys=False + df_predict["proba"] = self.second_classifier.predict_proba( + df_predict[x_cols].to_numpy() + )[:, 1] + df_predict = get_entries_below_fdr( + df_predict, self.second_fdr_cutoff, group_columns, remove_decoys=False ) - # print( - # f"After q-val extraction, after NN (second_fdr={self.second_fdr_cutoff}): {df_subset.shape}" - # ) - df_targets = df_subset[ - df_subset["decoy"] == 0 - ] # TODO df_targets does not have q values now. - - df_subset_2 = get_target_decoy_partners(df_subset, df) - self._update_classifier( - self.first_classifier, - df_subset_2, - x_cols, - y_col, - self.first_fdr_cutoff, - group_columns, + df_targets = df_predict[df_predict["decoy"] == 0] + + self.update_first_classifier( + df=get_target_decoy_partners(df_predict, df), + x_cols=x_cols, + y_col=y_col, + group_columns=group_columns, ) return df_targets - @classmethod - def _update_classifier( - cls, classifier, df, x_cols, y_col, fdr, group_columns + def update_first_classifier( + self, + df: pd.DataFrame, + x_cols: list[str], + y_col: str, + group_columns: list[str], ) -> None: + """ + Update the first classifier only if it improves upon the previous version or if it has not been previously fitted. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing the features and target. + x_cols : list[str] + List of column names representing the features. + y_col : str + Name of the column representing the target variable. + group_columns : list[str] + Columns used to group data for FDR calculation. 
+ + """ X = df[x_cols].to_numpy() y = df[y_col].to_numpy() - if hasattr(classifier, "fitted") and classifier.fitted: - df["proba"] = classifier.predict_proba(X)[:, 1] - df_targets = get_entries_below_fdr(df, fdr, group_columns) - previous_n_precursors = len(df_targets) - saved_state_dict = classifier.to_state_dict() - else: - previous_n_precursors = -1 - classifier.fit(X, y) - classifier._fitted = True + previous_n_precursors = -1 - df["proba"] = classifier.predict_proba(X)[:, 1] - df_targets = get_entries_below_fdr(df, fdr, group_columns) + if self.first_classifier.fitted: + df["proba"] = self.first_classifier.predict_proba(X)[:, 1] + df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns) + previous_n_precursors = len(df_targets) + previous_state_dict = self.first_classifier.to_state_dict() + + self.first_classifier.fit(X, y) + df["proba"] = self.first_classifier.predict_proba(X)[:, 1] + df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns) current_n_precursors = len(df_targets) + if previous_n_precursors > current_n_precursors: - print( - "Reverting linear classifier to the previous version, as the new one performed worse." - ) - classifier.from_state_dict(saved_state_dict) + self.first_classifier.from_state_dict(previous_state_dict) @property def fitted(self) -> bool: From 2d2be2ad2b6ac35e2a2fd6959c75a9729c69d65d Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 17 Jan 2025 14:01:34 +0100 Subject: [PATCH 13/24] formatting --- alphadia/workflow/peptidecentric.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 70c8e875..fe1be321 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -97,13 +97,13 @@ def get_classifier_base(enable_two_step_classifier: bool = False): nn_classifier = fdrx.BinaryClassifierLegacyNewBatching( - test_size=0.001, - batch_size=5000, - learning_rate=0.001, + test_size=0.001, + batch_size=5000, + learning_rate=0.001, epochs=10, experimental_hyperparameter_tuning=True, ) - + if enable_two_step_classifier: return fdrx.TwoStepClassifier( first_classifier=fdrx.LogisticRegressionClassifier(), From 7a1f4a74519290315bdbd1ddf66adbf3eb23de2c Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 17 Jan 2025 15:05:03 +0100 Subject: [PATCH 14/24] move models to new fdr_analysis module --- alphadia/fdr_analysis/models/__init__.py | 4 + .../models/logistic_regression.py | 128 ++++++ .../models/two_step_classifier.py | 288 +++++++++++++ alphadia/fdrexperimental.py | 405 ------------------ alphadia/workflow/manager.py | 4 +- alphadia/workflow/peptidecentric.py | 5 +- 6 files changed, 425 insertions(+), 409 deletions(-) create mode 100644 alphadia/fdr_analysis/models/__init__.py create mode 100644 alphadia/fdr_analysis/models/logistic_regression.py create mode 100644 alphadia/fdr_analysis/models/two_step_classifier.py diff --git a/alphadia/fdr_analysis/models/__init__.py b/alphadia/fdr_analysis/models/__init__.py new file mode 100644 index 00000000..1e0053df --- /dev/null +++ b/alphadia/fdr_analysis/models/__init__.py @@ -0,0 +1,4 @@ +from .logistic_regression import LogisticRegressionClassifier +from .two_step_classifier import TwoStepClassifier + +__all__ = ["LogisticRegressionClassifier", "TwoStepClassifier"] diff --git a/alphadia/fdr_analysis/models/logistic_regression.py b/alphadia/fdr_analysis/models/logistic_regression.py new file mode 100644 index 00000000..48076622 --- 
/dev/null +++ b/alphadia/fdr_analysis/models/logistic_regression.py @@ -0,0 +1,128 @@ +import logging + +import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler + +from alphadia.fdrexperimental import Classifier + +logger = logging.getLogger() + + +class LogisticRegressionClassifier(Classifier): + def __init__(self) -> None: + """Binary classifier using a logistic regression model.""" + self.scaler = StandardScaler() + self.model = LogisticRegression() + self._fitted = False + + @property + def fitted(self) -> bool: + return self._fitted + + def fit(self, x: np.ndarray, y: np.ndarray) -> None: + """Fit the classifier to the data. + + Parameters + ---------- + + x : np.array, dtype=float + Training data of shape (n_samples, n_features). + + y : np.array, dtype=int + Target values of shape (n_samples,) or (n_samples, n_classes). + + """ + x_scaled = self.scaler.fit_transform(x) + self.model.fit(x_scaled, y) + self._fitted = True + + def predict(self, x: np.ndarray) -> np.ndarray: + """Predict the class of the data. + + Parameters + ---------- + + x : np.array, dtype=float + Data of shape (n_samples, n_features). + + Returns + ------- + + y : np.array, dtype=float + Predicted class probabilities of shape (n_samples, n_classes). + + """ + x_scaled = self.scaler.transform(x) + return self.model.predict(x_scaled) + + def predict_proba(self, x: np.ndarray) -> np.ndarray: + """Predict the class probabilities of the data. + + Parameters + ---------- + + x : np.array, dtype=float + Data of shape (n_samples, n_features). + + Returns + ------- + + y : np.array, dtype=float + Predicted class probabilities of shape (n_samples, n_classes). + + """ + x_scaled = self.scaler.transform(x) + return self.model.predict_proba(x_scaled) + + def to_state_dict(self) -> dict: + """Return the state of the classifier as a dictionary. + + Returns + ------- + + dict : dict + Dictionary containing the state of the classifier. + + """ + state_dict = {"_fitted": self._fitted} + + if self._fitted: + state_dict.update( + { + "scaler_mean": self.scaler.mean_, + "scaler_var": self.scaler.var_, + "scaler_scale": self.scaler.scale_, + "scaler_n_samples_seen": self.scaler.n_samples_seen_, + "model_coef": self.model.coef_, + "model_intercept": self.model.intercept_, + "model_classes": self.model.classes_, + "is_fitted": self._fitted, + } + ) + + return state_dict + + def from_state_dict(self, state_dict: dict) -> None: + """Load the state of the classifier from a dictionary. + + Parameters + ---------- + + dict : dict + Dictionary containing the state of the classifier. 
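+
+        Examples
+        --------
+        Round-trip sketch for persisting a fitted classifier; ``clf`` stands
+        in for any fitted instance::
+
+            state = clf.to_state_dict()
+            restored = LogisticRegressionClassifier()
+            restored.from_state_dict(state)
+            # restored.predict_proba(x) now matches clf.predict_proba(x)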
+
+        """
+        self._fitted = state_dict["_fitted"]
+
+        if self.fitted:
+            self.scaler = StandardScaler()
+            self.scaler.mean_ = np.array(state_dict["scaler_mean"])
+            self.scaler.var_ = np.array(state_dict["scaler_var"])
+            self.scaler.scale_ = np.array(state_dict["scaler_scale"])
+            self.scaler.n_samples_seen_ = np.array(state_dict["scaler_n_samples_seen"])
+
+            self.model = LogisticRegression()
+            self.model.coef_ = np.array(state_dict["model_coef"])
+            self.model.intercept_ = np.array(state_dict["model_intercept"])
+            self.model.classes_ = np.array(state_dict["model_classes"])
diff --git a/alphadia/fdr_analysis/models/two_step_classifier.py b/alphadia/fdr_analysis/models/two_step_classifier.py
new file mode 100644
index 00000000..62e8141f
--- /dev/null
+++ b/alphadia/fdr_analysis/models/two_step_classifier.py
@@ -0,0 +1,288 @@
+import logging
+
+import numpy as np
+import pandas as pd
+
+from alphadia.fdr import get_q_values, keep_best
+from alphadia.fdrexperimental import Classifier
+
+logger = logging.getLogger()
+
+
+class TwoStepClassifier:
+    def __init__(
+        self,
+        first_classifier: Classifier,
+        second_classifier: Classifier,
+        train_on_top_n: int = 1,
+        first_fdr_cutoff: float = 0.6,
+        second_fdr_cutoff: float = 0.01,
+    ):
+        """
+        A two-step classifier designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage.
+
+        Parameters
+        ----------
+        first_classifier : Classifier
+            The first classifier used to initially filter the data.
+        second_classifier : Classifier
+            The second classifier used to further refine or confirm the classification based on the output from the first classifier.
+        train_on_top_n : int, default=1
+            The number of top-ranked candidates that are considered for training of the second classifier.
+        first_fdr_cutoff : float, default=0.6
+            The FDR threshold for the first classifier, determining how selective the first classification step is.
+        second_fdr_cutoff : float, default=0.01
+            The FDR threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results.
+
+        """
+        self.first_classifier = first_classifier
+        self.second_classifier = second_classifier
+        self.first_fdr_cutoff = first_fdr_cutoff
+        self.second_fdr_cutoff = second_fdr_cutoff
+
+        self.train_on_top_n = train_on_top_n
+
+    def fit_predict(
+        self,
+        df: pd.DataFrame,
+        x_cols: list[str],
+        y_col: str = "decoy",
+        group_columns: list[str] | None = None,
+    ) -> pd.DataFrame:
+        """
+        Train the two-step classifier and predict resulting precursors, returning a DataFrame of only the predicted precursors.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The input DataFrame from which predictions are to be made.
+        x_cols : list[str]
+            List of column names representing the features to be used for prediction.
+        y_col : str, optional
+            The name of the column that denotes the target variable, by default 'decoy'.
+        group_columns : list[str] | None, optional
+            List of column names to group by for FDR calculations. If None, FDR calculations will not be grouped.
+
+        Returns
+        -------
+        pd.DataFrame
+            A DataFrame containing only the predicted precursors.
+ + """ + df.dropna(subset=x_cols, inplace=True) + df = apply_absolute_transformations(df) + + if self.first_classifier.fitted: + X = df[x_cols].to_numpy() + df["proba"] = self.first_classifier.predict_proba(X)[:, 1] + df_subset = get_entries_below_fdr( + df, self.first_fdr_cutoff, group_columns, remove_decoys=False + ) + + self.second_classifier.epochs = 50 + + df_train = df_subset + df_predict = df_subset + + else: + df_train = df[df["rank"] < self.train_on_top_n] + df_predict = df + + self.second_classifier.fit( + df_train[x_cols].to_numpy().astype(np.float32), + df_train[y_col].to_numpy().astype(np.float32), + ) + X = df_predict[x_cols].to_numpy() + df_predict["proba"] = self.second_classifier.predict_proba(X)[:, 1] + df_predict = get_entries_below_fdr( + df_predict, self.second_fdr_cutoff, group_columns, remove_decoys=False + ) + + df_targets = df_predict[df_predict["decoy"] == 0] + + self.update_first_classifier( + df=get_target_decoy_partners(df_predict, df), + x_cols=x_cols, + y_col=y_col, + group_columns=group_columns, + ) + + return df_targets + + def update_first_classifier( + self, + df: pd.DataFrame, + x_cols: list[str], + y_col: str, + group_columns: list[str], + ) -> None: + """ + Update the first classifier only if it improves upon the previous version or if it has not been previously fitted. + + Parameters + ---------- + df : pd.DataFrame + DataFrame containing the features and target. + x_cols : list[str] + List of column names representing the features. + y_col : str + Name of the column representing the target variable. + group_columns : list[str] + Columns used to group data for FDR calculation. + + """ + X = df[x_cols].to_numpy() + y = df[y_col].to_numpy() + + previous_n_precursors = -1 + + if self.first_classifier.fitted: + df["proba"] = self.first_classifier.predict_proba(X)[:, 1] + df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns) + previous_n_precursors = len(df_targets) + previous_state_dict = self.first_classifier.to_state_dict() + + self.first_classifier.fit(X, y) + + df["proba"] = self.first_classifier.predict_proba(X)[:, 1] + df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns) + current_n_precursors = len(df_targets) + + if previous_n_precursors > current_n_precursors: + self.first_classifier.from_state_dict(previous_state_dict) + + @property + def fitted(self) -> bool: + """Return whether both classifiers have been fitted.""" + return self.second_classifier.fitted + + def to_state_dict(self) -> dict: + """Save classifier state. + + Returns + ------- + dict + State dictionary containing both classifiers + """ + return { + "first_classifier": self.first_classifier.to_state_dict(), + "second_classifier": self.second_classifier.to_state_dict(), + "first_fdr_cutoff": self.first_fdr_cutoff, + "second_fdr_cutoff": self.second_fdr_cutoff, + "train_on_top_n": self.train_on_top_n, + } + + def from_state_dict(self, state_dict: dict) -> None: + """Load classifier state. 
+
+        Parameters
+        ----------
+        state_dict : dict
+            State dictionary containing both classifiers
+        """
+        self.first_classifier.from_state_dict(state_dict["first_classifier"])
+        self.second_classifier.from_state_dict(state_dict["second_classifier"])
+        self.first_fdr_cutoff = state_dict["first_fdr_cutoff"]
+        self.second_fdr_cutoff = state_dict["second_fdr_cutoff"]
+        self.train_on_top_n = state_dict["train_on_top_n"]
+
+
+def get_entries_below_fdr(
+    df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True
+) -> pd.DataFrame:
+    """
+    Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries.
+    If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame containing the columns 'proba', 'decoy', and any specified group columns.
+    fdr : float
+        The false discovery rate threshold for filtering entries.
+    group_columns : list
+        List of columns to group by when determining the best entries per group.
+    remove_decoys : bool, optional
+        Specifies whether decoy entries should be removed from the final result. Defaults to True.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing entries below the specified FDR threshold, optionally excluding decoys.
+    """
+    df.sort_values("proba", ascending=True, inplace=True)
+    df = keep_best(df, group_columns=group_columns)
+    df = get_q_values(df, "proba", "decoy")
+
+    df_subset = df[df["qval"] < fdr]
+    if remove_decoys:
+        df_subset = df_subset[df_subset["decoy"] == 0]
+
+    # Handle case where no entries are below the FDR threshold
+    if len(df_subset) == 0:
+        df = df[df["decoy"] == 0]
+        df_subset = df.loc[[df["qval"].idxmin()]]
+
+    return df_subset
+
+
+def get_target_decoy_partners(
+    reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None
+) -> pd.DataFrame:
+    """
+    Identifies and returns the corresponding target and decoy partner rows in full_df for the subset reference_df.
+    This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index.
+
+    Parameters
+    ----------
+    reference_df : pd.DataFrame
+        A subset DataFrame that contains reference values for matching.
+    full_df : pd.DataFrame
+        The main DataFrame from which rows will be matched against reference_df.
+    group_by : list[str] | None, optional
+        The columns to group by when performing the match. Defaults to ['rank', 'elution_group_idx'] if None is provided.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame containing rows from full_df that match the grouping criteria.
+
+    """
+    if group_by is None:
+        group_by = ["rank", "elution_group_idx"]
+    valid_tuples = reference_df[group_by]
+    matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner")
+
+    return matching_rows
+
+
+def apply_absolute_transformations(
+    df: pd.DataFrame, columns: list[str] | None = None
+) -> pd.DataFrame:
+    """
+    Applies absolute value transformations to predefined columns in a DataFrame in place.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame containing the data to be transformed.
+    columns : list of str, optional
+        List of column names to transform. Defaults to ['delta_rt', 'top_3_ms2_mass_error', 'mean_ms2_mass_error'].
+
+    Returns
+    -------
+    pd.DataFrame
+        The transformed DataFrame.
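+
+    Examples
+    --------
+    Illustrative only; listed columns that are missing from ``df`` merely
+    trigger a warning::
+
+        df = pd.DataFrame({"delta_rt": [-0.3, 0.1]})
+        df = apply_absolute_transformations(df)
+        # df["delta_rt"] is now [0.3, 0.1]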
+ """ + if columns is None: + columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"] + + for col in columns: + if col in df.columns: + df[col] = np.abs(df[col]) + else: + logger.warning( + f"column '{col}' is not present in df, therefore abs() was not applied." + ) + + return df \ No newline at end of file diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index a0dbe8cb..07f22ce3 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -8,52 +8,15 @@ # alpha family imports # third party imports import numpy as np -import pandas as pd import torch from sklearn import model_selection -from sklearn.linear_model import LogisticRegression -from sklearn.preprocessing import StandardScaler from torch import nn, optim from torchmetrics.classification import BinaryAUROC from tqdm import tqdm -from alphadia.fdr import get_q_values, keep_best - logger = logging.getLogger() -def apply_absolute_transformations( - df: pd.DataFrame, columns: list[str] | None = None -) -> pd.DataFrame: - """ - Applies absolute value transformations to predefined columns in a DataFrame inplace. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame containing the data to be transformed. - columns : list of str, optional - List of column names to transform. Defaults to ['delta_rt', 'top_3_ms2_mass_error', 'mean_ms2_mass_error']. - - Returns - ------- - pd.DataFrame - The transformed DataFrame. - """ - if columns is None: - columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"] - - for col in columns: - if col in df.columns: - df[col] = np.abs(df[col]) - else: - logger.warning( - f"column '{col}' is not present in df, therefore abs() was not applied." - ) - - return df - - class Classifier(ABC): """Abstract base class for classifiers. @@ -147,374 +110,6 @@ def from_state_dict(self, state_dict: dict): """ -class TwoStepClassifier: - def __init__( - self, - first_classifier: Classifier, - second_classifier: Classifier, - train_on_top_n: int = 1, - first_fdr_cutoff: float = 0.6, - second_fdr_cutoff: float = 0.01, - ): - """ - A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage. - - Parameters - ---------- - first_classifier : Classifier - The first classifier used to initially filter the data. - second_classifier : Classifier - The second classifier used to further refine or confirm the classification based on the output from the first classifier. - train_on_top_n : int, default=1 - The number of top candidates that are considered for training of the second classifier. - first_fdr_cutoff : float, default=0.6 - The fdr threshold for the first classifier, determining how selective the first classification step is. - second_fdr_cutoff : float, default=0.01 - The fdr threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results. - - """ - self.first_classifier = first_classifier - self.second_classifier = second_classifier - self.first_fdr_cutoff = first_fdr_cutoff - self.second_fdr_cutoff = second_fdr_cutoff - - self.train_on_top_n = train_on_top_n - - def fit_predict( - self, - df: pd.DataFrame, - x_cols: list[str], - y_col: str = "decoy", - group_columns: list[str] | None = None, - ) -> pd.DataFrame: - """ - Train the two-step classifier and predict resulting precursors, returning a DataFrame of only the predicted precursors. 
- - Parameters - ---------- - df : pd.DataFrame - The input DataFrame from which predictions are to be made. - x_cols : list[str] - List of column names representing the features to be used for prediction. - y_col : str, optional - The name of the column that denotes the target variable, by default 'decoy'. - group_columns : list[str] | None, optional - List of column names to group by for fdr calculations;. If None, fdr calculations will not be grouped. - - Returns - ------- - pd.DataFrame - A DataFrame containing only the predicted precursors. - - """ - df.dropna(subset=x_cols, inplace=True) - df = apply_absolute_transformations(df) - - if self.first_classifier.fitted: - X = df[x_cols].to_numpy() - df["proba"] = self.first_classifier.predict_proba(X)[:, 1] - df_subset = get_entries_below_fdr( - df, self.first_fdr_cutoff, group_columns, remove_decoys=False - ) - - self.second_classifier.batch_size = 500 - self.second_classifier.epochs = 50 - - df_train = df_subset - df_predict = df_subset - - else: - df_train = df[df["rank"] < self.train_on_top_n] - df_predict = df - - self.second_classifier.fit( - df_train[x_cols].to_numpy().astype(np.float32), - df_train[y_col].to_numpy().astype(np.float32), - ) - - df_predict["proba"] = self.second_classifier.predict_proba( - df_predict[x_cols].to_numpy() - )[:, 1] - df_predict = get_entries_below_fdr( - df_predict, self.second_fdr_cutoff, group_columns, remove_decoys=False - ) - df_targets = df_predict[df_predict["decoy"] == 0] - - self.update_first_classifier( - df=get_target_decoy_partners(df_predict, df), - x_cols=x_cols, - y_col=y_col, - group_columns=group_columns, - ) - - return df_targets - - def update_first_classifier( - self, - df: pd.DataFrame, - x_cols: list[str], - y_col: str, - group_columns: list[str], - ) -> None: - """ - Update the first classifier only if it improves upon the previous version or if it has not been previously fitted. - - Parameters - ---------- - df : pd.DataFrame - DataFrame containing the features and target. - x_cols : list[str] - List of column names representing the features. - y_col : str - Name of the column representing the target variable. - group_columns : list[str] - Columns used to group data for FDR calculation. - - """ - X = df[x_cols].to_numpy() - y = df[y_col].to_numpy() - - previous_n_precursors = -1 - - if self.first_classifier.fitted: - df["proba"] = self.first_classifier.predict_proba(X)[:, 1] - df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns) - previous_n_precursors = len(df_targets) - previous_state_dict = self.first_classifier.to_state_dict() - - self.first_classifier.fit(X, y) - - df["proba"] = self.first_classifier.predict_proba(X)[:, 1] - df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns) - current_n_precursors = len(df_targets) - - if previous_n_precursors > current_n_precursors: - self.first_classifier.from_state_dict(previous_state_dict) - - @property - def fitted(self) -> bool: - """Return whether both classifiers have been fitted.""" - return self.second_classifier.fitted - - def to_state_dict(self) -> dict: - """Save classifier state. 
- - Returns - ------- - dict - State dictionary containing both classifiers - """ - return { - "first_classifier": self.first_classifier.to_state_dict(), - "second_classifier": self.second_classifier.to_state_dict(), - "first_fdr_cutoff": self.first_fdr_cutoff, - "second_fdr_cutoff": self.second_fdr_cutoff, - "train_on_top_n": self.train_on_top_n, - } - - def from_state_dict(self, state_dict: dict) -> None: - """Load classifier state. - - Parameters - ---------- - state_dict : dict - State dictionary containing both classifiers - """ - self.first_classifier.from_state_dict(state_dict["first_classifier"]) - self.second_classifier.from_state_dict(state_dict["second_classifier"]) - self.first_fdr_cutoff = state_dict["first_fdr_cutoff"] - self.second_fdr_cutoff = state_dict["second_fdr_cutoff"] - self.train_on_top_n = state_dict["train_on_top_n"] - - -def get_entries_below_fdr( - df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True -) -> pd.DataFrame: - """ - Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries. - If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame containing the columns 'proba', 'decoy', and any specified group columns. - fdr : float - The false discovery rate threshold for filtering entries. - group_columns : list - List of columns to group by when determining the best entries per group. - remove_decoys : bool, optional - Specifies whether decoy entries should be removed from the final result. Defaults to True. - - Returns - ------- - pd.DataFrame - A DataFrame containing entries below the specified FDR threshold, optionally excluding decoys. - """ - df.sort_values("proba", ascending=True, inplace=True) - df = keep_best(df, group_columns=group_columns) - df = get_q_values(df, "proba", "decoy") - - df_subset = df[df["qval"] < fdr] - if remove_decoys: - df_subset = df_subset[df_subset["decoy"] == 0] - - # Handle case where no entries are below the FDR threshold - if len(df_subset) == 0: - df = df[df["decoy"] == 0] - df_subset = df.loc[[df["qval"].idxmin()]] - - return df_subset - - -def get_target_decoy_partners( - reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None -) -> pd.DataFrame: - """ - Identifies and returns the corresponding target and decoy wartner rows in full_df given the subset reference_df/ - This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index. - - Parameters - ---------- - reference_df : pd.DataFrame - A subset DataFrame that contains reference values for matching. - full_df : pd.DataFrame - The main DataFrame from which rows will be matched against reference_df. - group_by : list[str] | None, optional - The columns to group by when performing the match. Defaults to ['rank', 'elution_group_idx'] if None is provided. - - Returns - ------- - pd.DataFrame - A DataFrame containing rows from full_df that match the grouping criteria. 
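The inner-merge lookup this helper performs is easy to verify on toy data; a self-contained pandas sketch of the same idea (illustrative values, plain pandas rather than the alphadia function):

    import pandas as pd

    # Keep every row of full_df whose (rank, elution_group_idx) pair occurs in
    # reference_df, i.e. a passing precursor together with its target/decoy partner.
    reference_df = pd.DataFrame({"rank": [1], "elution_group_idx": [100]})
    full_df = pd.DataFrame(
        {
            "rank": [1, 1, 2],
            "elution_group_idx": [100, 100, 101],
            "decoy": [0, 1, 0],
        }
    )
    partners = full_df.merge(
        reference_df[["rank", "elution_group_idx"]],
        on=["rank", "elution_group_idx"],
        how="inner",
    )
    assert len(partners) == 2  # the matching target and its decoy partner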
- - """ - if group_by is None: - group_by = ["rank", "elution_group_idx"] - valid_tuples = reference_df[group_by] - matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner") - - return matching_rows - - -class LogisticRegressionClassifier(Classifier): - def __init__(self) -> None: - """Binary classifier using a logistic regression model.""" - self.scaler = StandardScaler() - self.model = LogisticRegression() - self._fitted = False - - @property - def fitted(self) -> bool: - return self._fitted - - def fit(self, x: np.ndarray, y: np.ndarray) -> None: - """Fit the classifier to the data. - - Parameters - ---------- - - x : np.array, dtype=float - Training data of shape (n_samples, n_features). - - y : np.array, dtype=int - Target values of shape (n_samples,) or (n_samples, n_classes). - - """ - x_scaled = self.scaler.fit_transform(x) - self.model.fit(x_scaled, y) - self._fitted = True - - def predict(self, x: np.ndarray) -> np.ndarray: - """Predict the class of the data. - - Parameters - ---------- - - x : np.array, dtype=float - Data of shape (n_samples, n_features). - - Returns - ------- - - y : np.array, dtype=float - Predicted class probabilities of shape (n_samples, n_classes). - - """ - x_scaled = self.scaler.transform(x) - return self.model.predict(x_scaled) - - def predict_proba(self, x: np.ndarray) -> np.ndarray: - """Predict the class probabilities of the data. - - Parameters - ---------- - - x : np.array, dtype=float - Data of shape (n_samples, n_features). - - Returns - ------- - - y : np.array, dtype=float - Predicted class probabilities of shape (n_samples, n_classes). - - """ - x_scaled = self.scaler.transform(x) - return self.model.predict_proba(x_scaled) - - def to_state_dict(self) -> dict: - """Return the state of the classifier as a dictionary. - - Returns - ------- - - dict : dict - Dictionary containing the state of the classifier. - - """ - state_dict = {"_fitted": self._fitted} - - if self._fitted: - state_dict.update( - { - "scaler_mean": self.scaler.mean_, - "scaler_var": self.scaler.var_, - "scaler_scale": self.scaler.scale_, - "scaler_n_samples_seen": self.scaler.n_samples_seen_, - "model_coef": self.model.coef_, - "model_intercept": self.model.intercept_, - "model_classes": self.model.classes_, - "is_fitted": self._fitted, - } - ) - - return state_dict - - def from_state_dict(self, state_dict: dict) -> None: - """Load the state of the classifier from a dictionary. - - Parameters - ---------- - - dict : dict - Dictionary containing the state of the classifier. 
- - """ - self._fitted = state_dict["_fitted"] - - if self.fitted: - self.scaler = StandardScaler() - self.scaler.mean_ = np.array(state_dict["scaler_mean"]) - self.scaler.var_ = np.array(state_dict["scaler_var"]) - self.scaler.scale_ = np.array(state_dict["scaler_scale"]) - self.scaler.n_samples_seen_ = np.array(state_dict["scaler_n_samples_seen"]) - - self.model = LogisticRegression() - self.model.coef_ = np.array(state_dict["model_coef"]) - self.model.intercept_ = np.array(state_dict["model_intercept"]) - self.model.classes_ = np.array(state_dict["model_classes"]) - - class BinaryClassifier(Classifier): def __init__( self, diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 197713fe..78dc5160 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -18,9 +18,9 @@ # alphadia imports import alphadia -import alphadia.fdrexperimental as fdrx from alphadia import fdr from alphadia.calibration.property import Calibration, calibration_model_provider +from alphadia.fdr_analysis.models import TwoStepClassifier from alphadia.workflow import reporting from alphadia.workflow.config import Config @@ -629,7 +629,7 @@ def __init__( self.classifier_store = defaultdict(list) self.classifier_base = classifier_base self.enable_two_step_classifier = isinstance( - classifier_base, fdrx.TwoStepClassifier + classifier_base, TwoStepClassifier ) self._current_version = -1 diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index fe1be321..ffcb4eaf 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -15,6 +15,7 @@ # alphadia imports from alphadia import fragcomp, plexscoring, utils +from alphadia.fdr_analysis.models import LogisticRegressionClassifier, TwoStepClassifier from alphadia.peakgroup import search from alphadia.workflow import base, manager, optimization from alphadia.workflow.config import Config @@ -105,8 +106,8 @@ def get_classifier_base(enable_two_step_classifier: bool = False): ) if enable_two_step_classifier: - return fdrx.TwoStepClassifier( - first_classifier=fdrx.LogisticRegressionClassifier(), + return TwoStepClassifier( + first_classifier=LogisticRegressionClassifier(), second_classifier=nn_classifier, ) else: From ceabe7c38feaa86d66e60ec4a3ab935a335427d5 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 17 Jan 2025 15:39:26 +0100 Subject: [PATCH 15/24] move files from fdr_analysis to fdrx module --- alphadia/fdrexperimental.py | 1 - alphadia/{fdr_analysis => fdrx}/models/__init__.py | 0 alphadia/{fdr_analysis => fdrx}/models/logistic_regression.py | 0 alphadia/{fdr_analysis => fdrx}/models/two_step_classifier.py | 2 +- alphadia/workflow/manager.py | 2 +- alphadia/workflow/peptidecentric.py | 2 +- 6 files changed, 3 insertions(+), 4 deletions(-) rename alphadia/{fdr_analysis => fdrx}/models/__init__.py (100%) rename alphadia/{fdr_analysis => fdrx}/models/logistic_regression.py (100%) rename alphadia/{fdr_analysis => fdrx}/models/two_step_classifier.py (99%) diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py index 07f22ce3..af9faa01 100644 --- a/alphadia/fdrexperimental.py +++ b/alphadia/fdrexperimental.py @@ -1179,7 +1179,6 @@ def fit(self, x: np.ndarray, y: np.ndarray): x_train_batch = x_train[batch_start:batch_stop] y_train_batch = y_train[batch_start:batch_stop] y_pred = self.network(x_train_batch) - loss_value = loss(y_pred, y_train_batch) self.network.zero_grad() diff --git a/alphadia/fdr_analysis/models/__init__.py 
b/alphadia/fdrx/models/__init__.py similarity index 100% rename from alphadia/fdr_analysis/models/__init__.py rename to alphadia/fdrx/models/__init__.py diff --git a/alphadia/fdr_analysis/models/logistic_regression.py b/alphadia/fdrx/models/logistic_regression.py similarity index 100% rename from alphadia/fdr_analysis/models/logistic_regression.py rename to alphadia/fdrx/models/logistic_regression.py diff --git a/alphadia/fdr_analysis/models/two_step_classifier.py b/alphadia/fdrx/models/two_step_classifier.py similarity index 99% rename from alphadia/fdr_analysis/models/two_step_classifier.py rename to alphadia/fdrx/models/two_step_classifier.py index 62e8141f..3f0c4290 100644 --- a/alphadia/fdr_analysis/models/two_step_classifier.py +++ b/alphadia/fdrx/models/two_step_classifier.py @@ -285,4 +285,4 @@ def apply_absolute_transformations( f"column '{col}' is not present in df, therefore abs() was not applied." ) - return df \ No newline at end of file + return df diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 78dc5160..07625d13 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -20,7 +20,7 @@ import alphadia from alphadia import fdr from alphadia.calibration.property import Calibration, calibration_model_provider -from alphadia.fdr_analysis.models import TwoStepClassifier +from alphadia.fdrx.models import TwoStepClassifier from alphadia.workflow import reporting from alphadia.workflow.config import Config diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index ffcb4eaf..a86e2958 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -15,7 +15,7 @@ # alphadia imports from alphadia import fragcomp, plexscoring, utils -from alphadia.fdr_analysis.models import LogisticRegressionClassifier, TwoStepClassifier +from alphadia.fdrx.models import LogisticRegressionClassifier, TwoStepClassifier from alphadia.peakgroup import search from alphadia.workflow import base, manager, optimization from alphadia.workflow.config import Config From fdf62db51581c0195b414f2856cd3ad92e1db83a Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Mon, 20 Jan 2025 17:45:39 +0100 Subject: [PATCH 16/24] add max_iteration parameter to 2-step-classifier.fit_predict() --- alphadia/fdrx/models/two_step_classifier.py | 172 ++++++++++++++------ alphadia/workflow/manager.py | 2 +- alphadia/workflow/peptidecentric.py | 24 ++- 3 files changed, 143 insertions(+), 55 deletions(-) diff --git a/alphadia/fdrx/models/two_step_classifier.py b/alphadia/fdrx/models/two_step_classifier.py index 3f0c4290..0c2341a1 100644 --- a/alphadia/fdrx/models/two_step_classifier.py +++ b/alphadia/fdrx/models/two_step_classifier.py @@ -14,9 +14,10 @@ def __init__( self, first_classifier: Classifier, second_classifier: Classifier, - train_on_top_n: int = 1, first_fdr_cutoff: float = 0.6, second_fdr_cutoff: float = 0.01, + min_precursors_for_update: int = 5000, + train_on_top_n: int = 1, ): """ A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage. @@ -27,12 +28,12 @@ def __init__( The first classifier used to initially filter the data. second_classifier : Classifier The second classifier used to further refine or confirm the classification based on the output from the first classifier. - train_on_top_n : int, default=1 - The number of top candidates that are considered for training of the second classifier. 
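For orientation, a construction sketch for the signature being updated in this hunk (module paths follow the fdrx move above; the second LogisticRegressionClassifier is a stand-in for the neural network, so this is illustrative rather than the production pairing):

    from alphadia.fdrx.models.logistic_regression import LogisticRegressionClassifier
    from alphadia.fdrx.models.two_step_classifier import TwoStepClassifier

    classifier = TwoStepClassifier(
        first_classifier=LogisticRegressionClassifier(),
        second_classifier=LogisticRegressionClassifier(),  # stand-in for the NN
        first_fdr_cutoff=0.6,
        second_fdr_cutoff=0.01,
        min_precursors_for_update=5000,
        train_on_top_n=1,
    )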
first_fdr_cutoff : float, default=0.6 The fdr threshold for the first classifier, determining how selective the first classification step is. second_fdr_cutoff : float, default=0.01 The fdr threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results. + min_precursors_for_update : int, default=5000 + The minimum number of precursors required to update the first classifier. """ self.first_classifier = first_classifier @@ -40,6 +41,7 @@ def __init__( self.first_fdr_cutoff = first_fdr_cutoff self.second_fdr_cutoff = second_fdr_cutoff + self.min_precursors_for_update = min_precursors_for_update self.train_on_top_n = train_on_top_n def fit_predict( @@ -48,89 +50,133 @@ def fit_predict( x_cols: list[str], y_col: str = "decoy", group_columns: list[str] | None = None, + max_iterations: int = 5, ) -> pd.DataFrame: """ - Train the two-step classifier and predict resulting precursors, returning a DataFrame of only the predicted precursors. + Train the two-step classifier and predict precursors using an iterative approach: + 1. First iteration: Train neural network on top-n candidates. + 2. Update linear classifier if enough high-confidence predictions are found, else break. + 3. Subsequent iterations: Use linear classifier to filter data, then refine with neural network. Parameters ---------- df : pd.DataFrame - The input DataFrame from which predictions are to be made. + Input DataFrame containing features and target variable x_cols : list[str] - List of column names representing the features to be used for prediction. + Feature column names y_col : str, optional - The name of the column that denotes the target variable, by default 'decoy'. + Target variable column name, defaults to 'decoy' group_columns : list[str] | None, optional - List of column names to group by for fdr calculations;. If None, fdr calculations will not be grouped. + Columns to group by for FDR calculations + max_iterations : int + Maximum number of refinement iterations Returns ------- pd.DataFrame - A DataFrame containing only the predicted precursors. 
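A hypothetical call mirroring the new signature (`classifier` as sketched above; `candidates_df` is an illustrative frame that must carry the feature columns plus "decoy", "rank" and the grouping columns -- a sketch, not runnable on its own):

    df_out = classifier.fit_predict(
        candidates_df,
        x_cols=["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"],
        y_col="decoy",
        group_columns=["elution_group_idx"],
        max_iterations=5,
    )

With train_on_top_n=1, the first iteration trains the second classifier only on rank-0 candidates (df[df["rank"] < 1]); later iterations train on whatever survives the first classifier's FDR filter.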
+ DataFrame containing predictions and q-values """ - df.dropna(subset=x_cols, inplace=True) - df = apply_absolute_transformations(df) - - if self.first_classifier.fitted: - X = df[x_cols].to_numpy() - df["proba"] = self.first_classifier.predict_proba(X)[:, 1] - df_subset = get_entries_below_fdr( - df, self.first_fdr_cutoff, group_columns, remove_decoys=False + df = self.preprocess_data(df, x_cols) + best_result = None + best_precursor_count = -1 + + for i in range(max_iterations): + if self.first_classifier.fitted and i > 0: + df_train, df_predict = self.apply_filtering_with_first_classifier( + df, x_cols, group_columns + ) + self.second_classifier.epochs = 50 + else: + df_train = df[df["rank"] < self.train_on_top_n] + df_predict = df + self.second_classifier.epochs = 10 + + predictions = self.train_and_apply_second_classifier( + df_train, df_predict, x_cols, y_col, group_columns ) - self.second_classifier.epochs = 50 + # Filter results and check for improvement + df_filtered = filter_for_qval(predictions, self.second_fdr_cutoff) + current_target_count = len(df_filtered[df_filtered["decoy"] == 0]) - df_train = df_subset - df_predict = df_subset + if current_target_count < best_precursor_count: + logger.info( + f"Stop training after iteration {i}, " + f"due to decreasing target count ({current_target_count} < {best_precursor_count})" + ) + return best_result - else: - df_train = df[df["rank"] < self.train_on_top_n] - df_predict = df + best_precursor_count = current_target_count + best_result = predictions - self.second_classifier.fit( - df_train[x_cols].to_numpy().astype(np.float32), - df_train[y_col].to_numpy().astype(np.float32), - ) - X = df_predict[x_cols].to_numpy() - df_predict["proba"] = self.second_classifier.predict_proba(X)[:, 1] - df_predict = get_entries_below_fdr( - df_predict, self.second_fdr_cutoff, group_columns, remove_decoys=False + # Update first classifier if enough confident predictions + if current_target_count > self.min_precursors_for_update: + self.update_first_classifier( + df_filtered, df, x_cols, y_col, group_columns + ) + else: + break + + return best_result + + def preprocess_data(self, df: pd.DataFrame, x_cols: list[str]) -> pd.DataFrame: + """ + Prepare data by removing NaN values and applying absolute transformations. + """ + df.dropna(subset=x_cols, inplace=True) + return apply_absolute_transformations(df) + + def apply_filtering_with_first_classifier( + self, df: pd.DataFrame, x_cols: list[str], group_columns: list[str] + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """ + Apply first classifier to filter data for the training of the second classifier. + """ + df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1] + + filtered_df = get_entries_below_fdr( + df, self.first_fdr_cutoff, group_columns, remove_decoys=False ) - df_targets = df_predict[df_predict["decoy"] == 0] + return filtered_df, filtered_df - self.update_first_classifier( - df=get_target_decoy_partners(df_predict, df), - x_cols=x_cols, - y_col=y_col, - group_columns=group_columns, + def train_and_apply_second_classifier( + self, + train_df: pd.DataFrame, + predict_df: pd.DataFrame, + x_cols: list[str], + y_col: str, + group_columns: list[str], + ) -> pd.DataFrame: + """ + Train second_classifeir and apply it to get predictions. 
+ """ + self.second_classifier.fit( + train_df[x_cols].to_numpy().astype(np.float32), + train_df[y_col].to_numpy().astype(np.float32), ) - return df_targets + X = predict_df[x_cols].to_numpy() + predict_df["proba"] = self.second_classifier.predict_proba(X)[:, 1] + + return compute_q_values(predict_df, group_columns) def update_first_classifier( self, - df: pd.DataFrame, + subset_df: pd.DataFrame, + full_df: pd.DataFrame, x_cols: list[str], y_col: str, group_columns: list[str], ) -> None: """ - Update the first classifier only if it improves upon the previous version or if it has not been previously fitted. - - Parameters - ---------- - df : pd.DataFrame - DataFrame containing the features and target. - x_cols : list[str] - List of column names representing the features. - y_col : str - Name of the column representing the target variable. - group_columns : list[str] - Columns used to group data for FDR calculation. - + Update first classifier by finding and using target/decoy pairs. First extracts the corresponding + target/decoy partners from the full dataset for each entry in the subset, then uses these + pairs to retrain the classifier. """ + df = get_target_decoy_partners(subset_df, full_df) + X = df[x_cols].to_numpy() y = df[y_col].to_numpy() @@ -149,7 +195,13 @@ def update_first_classifier( current_n_precursors = len(df_targets) if previous_n_precursors > current_n_precursors: + logger.info( + f"Reverted the first classifier back to the previous version " + f"(prev: {previous_n_precursors}, curr: {current_n_precursors})" + ) self.first_classifier.from_state_dict(previous_state_dict) + else: + logger.info("Fitted the second classifier") @property def fitted(self) -> bool: @@ -187,6 +239,22 @@ def from_state_dict(self, state_dict: dict) -> None: self.train_on_top_n = state_dict["train_on_top_n"] +def compute_q_values(df: pd.DataFrame, group_columns: list[str]): + df.sort_values("proba", ascending=True, inplace=True) + df = keep_best(df, group_columns=group_columns) + return get_q_values(df, "proba", "decoy") + + +def filter_for_qval(df: pd.DataFrame, fdr_cutoff: float): + df_filtered = df[df["qval"] < fdr_cutoff] + + if len(df_filtered) == 0: + df_targets = df[df["decoy"] == 0] + df_filtered = df_targets.loc[[df_targets["qval"].idxmin()]] + + return df_filtered + + def get_entries_below_fdr( df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True ) -> pd.DataFrame: diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 07625d13..74eb7577 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -607,7 +607,6 @@ def __init__( classifier_base, path: None | str = None, load_from_file: bool = True, - enable_two_step_classifier: bool = False, **kwargs, ): """Contains, updates and applies classifiers for target-decoy competitio-based false discovery rate (FDR) estimation. 
@@ -767,6 +766,7 @@ def fit_predict( raise ValueError(f"Invalid decoy_strategy: {decoy_strategy}") self.is_fitted = True + # n_precursor = len(psm_df[psm_df["qval"] <= 0.01]) self._current_version += 1 self.classifier_store[column_hash(available_columns)].append(classifier) diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index a86e2958..7a793552 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -96,7 +96,25 @@ ] -def get_classifier_base(enable_two_step_classifier: bool = False): +def get_classifier_base( + enable_two_step_classifier: bool = False, fdr_cutoff: float = 0.01 +): + """Creates and returns a classifier base instance. + + Parameters + ---------- + enable_two_step_classifier : bool, optional + If True, uses logistic regression + neural network. + If False (default), uses only neural network. + fdr_cutoff : float, optional + The FDR cutoff threshold used by the second classifier when two-step + classification is enabled. Default is 0.01. + + Returns + ------- + BinaryClassifierLegacyNewBatching | TwoStepClassifier + Neural network or two-step classifier based on enable_two_step_classifier. + """ nn_classifier = fdrx.BinaryClassifierLegacyNewBatching( test_size=0.001, batch_size=5000, @@ -109,6 +127,7 @@ def get_classifier_base(enable_two_step_classifier: bool = False): return TwoStepClassifier( first_classifier=LogisticRegressionClassifier(), second_classifier=nn_classifier, + second_fdr_cutoff=fdr_cutoff, ) else: return nn_classifier @@ -149,7 +168,8 @@ def init_fdr_manager(self): self.fdr_manager = manager.FDRManager( feature_columns=feature_columns, classifier_base=get_classifier_base( - self.config["fdr"]["enable_two_step_classifier"] + self.config["fdr"]["enable_two_step_classifier"], + self.config["fdr"]["fdr"], ), ) From e6d6b95f0a829ddb3e68cfb4359f58fdb1e3c1fe Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 21 Jan 2025 11:14:24 +0100 Subject: [PATCH 17/24] refactoring of two-step-classifier helper functions --- alphadia/fdrx/models/two_step_classifier.py | 68 +++++++++------------ 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/alphadia/fdrx/models/two_step_classifier.py b/alphadia/fdrx/models/two_step_classifier.py index 0c2341a1..9551b005 100644 --- a/alphadia/fdrx/models/two_step_classifier.py +++ b/alphadia/fdrx/models/two_step_classifier.py @@ -55,8 +55,8 @@ def fit_predict( """ Train the two-step classifier and predict precursors using an iterative approach: 1. First iteration: Train neural network on top-n candidates. - 2. Update linear classifier if enough high-confidence predictions are found, else break. - 3. Subsequent iterations: Use linear classifier to filter data, then refine with neural network. + 2. Subsequent iterations: Use linear classifier to filter data, then refine with neural network. + 3. Update linear classifier if enough high-confidence predictions are found, else break. 
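Stepping back to the workflow wiring above, the config plumbing reduces to a single call (values illustrative):

    from alphadia.workflow.peptidecentric import get_classifier_base

    # Two-step mode: config["fdr"]["fdr"] becomes the second classifier's FDR cutoff.
    classifier_base = get_classifier_base(
        enable_two_step_classifier=True,
        fdr_cutoff=0.01,
    )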
Parameters ---------- @@ -97,7 +97,7 @@ def fit_predict( ) # Filter results and check for improvement - df_filtered = filter_for_qval(predictions, self.second_fdr_cutoff) + df_filtered = filter_by_qval(predictions, self.second_fdr_cutoff) current_target_count = len(df_filtered[df_filtered["decoy"] == 0]) if current_target_count < best_precursor_count: @@ -135,7 +135,7 @@ def apply_filtering_with_first_classifier( """ df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1] - filtered_df = get_entries_below_fdr( + filtered_df = compute_and_filter_q_values( df, self.first_fdr_cutoff, group_columns, remove_decoys=False ) @@ -184,14 +184,18 @@ def update_first_classifier( if self.first_classifier.fitted: df["proba"] = self.first_classifier.predict_proba(X)[:, 1] - df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns) + df_targets = compute_and_filter_q_values( + df, self.first_fdr_cutoff, group_columns + ) previous_n_precursors = len(df_targets) previous_state_dict = self.first_classifier.to_state_dict() self.first_classifier.fit(X, y) df["proba"] = self.first_classifier.predict_proba(X)[:, 1] - df_targets = get_entries_below_fdr(df, self.first_fdr_cutoff, group_columns) + df_targets = compute_and_filter_q_values( + df, self.first_fdr_cutoff, group_columns + ) current_n_precursors = len(df_targets) if previous_n_precursors > current_n_precursors: @@ -239,13 +243,22 @@ def from_state_dict(self, state_dict: dict) -> None: self.train_on_top_n = state_dict["train_on_top_n"] -def compute_q_values(df: pd.DataFrame, group_columns: list[str]): +def compute_q_values( + df: pd.DataFrame, group_columns: list[str] | None = None +) -> pd.DataFrame: + """ + Compute q-values for each entry after keeping only best entries per group. + """ df.sort_values("proba", ascending=True, inplace=True) df = keep_best(df, group_columns=group_columns) return get_q_values(df, "proba", "decoy") -def filter_for_qval(df: pd.DataFrame, fdr_cutoff: float): +def filter_by_qval(df: pd.DataFrame, fdr_cutoff: float) -> pd.DataFrame: + """ + Filter dataframe by q-value threshold. If no entries pass the threshold, + return the single target entry with lowest q-value. + """ df_filtered = df[df["qval"] < fdr_cutoff] if len(df_filtered) == 0: @@ -255,50 +268,27 @@ def filter_for_qval(df: pd.DataFrame, fdr_cutoff: float): return df_filtered -def get_entries_below_fdr( - df: pd.DataFrame, fdr: float, group_columns: list[str], remove_decoys: bool = True +def compute_and_filter_q_values( + df: pd.DataFrame, + fdr: float, + group_columns: list[str] | None = None, + remove_decoys: bool = True, ) -> pd.DataFrame: """ Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries. If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame containing the columns 'proba', 'decoy', and any specified group columns. - fdr : float - The false discovery rate threshold for filtering entries. - group_columns : list - List of columns to group by when determining the best entries per group. - remove_decoys : bool, optional - Specifies whether decoy entries should be removed from the final result. Defaults to True. - - Returns - ------- - pd.DataFrame - A DataFrame containing entries below the specified FDR threshold, optionally excluding decoys. 
""" - df.sort_values("proba", ascending=True, inplace=True) - df = keep_best(df, group_columns=group_columns) - df = get_q_values(df, "proba", "decoy") - - df_subset = df[df["qval"] < fdr] + df = compute_q_values(df, group_columns) if remove_decoys: - df_subset = df_subset[df_subset["decoy"] == 0] - - # Handle case where no entries are below the FDR threshold - if len(df_subset) == 0: df = df[df["decoy"] == 0] - df_subset = df.loc[[df["qval"].idxmin()]] - - return df_subset + return filter_by_qval(df, fdr) def get_target_decoy_partners( reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None ) -> pd.DataFrame: """ - Identifies and returns the corresponding target and decoy wartner rows in full_df given the subset reference_df/ + Identifies and returns the corresponding target and decoy partner rows in full_df given the subset reference_df. This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index. Parameters From 65a11bb3b3c657f290263d54e147afe6b96513ff Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 21 Jan 2025 11:15:57 +0100 Subject: [PATCH 18/24] add unit tests --- tests/unit_tests/test_fdrx_models.py | 88 ++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 tests/unit_tests/test_fdrx_models.py diff --git a/tests/unit_tests/test_fdrx_models.py b/tests/unit_tests/test_fdrx_models.py new file mode 100644 index 00000000..0ec48aa0 --- /dev/null +++ b/tests/unit_tests/test_fdrx_models.py @@ -0,0 +1,88 @@ +import pandas as pd +import pytest +from collections import Counter + +from alphadia.fdrx.models.two_step_classifier import apply_absolute_transformations, get_target_decoy_partners, compute_and_filter_q_values + + +def test_apply_absolute_transformations(): + data = { + 'delta_rt': [-1, -2, 3], + 'top_3_ms2_mass_error': [-1, -2, -3], + 'mean_ms2_mass_error': [1, -2, 3], + 'extra_column': [-1, -2, -3] + } + df = pd.DataFrame(data) + + transformed_df = apply_absolute_transformations(df) + + assert (transformed_df['delta_rt'] >= 0).all(), "delta_rt contains negative values" + assert (transformed_df['top_3_ms2_mass_error'] >= 0).all(), "top_3_ms2_mass_error contains negative values" + assert (transformed_df['mean_ms2_mass_error'] >= 0).all(), "mean_ms2_mass_error contains negative values" + + assert (transformed_df['extra_column'] == df['extra_column']).all(), "extra_column should not be transformed" + + +@pytest.fixture +def setup_data(): + + reference_df = pd.DataFrame({ + 'decoy': [0, 1], + 'rank': [1, 0], + 'elution_group_idx': [100, 101] + }) + + full_df = pd.DataFrame({ + 'decoy': [ 0, 0, 1, 1, 0], + 'rank': [ 1, 0, 2, 1, 2], + 'elution_group_idx': [100, 101, 102, 100, 102], + 'intensity': [200, 150, 120, 130, 95], + 'peptide': ['pepA', 'pepB', 'pepC', 'pepD', 'pepE'] + + }) + + return reference_df, full_df + + +def test_get_target_decoy_partners_correct_extraction(setup_data): + reference_df, full_df = setup_data + group_columns = ['elution_group_idx', 'rank'] + result_df = get_target_decoy_partners(reference_df, full_df, group_by=group_columns) + + assert len(result_df) == 3 # should match rows with ("rank", "elution_group_idx")=(1,100) and (2,101) + assert all(col in result_df.columns for col in full_df.columns) + + assert Counter(result_df['decoy']) == Counter([0, 0, 1]) + assert Counter(result_df['peptide']) == Counter(['pepA', 'pepB', 'pepD']) + + +def test_handling_nonexistent_partners_in_get_target_decoy_partners_(setup_data): + reference_df, full_df = 
setup_data + + reference_df.loc[1] = [0, 3, 104] + result_df = get_target_decoy_partners(reference_df, full_df) + + assert len(result_df) == 3 + assert not result_df[(result_df['rank'] == 3) & (result_df['elution_group_idx'] == 104)].empty == True + + + +@pytest.mark.parametrize( + ["fdr", "remove_decoys", "expected_length", "expected_decoy_count"], + [ + (0.5, True, 3, 0), + (0.01, True, 1, 0), + (0.5, False, 4, 1), + (0.01, False, 1, 0), + ] +) +def test_compute_and_filter_q_values(fdr, remove_decoys, expected_length, expected_decoy_count): + df = pd.DataFrame({ + 'proba': [0.1, 0.3, 0.8, 0.9, 0.9, 0.2, 0.4, 0.5], + 'decoy': [0, 1, 0, 1, 0, 1, 0, 1], + 'group': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'] + }) + result = compute_and_filter_q_values(df, fdr=fdr, group_columns=['group'], remove_decoys=remove_decoys) + print(result) + assert len(result) == expected_length + assert len(result[result['decoy'] == 1]) == expected_decoy_count From ef6fc45b7cc7995db7d63620f6a04357845571bf Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 21 Jan 2025 11:26:41 +0100 Subject: [PATCH 19/24] formatting --- tests/unit_tests/test_fdrx_models.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/unit_tests/test_fdrx_models.py b/tests/unit_tests/test_fdrx_models.py index 0ec48aa0..d91e2ace 100644 --- a/tests/unit_tests/test_fdrx_models.py +++ b/tests/unit_tests/test_fdrx_models.py @@ -25,22 +25,22 @@ def test_apply_absolute_transformations(): @pytest.fixture def setup_data(): - + reference_df = pd.DataFrame({ - 'decoy': [0, 1], + 'decoy': [0, 1], 'rank': [1, 0], 'elution_group_idx': [100, 101] }) - + full_df = pd.DataFrame({ - 'decoy': [ 0, 0, 1, 1, 0], + 'decoy': [ 0, 0, 1, 1, 0], 'rank': [ 1, 0, 2, 1, 2], 'elution_group_idx': [100, 101, 102, 100, 102], 'intensity': [200, 150, 120, 130, 95], 'peptide': ['pepA', 'pepB', 'pepC', 'pepD', 'pepE'] - + }) - + return reference_df, full_df @@ -48,22 +48,22 @@ def test_get_target_decoy_partners_correct_extraction(setup_data): reference_df, full_df = setup_data group_columns = ['elution_group_idx', 'rank'] result_df = get_target_decoy_partners(reference_df, full_df, group_by=group_columns) - + assert len(result_df) == 3 # should match rows with ("rank", "elution_group_idx")=(1,100) and (2,101) assert all(col in result_df.columns for col in full_df.columns) - + assert Counter(result_df['decoy']) == Counter([0, 0, 1]) assert Counter(result_df['peptide']) == Counter(['pepA', 'pepB', 'pepD']) def test_handling_nonexistent_partners_in_get_target_decoy_partners_(setup_data): reference_df, full_df = setup_data - + reference_df.loc[1] = [0, 3, 104] result_df = get_target_decoy_partners(reference_df, full_df) - + assert len(result_df) == 3 - assert not result_df[(result_df['rank'] == 3) & (result_df['elution_group_idx'] == 104)].empty == True + assert not result_df[(result_df['rank'] == 3) & (result_df['elution_group_idx'] == 104)].empty == True From 584bab7e49f10c1da449f9b2d4b889612a3c9f38 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Tue, 21 Jan 2025 11:37:59 +0100 Subject: [PATCH 20/24] fix test for get_target_decoy_partners --- tests/unit_tests/test_fdrx_models.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit_tests/test_fdrx_models.py b/tests/unit_tests/test_fdrx_models.py index d91e2ace..13a12b45 100644 --- a/tests/unit_tests/test_fdrx_models.py +++ b/tests/unit_tests/test_fdrx_models.py @@ -56,15 +56,14 @@ def test_get_target_decoy_partners_correct_extraction(setup_data): 
assert Counter(result_df['peptide']) == Counter(['pepA', 'pepB', 'pepD']) -def test_handling_nonexistent_partners_in_get_target_decoy_partners_(setup_data): +def test_handling_nonexistent_partners_in_get_target_decoy_partners(setup_data): reference_df, full_df = setup_data reference_df.loc[1] = [0, 3, 104] result_df = get_target_decoy_partners(reference_df, full_df) - assert len(result_df) == 3 - assert not result_df[(result_df['rank'] == 3) & (result_df['elution_group_idx'] == 104)].empty == True - + assert len(result_df) == 2 + assert result_df[(result_df['rank'] == 3) & (result_df['elution_group_idx'] == 104)].empty == True @pytest.mark.parametrize( From c5e3eed39348be4bc44d4aca76eac6d8c03eff5a Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 24 Jan 2025 11:22:12 +0100 Subject: [PATCH 21/24] addressing PR comments --- alphadia/fdrx/models/__init__.py | 4 -- alphadia/fdrx/models/two_step_classifier.py | 41 ++++++++++++--------- alphadia/workflow/manager.py | 28 +++++++------- alphadia/workflow/peptidecentric.py | 3 +- 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/alphadia/fdrx/models/__init__.py b/alphadia/fdrx/models/__init__.py index 1e0053df..e69de29b 100644 --- a/alphadia/fdrx/models/__init__.py +++ b/alphadia/fdrx/models/__init__.py @@ -1,4 +0,0 @@ -from .logistic_regression import LogisticRegressionClassifier -from .two_step_classifier import TwoStepClassifier - -__all__ = ["LogisticRegressionClassifier", "TwoStepClassifier"] diff --git a/alphadia/fdrx/models/two_step_classifier.py b/alphadia/fdrx/models/two_step_classifier.py index 9551b005..9c2a2909 100644 --- a/alphadia/fdrx/models/two_step_classifier.py +++ b/alphadia/fdrx/models/two_step_classifier.py @@ -77,13 +77,13 @@ def fit_predict( DataFrame containing predictions and q-values """ - df = self.preprocess_data(df, x_cols) + df = self._preprocess_data(df, x_cols) best_result = None best_precursor_count = -1 for i in range(max_iterations): if self.first_classifier.fitted and i > 0: - df_train, df_predict = self.apply_filtering_with_first_classifier( + df_train, df_predict = self._apply_filtering_with_first_classifier( df, x_cols, group_columns ) self.second_classifier.epochs = 50 @@ -92,7 +92,7 @@ def fit_predict( df_predict = df self.second_classifier.epochs = 10 - predictions = self.train_and_apply_second_classifier( + predictions = self._train_and_apply_second_classifier( df_train, df_predict, x_cols, y_col, group_columns ) @@ -102,8 +102,8 @@ def fit_predict( if current_target_count < best_precursor_count: logger.info( - f"Stop training after iteration {i}, " - f"due to decreasing target count ({current_target_count} < {best_precursor_count})" + f"Stopping training after iteration {i}, " + f"due to decreased target count ({current_target_count} < {best_precursor_count})" ) return best_result @@ -112,22 +112,29 @@ def fit_predict( # Update first classifier if enough confident predictions if current_target_count > self.min_precursors_for_update: - self.update_first_classifier( + self._update_first_classifier( df_filtered, df, x_cols, y_col, group_columns ) else: + logger.info( + f"Stopping fitting after {i+1} / {max_iterations} iterations due to insufficient detected precursors to update the first classifier." + ) break + else: + logger.info( + f"Stopping fitting after reaching the maximum number of iterations: {max_iterations} / {max_iterations}." 
+ ) return best_result - def preprocess_data(self, df: pd.DataFrame, x_cols: list[str]) -> pd.DataFrame: + def _preprocess_data(self, df: pd.DataFrame, x_cols: list[str]) -> pd.DataFrame: """ Prepare data by removing NaN values and applying absolute transformations. """ df.dropna(subset=x_cols, inplace=True) return apply_absolute_transformations(df) - def apply_filtering_with_first_classifier( + def _apply_filtering_with_first_classifier( self, df: pd.DataFrame, x_cols: list[str], group_columns: list[str] ) -> tuple[pd.DataFrame, pd.DataFrame]: """ @@ -141,7 +148,7 @@ def apply_filtering_with_first_classifier( return filtered_df, filtered_df - def train_and_apply_second_classifier( + def _train_and_apply_second_classifier( self, train_df: pd.DataFrame, predict_df: pd.DataFrame, @@ -150,19 +157,19 @@ def train_and_apply_second_classifier( group_columns: list[str], ) -> pd.DataFrame: """ - Train second_classifeir and apply it to get predictions. + Train second_classifier and apply it to get predictions. """ self.second_classifier.fit( train_df[x_cols].to_numpy().astype(np.float32), train_df[y_col].to_numpy().astype(np.float32), ) - X = predict_df[x_cols].to_numpy() - predict_df["proba"] = self.second_classifier.predict_proba(X)[:, 1] + x = predict_df[x_cols].to_numpy() + predict_df["proba"] = self.second_classifier.predict_proba(x)[:, 1] return compute_q_values(predict_df, group_columns) - def update_first_classifier( + def _update_first_classifier( self, subset_df: pd.DataFrame, full_df: pd.DataFrame, @@ -177,22 +184,22 @@ def update_first_classifier( """ df = get_target_decoy_partners(subset_df, full_df) - X = df[x_cols].to_numpy() + x = df[x_cols].to_numpy() y = df[y_col].to_numpy() previous_n_precursors = -1 if self.first_classifier.fitted: - df["proba"] = self.first_classifier.predict_proba(X)[:, 1] + df["proba"] = self.first_classifier.predict_proba(x)[:, 1] df_targets = compute_and_filter_q_values( df, self.first_fdr_cutoff, group_columns ) previous_n_precursors = len(df_targets) previous_state_dict = self.first_classifier.to_state_dict() - self.first_classifier.fit(X, y) + self.first_classifier.fit(x, y) - df["proba"] = self.first_classifier.predict_proba(X)[:, 1] + df["proba"] = self.first_classifier.predict_proba(x)[:, 1] df_targets = compute_and_filter_q_values( df, self.first_fdr_cutoff, group_columns ) diff --git a/alphadia/workflow/manager.py b/alphadia/workflow/manager.py index 74eb7577..4766d474 100644 --- a/alphadia/workflow/manager.py +++ b/alphadia/workflow/manager.py @@ -1,5 +1,4 @@ # native imports -import contextlib import logging import os import pickle @@ -20,7 +19,7 @@ import alphadia from alphadia import fdr from alphadia.calibration.property import Calibration, calibration_model_provider -from alphadia.fdrx.models import TwoStepClassifier +from alphadia.fdrx.models.two_step_classifier import TwoStepClassifier from alphadia.workflow import reporting from alphadia.workflow.config import Config @@ -627,9 +626,7 @@ def __init__( self.feature_columns = feature_columns self.classifier_store = defaultdict(list) self.classifier_base = classifier_base - self.enable_two_step_classifier = isinstance( - classifier_base, TwoStepClassifier - ) + self.is_two_step_classifier = isinstance(classifier_base, TwoStepClassifier) self._current_version = -1 self.load_classifier_store() @@ -699,7 +696,7 @@ def fit_predict( classifier = self.get_classifier(available_columns, version) if decoy_strategy == "precursor": - if not self.enable_two_step_classifier: + if not 
self.is_two_step_classifier: psm_df = fdr.perform_fdr( classifier, available_columns, @@ -766,7 +763,6 @@ def fit_predict( raise ValueError(f"Invalid decoy_strategy: {decoy_strategy}") self.is_fitted = True - # n_precursor = len(psm_df[psm_df["qval"] <= 0.01]) self._current_version += 1 self.classifier_store[column_hash(available_columns)].append(classifier) @@ -815,15 +811,17 @@ def load_classifier_store(self, path: None | str = None): logger.info(f"Loading classifier store from {path}") - for file in os.listdir(path): - if file.endswith(".pth"): - classifier_hash = file.split(".")[0] - - if classifier_hash not in self.classifier_store: - classifier = deepcopy(self.classifier_base) - with contextlib.suppress(Exception): + if ( + not self.is_two_step_classifier + ): # TODO add pretrained model for TwoStepClassifier + for file in os.listdir(path): + if file.endswith(".pth"): + classifier_hash = file.split(".")[0] + + if classifier_hash not in self.classifier_store: + classifier = deepcopy(self.classifier_base) classifier.from_state_dict(torch.load(os.path.join(path, file))) - self.classifier_store[classifier_hash].append(classifier) + self.classifier_store[classifier_hash].append(classifier) def get_classifier(self, available_columns: list, version: int = -1): """Gets the classifier for a given set of feature columns and version. If the classifier is not found in the store, gets the base classifier instead. diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py index 7a793552..151763f1 100644 --- a/alphadia/workflow/peptidecentric.py +++ b/alphadia/workflow/peptidecentric.py @@ -15,7 +15,8 @@ # alphadia imports from alphadia import fragcomp, plexscoring, utils -from alphadia.fdrx.models import LogisticRegressionClassifier, TwoStepClassifier +from alphadia.fdrx.models.logistic_regression import LogisticRegressionClassifier +from alphadia.fdrx.models.two_step_classifier import TwoStepClassifier from alphadia.peakgroup import search from alphadia.workflow import base, manager, optimization from alphadia.workflow.config import Config From 21da1be230122dbb696c87541eb4f2a3d35bd89d Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 24 Jan 2025 11:36:46 +0100 Subject: [PATCH 22/24] addressing PR comments --- alphadia/fdrx/models/two_step_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alphadia/fdrx/models/two_step_classifier.py b/alphadia/fdrx/models/two_step_classifier.py index 9c2a2909..8f36ac28 100644 --- a/alphadia/fdrx/models/two_step_classifier.py +++ b/alphadia/fdrx/models/two_step_classifier.py @@ -164,7 +164,7 @@ def _train_and_apply_second_classifier( train_df[y_col].to_numpy().astype(np.float32), ) - x = predict_df[x_cols].to_numpy() + x = predict_df[x_cols].to_numpy().astype(np.float32) predict_df["proba"] = self.second_classifier.predict_proba(x)[:, 1] return compute_q_values(predict_df, group_columns) From 8c5547ad2d52a9650c1f06528c81103e194fbdb3 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 24 Jan 2025 12:09:02 +0100 Subject: [PATCH 23/24] fix formatting --- tests/unit_tests/test_fdrx_models.py | 97 +++++++++++++++++----------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/tests/unit_tests/test_fdrx_models.py b/tests/unit_tests/test_fdrx_models.py index 13a12b45..d917f927 100644 --- a/tests/unit_tests/test_fdrx_models.py +++ b/tests/unit_tests/test_fdrx_models.py @@ -1,59 +1,70 @@ +from collections import Counter + import pandas as pd import pytest -from collections import Counter -from 
alphadia.fdrx.models.two_step_classifier import apply_absolute_transformations, get_target_decoy_partners, compute_and_filter_q_values +from alphadia.fdrx.models.two_step_classifier import ( + apply_absolute_transformations, + compute_and_filter_q_values, + get_target_decoy_partners, +) def test_apply_absolute_transformations(): data = { - 'delta_rt': [-1, -2, 3], - 'top_3_ms2_mass_error': [-1, -2, -3], - 'mean_ms2_mass_error': [1, -2, 3], - 'extra_column': [-1, -2, -3] + "delta_rt": [-1, -2, 3], + "top_3_ms2_mass_error": [-1, -2, -3], + "mean_ms2_mass_error": [1, -2, 3], + "extra_column": [-1, -2, -3], } df = pd.DataFrame(data) transformed_df = apply_absolute_transformations(df) - assert (transformed_df['delta_rt'] >= 0).all(), "delta_rt contains negative values" - assert (transformed_df['top_3_ms2_mass_error'] >= 0).all(), "top_3_ms2_mass_error contains negative values" - assert (transformed_df['mean_ms2_mass_error'] >= 0).all(), "mean_ms2_mass_error contains negative values" + assert (transformed_df["delta_rt"] >= 0).all(), "delta_rt contains negative values" + assert ( + transformed_df["top_3_ms2_mass_error"] >= 0 + ).all(), "top_3_ms2_mass_error contains negative values" + assert ( + transformed_df["mean_ms2_mass_error"] >= 0 + ).all(), "mean_ms2_mass_error contains negative values" - assert (transformed_df['extra_column'] == df['extra_column']).all(), "extra_column should not be transformed" + assert ( + transformed_df["extra_column"] == df["extra_column"] + ).all(), "extra_column should not be transformed" @pytest.fixture def setup_data(): - - reference_df = pd.DataFrame({ - 'decoy': [0, 1], - 'rank': [1, 0], - 'elution_group_idx': [100, 101] - }) - - full_df = pd.DataFrame({ - 'decoy': [ 0, 0, 1, 1, 0], - 'rank': [ 1, 0, 2, 1, 2], - 'elution_group_idx': [100, 101, 102, 100, 102], - 'intensity': [200, 150, 120, 130, 95], - 'peptide': ['pepA', 'pepB', 'pepC', 'pepD', 'pepE'] - - }) + reference_df = pd.DataFrame( + {"decoy": [0, 1], "rank": [1, 0], "elution_group_idx": [100, 101]} + ) + + full_df = pd.DataFrame( + { + "decoy": [0, 0, 1, 1, 0], + "rank": [1, 0, 2, 1, 2], + "elution_group_idx": [100, 101, 102, 100, 102], + "intensity": [200, 150, 120, 130, 95], + "peptide": ["pepA", "pepB", "pepC", "pepD", "pepE"], + } + ) return reference_df, full_df def test_get_target_decoy_partners_correct_extraction(setup_data): reference_df, full_df = setup_data - group_columns = ['elution_group_idx', 'rank'] + group_columns = ["elution_group_idx", "rank"] result_df = get_target_decoy_partners(reference_df, full_df, group_by=group_columns) - assert len(result_df) == 3 # should match rows with ("rank", "elution_group_idx")=(1,100) and (2,101) + assert ( + len(result_df) == 3 + ) # should match rows with ("rank", "elution_group_idx")=(1,100) and (2,101) assert all(col in result_df.columns for col in full_df.columns) - assert Counter(result_df['decoy']) == Counter([0, 0, 1]) - assert Counter(result_df['peptide']) == Counter(['pepA', 'pepB', 'pepD']) + assert Counter(result_df["decoy"]) == Counter([0, 0, 1]) + assert Counter(result_df["peptide"]) == Counter(["pepA", "pepB", "pepD"]) def test_handling_nonexistent_partners_in_get_target_decoy_partners(setup_data): @@ -63,7 +74,9 @@ def test_handling_nonexistent_partners_in_get_target_decoy_partners(setup_data): result_df = get_target_decoy_partners(reference_df, full_df) assert len(result_df) == 2 - assert result_df[(result_df['rank'] == 3) & (result_df['elution_group_idx'] == 104)].empty == True + assert result_df[ + (result_df["rank"] == 3) & 
(result_df["elution_group_idx"] == 104) + ].empty @pytest.mark.parametrize( @@ -73,15 +86,21 @@ def test_handling_nonexistent_partners_in_get_target_decoy_partners(setup_data): (0.01, True, 1, 0), (0.5, False, 4, 1), (0.01, False, 1, 0), - ] + ], ) -def test_compute_and_filter_q_values(fdr, remove_decoys, expected_length, expected_decoy_count): - df = pd.DataFrame({ - 'proba': [0.1, 0.3, 0.8, 0.9, 0.9, 0.2, 0.4, 0.5], - 'decoy': [0, 1, 0, 1, 0, 1, 0, 1], - 'group': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'] - }) - result = compute_and_filter_q_values(df, fdr=fdr, group_columns=['group'], remove_decoys=remove_decoys) +def test_compute_and_filter_q_values( + fdr, remove_decoys, expected_length, expected_decoy_count +): + df = pd.DataFrame( + { + "proba": [0.1, 0.3, 0.8, 0.9, 0.9, 0.2, 0.4, 0.5], + "decoy": [0, 1, 0, 1, 0, 1, 0, 1], + "group": ["A", "A", "B", "B", "C", "C", "D", "D"], + } + ) + result = compute_and_filter_q_values( + df, fdr=fdr, group_columns=["group"], remove_decoys=remove_decoys + ) print(result) assert len(result) == expected_length - assert len(result[result['decoy'] == 1]) == expected_decoy_count + assert len(result[result["decoy"] == 1]) == expected_decoy_count From 3d602afd923c8514a2b77973f24da2c958309714 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 24 Jan 2025 12:12:10 +0100 Subject: [PATCH 24/24] addressing pr comments, private attributes --- alphadia/fdrx/models/logistic_regression.py | 2 +- alphadia/fdrx/models/two_step_classifier.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/alphadia/fdrx/models/logistic_regression.py b/alphadia/fdrx/models/logistic_regression.py index 48076622..4d43f687 100644 --- a/alphadia/fdrx/models/logistic_regression.py +++ b/alphadia/fdrx/models/logistic_regression.py @@ -115,7 +115,7 @@ def from_state_dict(self, state_dict: dict) -> None: """ self._fitted = state_dict["_fitted"] - if self.fitted: + if self._fitted: self.scaler = StandardScaler() self.scaler.mean_ = np.array(state_dict["scaler_mean"]) self.scaler.var_ = np.array(state_dict["scaler_var"]) diff --git a/alphadia/fdrx/models/two_step_classifier.py b/alphadia/fdrx/models/two_step_classifier.py index 8f36ac28..c33468d9 100644 --- a/alphadia/fdrx/models/two_step_classifier.py +++ b/alphadia/fdrx/models/two_step_classifier.py @@ -41,8 +41,8 @@ def __init__( self.first_fdr_cutoff = first_fdr_cutoff self.second_fdr_cutoff = second_fdr_cutoff - self.min_precursors_for_update = min_precursors_for_update - self.train_on_top_n = train_on_top_n + self._min_precursors_for_update = min_precursors_for_update + self._train_on_top_n = train_on_top_n def fit_predict( self, @@ -88,7 +88,7 @@ def fit_predict( ) self.second_classifier.epochs = 50 else: - df_train = df[df["rank"] < self.train_on_top_n] + df_train = df[df["rank"] < self._train_on_top_n] df_predict = df self.second_classifier.epochs = 10 @@ -111,7 +111,7 @@ def fit_predict( best_result = predictions # Update first classifier if enough confident predictions - if current_target_count > self.min_precursors_for_update: + if current_target_count > self._min_precursors_for_update: self._update_first_classifier( df_filtered, df, x_cols, y_col, group_columns ) @@ -232,7 +232,7 @@ def to_state_dict(self) -> dict: "second_classifier": self.second_classifier.to_state_dict(), "first_fdr_cutoff": self.first_fdr_cutoff, "second_fdr_cutoff": self.second_fdr_cutoff, - "train_on_top_n": self.train_on_top_n, + "train_on_top_n": self._train_on_top_n, } def from_state_dict(self, state_dict: dict) -> None: 
@@ -247,7 +247,7 @@ def from_state_dict(self, state_dict: dict) -> None: self.second_classifier.from_state_dict(state_dict["second_classifier"]) self.first_fdr_cutoff = state_dict["first_fdr_cutoff"] self.second_fdr_cutoff = state_dict["second_fdr_cutoff"] - self.train_on_top_n = state_dict["train_on_top_n"] + self._train_on_top_n = state_dict["train_on_top_n"] def compute_q_values(