Skip to content

Commit

Permalink
Merge pull request #451 from MannLabs/linting-for-two-step-classifier
Browse files Browse the repository at this point in the history
Adjust two-step-classifier files for stricter linting
  • Loading branch information
anna-charlotte authored Jan 24, 2025
2 parents c2b5c55 + d3a6429 commit 1a6909c
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 44 deletions.
16 changes: 7 additions & 9 deletions alphadia/fdrx/models/logistic_regression.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Implements the Logistic Regression classifier for use within the Alphadia framework."""

import logging

import numpy as np
Expand All @@ -10,22 +12,24 @@


class LogisticRegressionClassifier(Classifier):
"""Binary classifier using a logistic regression model."""

def __init__(self) -> None:
"""Binary classifier using a logistic regression model."""
"""Initializing a binary classifier using a logistic regression model."""
self.scaler = StandardScaler()
self.model = LogisticRegression()
self._fitted = False

@property
def fitted(self) -> bool:
"""Return whether the classifier has been fitted."""
return self._fitted

def fit(self, x: np.ndarray, y: np.ndarray) -> None:
"""Fit the classifier to the data.
Parameters
----------
x : np.array, dtype=float
Training data of shape (n_samples, n_features).
Expand All @@ -42,13 +46,11 @@ def predict(self, x: np.ndarray) -> np.ndarray:
Parameters
----------
x : np.array, dtype=float
Data of shape (n_samples, n_features).
Returns
-------
y : np.array, dtype=float
Predicted class probabilities of shape (n_samples, n_classes).
Expand All @@ -61,13 +63,11 @@ def predict_proba(self, x: np.ndarray) -> np.ndarray:
Parameters
----------
x : np.array, dtype=float
Data of shape (n_samples, n_features).
Returns
-------
y : np.array, dtype=float
Predicted class probabilities of shape (n_samples, n_classes).
Expand All @@ -80,7 +80,6 @@ def to_state_dict(self) -> dict:
Returns
-------
dict : dict
Dictionary containing the state of the classifier.
Expand Down Expand Up @@ -108,8 +107,7 @@ def from_state_dict(self, state_dict: dict) -> None:
Parameters
----------
dict : dict
state_dict : dict
Dictionary containing the state of the classifier.
"""
Expand Down
60 changes: 27 additions & 33 deletions alphadia/fdrx/models/two_step_classifier.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Implements the Two Step Classifier for use within the Alphadia framework."""

import logging

import numpy as np
Expand All @@ -10,7 +12,9 @@


class TwoStepClassifier:
def __init__(
"""A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage."""

def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5)
self,
first_classifier: Classifier,
second_classifier: Classifier,
Expand All @@ -20,8 +24,7 @@ def __init__(
max_iterations: int = 5,
train_on_top_n: int = 1,
):
"""
A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage.
"""Initializing a two-step classifier.
Parameters
----------
Expand Down Expand Up @@ -57,8 +60,8 @@ def fit_predict(
y_col: str = "decoy",
group_columns: list[str] | None = None,
) -> pd.DataFrame:
"""
Train the two-step classifier and predict precursors using an iterative approach:
"""Train the two-step classifier and predict precursors using an iterative approach.
1. First iteration: Train neural network on top-n candidates.
2. Subsequent iterations: Use linear classifier to filter data, then refine with neural network.
3. Update linear classifier if enough high-confidence predictions are found, else break.
Expand Down Expand Up @@ -131,18 +134,14 @@ def fit_predict(
return best_result

def _preprocess_data(self, df: pd.DataFrame, x_cols: list[str]) -> pd.DataFrame:
"""
Prepare data by removing NaN values and applying absolute transformations.
"""
"""Prepare data by removing NaN values and applying absolute transformations."""
df.dropna(subset=x_cols, inplace=True)
return apply_absolute_transformations(df)

def _apply_filtering_with_first_classifier(
self, df: pd.DataFrame, x_cols: list[str], group_columns: list[str]
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Apply first classifier to filter data for the training of the second classifier.
"""
"""Apply first classifier to filter data for the training of the second classifier."""
df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1]

filtered_df = compute_and_filter_q_values(
Expand All @@ -159,9 +158,7 @@ def _train_and_apply_second_classifier(
y_col: str,
group_columns: list[str],
) -> pd.DataFrame:
"""
Train second_classifier and apply it to get predictions.
"""
"""Train second_classifier and apply it to get predictions."""
self.second_classifier.fit(
train_df[x_cols].to_numpy().astype(np.float32),
train_df[y_col].to_numpy().astype(np.float32),
Expand All @@ -180,10 +177,10 @@ def _update_first_classifier(
y_col: str,
group_columns: list[str],
) -> None:
"""
Update first classifier by finding and using target/decoy pairs. First extracts the corresponding
target/decoy partners from the full dataset for each entry in the subset, then uses these
pairs to retrain the classifier.
"""Update first classifier by finding and using target/decoy pairs.
First extracts the corresponding target/decoy partners from the full dataset
for each entry in the subset, then uses these pairs to retrain the classifier.
"""
df = get_target_decoy_partners(subset_df, full_df)

Expand Down Expand Up @@ -229,6 +226,7 @@ def to_state_dict(self) -> dict:
-------
dict
State dictionary containing both classifiers
"""
return {
"first_classifier": self.first_classifier.to_state_dict(),
Expand All @@ -245,6 +243,7 @@ def from_state_dict(self, state_dict: dict) -> None:
----------
state_dict : dict
State dictionary containing both classifiers
"""
self.first_classifier.from_state_dict(state_dict["first_classifier"])
self.second_classifier.from_state_dict(state_dict["second_classifier"])
Expand All @@ -256,19 +255,14 @@ def from_state_dict(self, state_dict: dict) -> None:
def compute_q_values(
df: pd.DataFrame, group_columns: list[str] | None = None
) -> pd.DataFrame:
"""
Compute q-values for each entry after keeping only best entries per group.
"""
"""Compute q-values for each entry after keeping only best entries per group."""
df.sort_values("proba", ascending=True, inplace=True)
df = keep_best(df, group_columns=group_columns)
return get_q_values(df, "proba", "decoy")


def filter_by_qval(df: pd.DataFrame, fdr_cutoff: float) -> pd.DataFrame:
"""
Filter dataframe by q-value threshold. If no entries pass the threshold,
return the single target entry with lowest q-value.
"""
"""Filter dataframe by q-value threshold. If no entries pass the threshold, return the single target entry with lowest q-value."""
df_filtered = df[df["qval"] < fdr_cutoff]

if len(df_filtered) == 0:
Expand All @@ -282,10 +276,11 @@ def compute_and_filter_q_values(
df: pd.DataFrame,
fdr: float,
group_columns: list[str] | None = None,
*, # This line makes all following arguments keyword-only
remove_decoys: bool = True,
) -> pd.DataFrame:
"""
Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries.
"""Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries.
If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value.
"""
df = compute_q_values(df, group_columns)
Expand All @@ -297,8 +292,8 @@ def compute_and_filter_q_values(
def get_target_decoy_partners(
reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None
) -> pd.DataFrame:
"""
Identifies and returns the corresponding target and decoy partner rows in full_df given the subset reference_df.
"""Identifies and returns the corresponding target and decoy partner rows in full_df given the subset reference_df.
This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index.
Parameters
Expand All @@ -319,16 +314,14 @@ def get_target_decoy_partners(
if group_by is None:
group_by = ["rank", "elution_group_idx"]
valid_tuples = reference_df[group_by]
matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner")

return matching_rows
return full_df.merge(valid_tuples, on=group_by, how="inner")


def apply_absolute_transformations(
df: pd.DataFrame, columns: list[str] | None = None
) -> pd.DataFrame:
"""
Applies absolute value transformations to predefined columns in a DataFrame inplace.
"""Applies absolute value transformations to predefined columns in a DataFrame inplace.
Parameters
----------
Expand All @@ -341,6 +334,7 @@ def apply_absolute_transformations(
-------
pd.DataFrame
The transformed DataFrame.
"""
if columns is None:
columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"]
Expand Down
3 changes: 1 addition & 2 deletions ruff-lint-strict.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ select = [

# TODO excluding explicitly is not great but it is a workaround for now
exclude = [
"alphadia/fdrx/models/two_step_classifier.py",
"alphadia/fdrx/models/logistic_regression.py",
"alphadia/__*__.py",
"alphadia/cli.py",
"alphadia/exceptions.py",
Expand Down Expand Up @@ -84,4 +82,5 @@ ignore = [
"B023", # Function definition does not bind loop variable
"PD901", # Avoid using the generic variable name `df` for DataFrames
"TCH003" # Move standard library import into a type-checking block

]

0 comments on commit 1a6909c

Please sign in to comment.