diff --git a/alphadia/fdrx/models/logistic_regression.py b/alphadia/fdrx/models/logistic_regression.py index 4d43f687..76c00fdb 100644 --- a/alphadia/fdrx/models/logistic_regression.py +++ b/alphadia/fdrx/models/logistic_regression.py @@ -1,3 +1,5 @@ +"""Implements the Logistic Regression classifier for use within the Alphadia framework.""" + import logging import numpy as np @@ -10,14 +12,17 @@ class LogisticRegressionClassifier(Classifier): + """Binary classifier using a logistic regression model.""" + def __init__(self) -> None: - """Binary classifier using a logistic regression model.""" + """Initializing a binary classifier using a logistic regression model.""" self.scaler = StandardScaler() self.model = LogisticRegression() self._fitted = False @property def fitted(self) -> bool: + """Return whether the classifier has been fitted.""" return self._fitted def fit(self, x: np.ndarray, y: np.ndarray) -> None: @@ -25,7 +30,6 @@ def fit(self, x: np.ndarray, y: np.ndarray) -> None: Parameters ---------- - x : np.array, dtype=float Training data of shape (n_samples, n_features). @@ -42,13 +46,11 @@ def predict(self, x: np.ndarray) -> np.ndarray: Parameters ---------- - x : np.array, dtype=float Data of shape (n_samples, n_features). Returns ------- - y : np.array, dtype=float Predicted class probabilities of shape (n_samples, n_classes). @@ -61,13 +63,11 @@ def predict_proba(self, x: np.ndarray) -> np.ndarray: Parameters ---------- - x : np.array, dtype=float Data of shape (n_samples, n_features). Returns ------- - y : np.array, dtype=float Predicted class probabilities of shape (n_samples, n_classes). @@ -80,7 +80,6 @@ def to_state_dict(self) -> dict: Returns ------- - dict : dict Dictionary containing the state of the classifier. @@ -108,8 +107,7 @@ def from_state_dict(self, state_dict: dict) -> None: Parameters ---------- - - dict : dict + state_dict : dict Dictionary containing the state of the classifier. """ diff --git a/alphadia/fdrx/models/two_step_classifier.py b/alphadia/fdrx/models/two_step_classifier.py index 8e21e963..4a465ded 100644 --- a/alphadia/fdrx/models/two_step_classifier.py +++ b/alphadia/fdrx/models/two_step_classifier.py @@ -1,3 +1,5 @@ +"""Implements the Two Step Classifier for use within the Alphadia framework.""" + import logging import numpy as np @@ -10,7 +12,9 @@ class TwoStepClassifier: - def __init__( + """A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage.""" + + def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5) self, first_classifier: Classifier, second_classifier: Classifier, @@ -20,8 +24,7 @@ def __init__( max_iterations: int = 5, train_on_top_n: int = 1, ): - """ - A two-step classifier, designed to refine classification results by applying a stricter second-stage classification after an initial filtering stage. + """Initializing a two-step classifier. Parameters ---------- @@ -57,8 +60,8 @@ def fit_predict( y_col: str = "decoy", group_columns: list[str] | None = None, ) -> pd.DataFrame: - """ - Train the two-step classifier and predict precursors using an iterative approach: + """Train the two-step classifier and predict precursors using an iterative approach. + 1. First iteration: Train neural network on top-n candidates. 2. Subsequent iterations: Use linear classifier to filter data, then refine with neural network. 3. Update linear classifier if enough high-confidence predictions are found, else break. @@ -131,18 +134,14 @@ def fit_predict( return best_result def _preprocess_data(self, df: pd.DataFrame, x_cols: list[str]) -> pd.DataFrame: - """ - Prepare data by removing NaN values and applying absolute transformations. - """ + """Prepare data by removing NaN values and applying absolute transformations.""" df.dropna(subset=x_cols, inplace=True) return apply_absolute_transformations(df) def _apply_filtering_with_first_classifier( self, df: pd.DataFrame, x_cols: list[str], group_columns: list[str] ) -> tuple[pd.DataFrame, pd.DataFrame]: - """ - Apply first classifier to filter data for the training of the second classifier. - """ + """Apply first classifier to filter data for the training of the second classifier.""" df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1] filtered_df = compute_and_filter_q_values( @@ -159,9 +158,7 @@ def _train_and_apply_second_classifier( y_col: str, group_columns: list[str], ) -> pd.DataFrame: - """ - Train second_classifier and apply it to get predictions. - """ + """Train second_classifier and apply it to get predictions.""" self.second_classifier.fit( train_df[x_cols].to_numpy().astype(np.float32), train_df[y_col].to_numpy().astype(np.float32), @@ -180,10 +177,10 @@ def _update_first_classifier( y_col: str, group_columns: list[str], ) -> None: - """ - Update first classifier by finding and using target/decoy pairs. First extracts the corresponding - target/decoy partners from the full dataset for each entry in the subset, then uses these - pairs to retrain the classifier. + """Update first classifier by finding and using target/decoy pairs. + + First extracts the corresponding target/decoy partners from the full dataset + for each entry in the subset, then uses these pairs to retrain the classifier. """ df = get_target_decoy_partners(subset_df, full_df) @@ -229,6 +226,7 @@ def to_state_dict(self) -> dict: ------- dict State dictionary containing both classifiers + """ return { "first_classifier": self.first_classifier.to_state_dict(), @@ -245,6 +243,7 @@ def from_state_dict(self, state_dict: dict) -> None: ---------- state_dict : dict State dictionary containing both classifiers + """ self.first_classifier.from_state_dict(state_dict["first_classifier"]) self.second_classifier.from_state_dict(state_dict["second_classifier"]) @@ -256,19 +255,14 @@ def from_state_dict(self, state_dict: dict) -> None: def compute_q_values( df: pd.DataFrame, group_columns: list[str] | None = None ) -> pd.DataFrame: - """ - Compute q-values for each entry after keeping only best entries per group. - """ + """Compute q-values for each entry after keeping only best entries per group.""" df.sort_values("proba", ascending=True, inplace=True) df = keep_best(df, group_columns=group_columns) return get_q_values(df, "proba", "decoy") def filter_by_qval(df: pd.DataFrame, fdr_cutoff: float) -> pd.DataFrame: - """ - Filter dataframe by q-value threshold. If no entries pass the threshold, - return the single target entry with lowest q-value. - """ + """Filter dataframe by q-value threshold. If no entries pass the threshold, return the single target entry with lowest q-value.""" df_filtered = df[df["qval"] < fdr_cutoff] if len(df_filtered) == 0: @@ -282,10 +276,11 @@ def compute_and_filter_q_values( df: pd.DataFrame, fdr: float, group_columns: list[str] | None = None, + *, # This line makes all following arguments keyword-only remove_decoys: bool = True, ) -> pd.DataFrame: - """ - Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries. + """Returns entries in the DataFrame based on the FDR threshold and optionally removes decoy entries. + If no entries are found below the FDR threshold after filtering, returns the single best entry based on the q-value. """ df = compute_q_values(df, group_columns) @@ -297,8 +292,8 @@ def compute_and_filter_q_values( def get_target_decoy_partners( reference_df: pd.DataFrame, full_df: pd.DataFrame, group_by: list[str] | None = None ) -> pd.DataFrame: - """ - Identifies and returns the corresponding target and decoy partner rows in full_df given the subset reference_df. + """Identifies and returns the corresponding target and decoy partner rows in full_df given the subset reference_df. + This function is typically used to find target-decoy partners based on certain criteria like rank and elution group index. Parameters @@ -319,16 +314,14 @@ def get_target_decoy_partners( if group_by is None: group_by = ["rank", "elution_group_idx"] valid_tuples = reference_df[group_by] - matching_rows = full_df.merge(valid_tuples, on=group_by, how="inner") - return matching_rows + return full_df.merge(valid_tuples, on=group_by, how="inner") def apply_absolute_transformations( df: pd.DataFrame, columns: list[str] | None = None ) -> pd.DataFrame: - """ - Applies absolute value transformations to predefined columns in a DataFrame inplace. + """Applies absolute value transformations to predefined columns in a DataFrame inplace. Parameters ---------- @@ -341,6 +334,7 @@ def apply_absolute_transformations( ------- pd.DataFrame The transformed DataFrame. + """ if columns is None: columns = ["delta_rt", "top_3_ms2_mass_error", "mean_ms2_mass_error"] diff --git a/ruff-lint-strict.toml b/ruff-lint-strict.toml index 063e5af9..4f3f5cb1 100644 --- a/ruff-lint-strict.toml +++ b/ruff-lint-strict.toml @@ -6,8 +6,6 @@ select = [ # TODO excluding explicity is not great but it is a workaround for now exclude = [ - "alphadia/fdrx/models/two_step_classifier.py", - "alphadia/fdrx/models/logistic_regression.py", "alphadia/__*__.py", "alphadia/cli.py", "alphadia/exceptions.py", @@ -84,4 +82,5 @@ ignore = [ "B023", # Function definition does not bind loop variable "PD901", # Avoid using the generic variable name `df` for DataFrames" "TCH003" # Move standard library import into a type-checking block + ]