Skip to content

Commit

Permalink
Merge pull request #217 from MannLabs/development
Browse files Browse the repository at this point in the history
Release 1.7.0
  • Loading branch information
GeorgWa authored Jun 15, 2024
2 parents c6f3092 + 8b4cd69 commit 3c3090c
Show file tree
Hide file tree
Showing 45 changed files with 2,432 additions and 1,751 deletions.
2 changes: 1 addition & 1 deletion alphadia/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!python

__version__ = "1.6.2"
__version__ = "1.7.0"
7 changes: 5 additions & 2 deletions alphadia/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

# native imports
import logging
import sys

import yaml
import os
import re
Expand Down Expand Up @@ -314,13 +316,13 @@ def run(*args, **kwargs):
for f in raw_path_list:
logger.progress(f" {os.path.basename(f)}")

logger.progress(f"Using library: {library_path}.")
logger.progress(f"Using library: {library_path}")

logger.progress(f"Using {len(fasta_path_list)} fasta files:")
for f in fasta_path_list:
logger.progress(f" {f}")

logger.progress(f"Saving output to {output_directory}.")
logger.progress(f"Saving output to: {output_directory}")

try:
import matplotlib
Expand All @@ -345,3 +347,4 @@ def run(*args, **kwargs):

logger.info(traceback.format_exc())
logger.error(e)
sys.exit(1)
81 changes: 81 additions & 0 deletions alphadia/consensus/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import logging
import os
import pandas as pd

logger = logging.getLogger()
supported_formats = ["parquet", "tsv"]


def read_df(path_no_format, file_format="parquet"):
    """Read a dataframe from disk in the chosen file format.

    Parameters
    ----------
    path_no_format: str
        Path of the file to read, without the file extension.

    file_format: str, default = 'parquet'
        File format of the file. Available options: ['parquet', 'tsv']

    Returns
    -------
    pd.DataFrame
        Dataframe loaded from disk.

    Raises
    ------
    ValueError
        If an unsupported file format is requested.

    FileNotFoundError
        If the file does not exist on disk.
    """

    supported_formats = ["parquet", "tsv"]

    # validate the format before touching the filesystem so an unsupported
    # format is always reported as such (consistent with write_df)
    if file_format not in supported_formats:
        raise ValueError(
            f"Provided unknown file format: {file_format}, supported_formats: {supported_formats}"
        )

    file_path = f"{path_no_format}.{file_format}"

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Can't load file as file was not found: {file_path}")

    logging.getLogger().info(f"Reading {file_path} from disk")

    if file_format == "parquet":
        return pd.read_parquet(file_path)

    # only "tsv" remains after the validation above
    return pd.read_csv(file_path, sep="\t")


def write_df(df, path_no_format, file_format="parquet"):
    """Write a dataframe to disk in the chosen file format.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to save to disk.

    path_no_format: str
        Path for the file, without the file extension.

    file_format: str, default = 'parquet'
        File format for saving the file. Available options: ['parquet', 'tsv']

    Raises
    ------
    ValueError
        If an unsupported file format is requested.
    """

    supported_formats = ["parquet", "tsv"]

    if file_format not in supported_formats:
        raise ValueError(
            f"Provided unknown file format: {file_format}, supported_formats: {supported_formats}"
        )

    file_path = f"{path_no_format}.{file_format}"

    logging.getLogger().info(f"Saving {file_path} to disk")

    if file_format == "parquet":
        df.to_parquet(file_path, index=False)
    else:
        # only "tsv" remains after the validation above; floats are written
        # with 6 decimal places to keep output size bounded
        df.to_csv(file_path, sep="\t", index=False, float_format="%.6f")
39 changes: 37 additions & 2 deletions alphadia/constants/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ search_output:
num_samples_quadratic: 50
min_nonnan: 3
normalize_lfq: True
# can be either "parquet" or "tsv"
file_format: "tsv"

# configuration for the optimization manager
# initial parameters, will be optimized
Expand All @@ -160,8 +162,8 @@ optimization_manager:

# This section controls transfer learning
# currently only the library is created with transfer learning
transfer_learning:
# if true, the library is created with transfer learning
transfer_library:
# if true, the library is created for transfer learning
enabled: False

# semicolon separated list of fragment types to include in the library. possible values are 'a', 'b', 'c', 'x', 'y', 'z'
Expand All @@ -185,6 +187,39 @@ transfer_learning:
# include only fragments with a XIC correlation at least 0.75 of the median for all fragments
fragment_correlation_ratio: 0.75

transfer_learning:

# if true, a custom peptdeep model will be created using the transfer learned library
enabled: False

# number of precursors per batch
batch_size: 2000

# maximum learning rate per batch.
# The maximum learning rate will be reached after a warmup phase and decreased using a plateau scheduler
max_lr: 0.0001

# TODO remove and replaced by fixed 70:20:10 split
train_ratio: 0.8

# test every n intervals
test_interval: 1

# learning rate patience after which the lr will be halved
lr_patience: 3

# maximum number of epochs
epochs: 51

# number of warmup epochs during which the lr is ramped up
warmup_epochs: 5

# normalised collision energy encoded during training
nce: 25

# instrument type encoded during training
instrument: 'Lumos'

# configuration for the calibration manager
# the config has to start with the calibration keyword and consists of a list of calibration groups.
# each group consists of datapoints which have multiple properties.
Expand Down
5 changes: 5 additions & 0 deletions alphadia/data/alpharaw.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
# alphadia imports
from alphadia import utils

from alphadia.data.stats import log_stats

# alpha family imports
from alpharaw import thermo as alpharawthermo
from alpharaw import sciex as alpharawsciex
Expand Down Expand Up @@ -339,20 +341,23 @@ def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)


class Sciex(AlphaRaw, alpharawsciex.SciexWiffData):
def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)


class Thermo(AlphaRaw, alpharawthermo.ThermoRawData):
def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)

def filter_spectra(self, cv: float = None, astral_ms1: bool = False, **kwargs):
"""
Expand Down
6 changes: 3 additions & 3 deletions alphadia/data/bruker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# alphadia imports
from alphadia import utils
from alphadia.data.stats import log_stats

# alpha family imports
import alphatims.utils
Expand Down Expand Up @@ -94,6 +95,7 @@ def __init__(

# Precompile
logger.info(f"Successfully imported data from {bruker_d_folder_name}")
log_stats(self.rt_values, self.cycle)

def transpose(self):
# abort if transposed data is already present
Expand Down Expand Up @@ -655,9 +657,7 @@ def assemble_push(
relative_precursor_index[i],
relative_scan,
relative_precursor,
] = (
accumulated_intensity + new_intensity
)
] = accumulated_intensity + new_intensity
dense_output[
1,
j,
Expand Down
44 changes: 44 additions & 0 deletions alphadia/data/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import numpy as np
import logging

logger = logging.getLogger()


def log_stats(rt_values: np.ndarray, cycle: np.ndarray):
    """Log summary statistics of a raw file.

    Parameters
    ----------
    rt_values: np.ndarray
        Retention time values in seconds for all frames.

    cycle: np.ndarray
        DIA cycle object describing the msms pattern; the second axis is the
        number of scans per cycle, and positive entries are used as the
        m/z boundaries of the isolation windows.
    """

    log = logging.getLogger()

    log.info("============ Raw file stats ============")

    # retention time span, reported in minutes and seconds
    rt_limits = rt_values.min() / 60, rt_values.max() / 60
    rt_duration_sec = rt_values.max() - rt_values.min()
    rt_duration_min = rt_duration_sec / 60

    log.info(f"{'RT (min)':<20}: {rt_limits[0]:.1f} - {rt_limits[1]:.1f}")
    log.info(f"{'RT duration (sec)':<20}: {rt_duration_sec:.1f}")
    log.info(f"{'RT duration (min)':<20}: {rt_duration_min:.1f}")

    # average duration of one DIA cycle, estimated from the rt spacing
    # between frames that start consecutive cycles
    cycle_length = cycle.shape[1]
    cycle_duration = np.diff(rt_values[::cycle_length]).mean()
    cycle_number = len(rt_values) // cycle_length

    log.info(f"{'Cycle len (scans)':<20}: {cycle_length:.0f}")
    log.info(f"{'Cycle len (sec)':<20}: {cycle_duration:.2f}")
    log.info(f"{'Number of cycles':<20}: {cycle_number:.0f}")

    # m/z range of the MS2 windows; non-positive entries encode unused slots
    flat_cycle = cycle.flatten()
    flat_cycle = flat_cycle[flat_cycle > 0]
    msms_range = flat_cycle.min(), flat_cycle.max()

    log.info(f"{'MS2 range (m/z)':<20}: {msms_range[0]:.1f} - {msms_range[1]:.1f}")

    log.info("========================================")
Loading

0 comments on commit 3c3090c

Please sign in to comment.