Skip to content

Commit

Permalink
Merge pull request #217 from MannLabs/development
Browse files Browse the repository at this point in the history
Release 1.7.0
  • Loading branch information
GeorgWa authored Jun 15, 2024
2 parents c6f3092 + 8b4cd69 commit 3c3090c
Show file tree
Hide file tree
Showing 45 changed files with 2,432 additions and 1,751 deletions.
2 changes: 1 addition & 1 deletion alphadia/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!python

__version__ = "1.6.2"
__version__ = "1.7.0"
7 changes: 5 additions & 2 deletions alphadia/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

# native imports
import logging
import sys

import yaml
import os
import re
Expand Down Expand Up @@ -314,13 +316,13 @@ def run(*args, **kwargs):
for f in raw_path_list:
logger.progress(f" {os.path.basename(f)}")

logger.progress(f"Using library: {library_path}.")
logger.progress(f"Using library: {library_path}")

logger.progress(f"Using {len(fasta_path_list)} fasta files:")
for f in fasta_path_list:
logger.progress(f" {f}")

logger.progress(f"Saving output to {output_directory}.")
logger.progress(f"Saving output to: {output_directory}")

try:
import matplotlib
Expand All @@ -345,3 +347,4 @@ def run(*args, **kwargs):

logger.info(traceback.format_exc())
logger.error(e)
sys.exit(1)
81 changes: 81 additions & 0 deletions alphadia/consensus/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import logging
import os
import pandas as pd

logger = logging.getLogger()
supported_formats = ["parquet", "tsv"]


def read_df(path_no_format, file_format="parquet"):
    """Read a dataframe from disk in the chosen file format.

    Parameters
    ----------
    path_no_format: str
        Path of the file to read, without the file extension.

    file_format: str, default = 'parquet'
        File format of the file. Available options: ['parquet', 'tsv']

    Returns
    -------
    pd.DataFrame
        Dataframe loaded from disk.

    Raises
    ------
    ValueError
        If an unsupported file format is requested.

    FileNotFoundError
        If the file does not exist on disk.
    """

    supported_formats = ["parquet", "tsv"]

    # validate the format before touching the filesystem so an unsupported
    # format is always reported as such (consistent with write_df)
    if file_format not in supported_formats:
        raise ValueError(
            f"Provided unknown file format: {file_format}, supported_formats: {supported_formats}"
        )

    file_path = f"{path_no_format}.{file_format}"

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Can't load file as file was not found: {file_path}")

    logging.getLogger().info(f"Reading {file_path} from disk")

    if file_format == "parquet":
        return pd.read_parquet(file_path)

    # only "tsv" remains after the validation above
    return pd.read_csv(file_path, sep="\t")


def write_df(df, path_no_format, file_format="parquet"):
    """Write a dataframe to disk in the chosen file format.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to save to disk.

    path_no_format: str
        Path for the file, without the file extension.

    file_format: str, default = 'parquet'
        File format for saving the file. Available options: ['parquet', 'tsv']

    Raises
    ------
    ValueError
        If an unsupported file format is requested.
    """

    supported_formats = ["parquet", "tsv"]

    if file_format not in supported_formats:
        raise ValueError(
            f"Provided unknown file format: {file_format}, supported_formats: {supported_formats}"
        )

    file_path = f"{path_no_format}.{file_format}"

    logging.getLogger().info(f"Saving {file_path} to disk")

    if file_format == "parquet":
        df.to_parquet(file_path, index=False)
    else:
        # only "tsv" remains after the validation above; floats are written
        # with 6 decimal places to keep output size bounded
        df.to_csv(file_path, sep="\t", index=False, float_format="%.6f")
39 changes: 37 additions & 2 deletions alphadia/constants/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ search_output:
num_samples_quadratic: 50
min_nonnan: 3
normalize_lfq: True
# can be either "parquet" or "tsv"
file_format: "tsv"

# configuration for the optimization manager
# initial parameters, will be optimized
Expand All @@ -160,8 +162,8 @@ optimization_manager:

# This section controls transfer learning
# currently only the library is created with transfer learning
transfer_learning:
# if true, the library is created with transfer learning
transfer_library:
# if true, the library is created for transfer learning
enabled: False

# semicolon separated list of fragment types to include in the library. possible values are 'a', 'b', 'c', 'x', 'y', 'z'
Expand All @@ -185,6 +187,39 @@ transfer_learning:
# include only fragments with a XIC correlation at least 0.75 of the median for all fragments
fragment_correlation_ratio: 0.75

transfer_learning:

# if true, a custom peptdeep model will be created using the transfer learned library
enabled: False

# number of precursors per batch
batch_size: 2000

# maximum learning rate per batch.
# The maximum learning rate will be reached after a warmup phase and decreased using a plateau scheduler
max_lr: 0.0001

# TODO remove and replaced by fixed 70:20:10 split
train_ratio: 0.8

# test every n intervals
test_interval: 1

# learning rate patience after which the lr will be halved
lr_patience: 3

# maximum number of epochs
epochs: 51

# number of warmup epochs during which the lr is ramped up
warmup_epochs: 5

# normalised collision energy encoded during training
nce: 25

# instrument type encoded during training
instrument: 'Lumos'

# configuration for the calibration manager
# the config has to start with the calibration keyword and consists of a list of calibration groups.
# each group consists of datapoints which have multiple properties.
Expand Down
5 changes: 5 additions & 0 deletions alphadia/data/alpharaw.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
# alphadia imports
from alphadia import utils

from alphadia.data.stats import log_stats

# alpha family imports
from alpharaw import thermo as alpharawthermo
from alpharaw import sciex as alpharawsciex
Expand Down Expand Up @@ -339,20 +341,23 @@ def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)


class Sciex(AlphaRaw, alpharawsciex.SciexWiffData):
def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)


class Thermo(AlphaRaw, alpharawthermo.ThermoRawData):
def __init__(self, raw_file_path: str, process_count: int = 10, **kwargs):
super().__init__(process_count=process_count)
self.load_raw(raw_file_path)
self.process_alpharaw(**kwargs)
log_stats(self.rt_values, self.cycle)

def filter_spectra(self, cv: float = None, astral_ms1: bool = False, **kwargs):
"""
Expand Down
6 changes: 3 additions & 3 deletions alphadia/data/bruker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# alphadia imports
from alphadia import utils
from alphadia.data.stats import log_stats

# alpha family imports
import alphatims.utils
Expand Down Expand Up @@ -94,6 +95,7 @@ def __init__(

# Precompile
logger.info(f"Successfully imported data from {bruker_d_folder_name}")
log_stats(self.rt_values, self.cycle)

def transpose(self):
# abort if transposed data is already present
Expand Down Expand Up @@ -655,9 +657,7 @@ def assemble_push(
relative_precursor_index[i],
relative_scan,
relative_precursor,
] = (
accumulated_intensity + new_intensity
)
] = accumulated_intensity + new_intensity
dense_output[
1,
j,
Expand Down
44 changes: 44 additions & 0 deletions alphadia/data/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import numpy as np
import logging

logger = logging.getLogger()


def log_stats(rt_values: np.ndarray, cycle: np.ndarray):
    """Log summary statistics of a raw file.

    Parameters
    ----------
    rt_values: np.ndarray
        Retention time values in seconds for all frames.

    cycle: np.ndarray
        DIA cycle object describing the msms pattern; the second axis is the
        number of scans per cycle, and positive entries are used as the
        m/z boundaries of the isolation windows.
    """

    log = logging.getLogger()

    log.info("============ Raw file stats ============")

    # retention time span, reported in minutes and seconds
    rt_limits = rt_values.min() / 60, rt_values.max() / 60
    rt_duration_sec = rt_values.max() - rt_values.min()
    rt_duration_min = rt_duration_sec / 60

    log.info(f"{'RT (min)':<20}: {rt_limits[0]:.1f} - {rt_limits[1]:.1f}")
    log.info(f"{'RT duration (sec)':<20}: {rt_duration_sec:.1f}")
    log.info(f"{'RT duration (min)':<20}: {rt_duration_min:.1f}")

    # average duration of one DIA cycle, estimated from the rt spacing
    # between frames that start consecutive cycles
    cycle_length = cycle.shape[1]
    cycle_duration = np.diff(rt_values[::cycle_length]).mean()
    cycle_number = len(rt_values) // cycle_length

    log.info(f"{'Cycle len (scans)':<20}: {cycle_length:.0f}")
    log.info(f"{'Cycle len (sec)':<20}: {cycle_duration:.2f}")
    log.info(f"{'Number of cycles':<20}: {cycle_number:.0f}")

    # m/z range of the MS2 windows; non-positive entries encode unused slots
    flat_cycle = cycle.flatten()
    flat_cycle = flat_cycle[flat_cycle > 0]
    msms_range = flat_cycle.min(), flat_cycle.max()

    log.info(f"{'MS2 range (m/z)':<20}: {msms_range[0]:.1f} - {msms_range[1]:.1f}")

    log.info("========================================")
Loading

0 comments on commit 3c3090c

Please sign in to comment.