diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 272eb15..ffc2927 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,7 +23,7 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest - pip install --editable . + pip install --editable .[test] - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/MANIFEST.in b/MANIFEST.in index 74bda3e..50040f2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,3 @@ -include deeplc/expasy/* -include deeplc/mod_to_smiles/* include deeplc/mods/* include deeplc/package_data/**/* include deeplc/unimod/* diff --git a/deeplc/__main__.py b/deeplc/__main__.py index 4c0e3e8..eb6079f 100644 --- a/deeplc/__main__.py +++ b/deeplc/__main__.py @@ -12,17 +12,14 @@ import warnings import pandas as pd -from matplotlib import pyplot as plt - -from deeplc import __version__, DeepLC, FeatExtractor -from deeplc._argument_parser import parse_arguments -from deeplc._exceptions import DeepLCError - from psm_utils.io.peptide_record import peprec_to_proforma from psm_utils.psm import PSM from psm_utils.psm_list import PSMList from psm_utils.io import read_file -from psm_utils.io import write_file + +from deeplc import __version__, DeepLC, FeatExtractor +from deeplc._argument_parser import parse_arguments +from deeplc._exceptions import DeepLCError logger = logging.getLogger(__name__) diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index fd2d3d3..1142162 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -31,22 +31,18 @@ LIBRARY = {} -import os -import sys import copy import gc import logging +import math import multiprocessing import multiprocessing.dummy -import pickle +import random +import sys import warnings from configparser import ConfigParser -from tempfile import TemporaryDirectory -from copy import deepcopy -import random -import math -from collections import ChainMap from itertools import chain +from tempfile import TemporaryDirectory # If CLI/GUI/frozen: disable Tensorflow info and warnings before importing IS_CLI_GUI = os.path.basename(sys.argv[0]) in ["deeplc", "deeplc-gui"] @@ -65,29 +61,25 @@ import numpy as np import pandas as pd import tensorflow as tf -from tensorflow.python.eager import context -from tensorflow.keras.models import load_model -import h5py - -from deeplc._exceptions import CalibrationError, DeepLCError -from deeplc.trainl3 import train_en - +from deeplcretrainer import deeplcretrainer +from psm_utils.io import read_file from psm_utils.io.peptide_record import peprec_to_proforma from psm_utils.psm import PSM from psm_utils.psm_list import PSMList -from psm_utils.io import read_file -from psm_utils.io import write_file +from tensorflow.keras.models import load_model +from tensorflow.python.eager import context -from deeplcretrainer import deeplcretrainer +from deeplc._exceptions import CalibrationError +from deeplc.trainl3 import train_en # "Custom" activation function lrelu = lambda x: tf.keras.activations.relu(x, alpha=0.1, max_value=20.0) try: - from tensorflow.compat.v1.keras.backend import set_session + from tensorflow.compat.v1.keras.backend import set_session # noqa: F401 except ImportError: - from tensorflow.keras.backend import set_session + from tensorflow.keras.backend import set_session # noqa: F401 try: from tensorflow.compat.v1.keras.backend import clear_session except ImportError: @@ -112,9 +104,10 @@ # session = tf.compat.v1.Session(config=config) # Feature extraction -from deeplc.feat_extractor import FeatExtractor from pygam import LinearGAM, s +from deeplc.feat_extractor import FeatExtractor + def warn(*args, **kwargs): pass diff --git a/deeplc/feat_extractor.py b/deeplc/feat_extractor.py index c1cb666..2e3f8ba 100644 --- a/deeplc/feat_extractor.py +++ b/deeplc/feat_extractor.py @@ -12,7 +12,6 @@ __email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"] # Native imports -from operator import index import os import math import time @@ -20,27 +19,14 @@ import ast from re import sub import logging -from copy import deepcopy -# Numpy import numpy as np - -# Pandas import pandas as pd - from psm_utils.io.peptide_record import peprec_to_proforma from psm_utils.psm import PSM from psm_utils.psm_list import PSMList from pyteomics import mass -from functools import lru_cache - -from psm_utils.io.peptide_record import peprec_to_proforma -from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList -from psm_utils.io import read_file -from psm_utils.io import write_file - logger = logging.getLogger(__name__) diff --git a/deeplc/trainl3.py b/deeplc/trainl3.py index 2720fee..a28e083 100644 --- a/deeplc/trainl3.py +++ b/deeplc/trainl3.py @@ -12,35 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -This code is used to train retention time predictors and store -predictions from a CV procedure for further analysis. This project was made possible by MASSTRPLAN. MASSTRPLAN received funding from the Marie Sklodowska-Curie EU Framework for Research and Innovation Horizon 2020, under Grant Agreement No. 675132. """ -from sklearn.model_selection import RandomizedSearchCV -from sklearn.linear_model import ElasticNet -from sklearn.metrics import mean_absolute_error -from sklearn.feature_selection import SelectFromModel -from sklearn.model_selection import cross_val_predict -from sklearn.model_selection import KFold -from sklearn.base import clone -from sklearn.model_selection import GridSearchCV -from scipy.stats import randint -from scipy.stats import uniform -from numpy import arange -from scipy.stats import pearsonr +try: + from sklearn.base import clone + from sklearn.linear_model import ElasticNet + from sklearn.model_selection import GridSearchCV +except ImportError: + _has_sklearn = False +else: + _has_sklearn = True -from operator import itemgetter -from numpy import median -from collections import Counter - -def train_en(X,y,n_jobs=16,cv=None): + +def train_en(X, y, n_jobs=16, cv=None): """ Function that trains Layer 3 of CALLC (elastic net) - + Parameters ---------- X : pd.DataFrame @@ -51,7 +42,7 @@ def train_en(X,y,n_jobs=16,cv=None): number of jobs to spawn cv : sklearn.model_selection.KFold cv object - + Returns ------- sklearn.linear_model.ElasticNet @@ -61,35 +52,55 @@ def train_en(X,y,n_jobs=16,cv=None): list list with features used to train Layer 3 """ - preds = [] + if not _has_sklearn: + raise ImportError( + "This function requires the optional dependency `scikit-learn`. Run `pip install " + "scikit-learn` and try again." + ) model = ElasticNet() crossv_mod = clone(model) ret_mod = clone(model) - set_reg = [0.01,1.0,10.0,100.0,1000.0,10000.0,10000.0,100000.0,1000000.0,1000000000,1000000] - set_reg.extend([x/2 for x in set_reg]) - set_reg.extend([x/3 for x in set_reg]) - + set_reg = [ + 0.01, + 1.0, + 10.0, + 100.0, + 1000.0, + 10000.0, + 10000.0, + 100000.0, + 1000000.0, + 1000000000, + 1000000, + ] + set_reg.extend([x / 2 for x in set_reg]) + set_reg.extend([x / 3 for x in set_reg]) + params = { - 'alpha': set_reg, - 'l1_ratio' : [0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0], - 'copy_X':[True], - 'normalize' : [False], - 'positive' : [True], - 'fit_intercept' : [True,False] + "alpha": set_reg, + "l1_ratio": [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], + "copy_X": [True], + "normalize": [False], + "positive": [True], + "fit_intercept": [True, False], } - grid = GridSearchCV(model, params,cv=cv,scoring='neg_mean_absolute_error',verbose=0,n_jobs=n_jobs,refit=True) - grid.fit(X,y) - - cv_pred = cv - crossv_mod.set_params(**grid.best_params_) - preds = cross_val_predict(crossv_mod, X=X, y=y, cv=cv_pred, n_jobs=n_jobs, verbose=0) + grid = GridSearchCV( + model, + params, + cv=cv, + scoring="neg_mean_absolute_error", + verbose=0, + n_jobs=n_jobs, + refit=True, + ) + grid.fit(X, y) + crossv_mod.set_params(**grid.best_params_) + ret_mod.set_params(**grid.best_params_) - ret_mod.fit(X,y) + ret_mod.fit(X, y) - coef_indexes = [i for i,coef in enumerate(ret_mod.coef_) if coef > 0.0] - - return ret_mod \ No newline at end of file + return ret_mod diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1adaf18 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,65 @@ +[project] +name = "deeplc" +version = "2.2.27" +description = "DeepLC: Retention time prediction for (modified) peptides using Deep Learning." +readme = "README.md" +license = { file = "LICENSE" } +authors = [ + { name = "Robbin Bouwmeester", email = "robbin.bouwmeester@ugent.be" }, + { name = "Niels Hulstaert" }, + { name = "Arthur Declercq" }, + { name = "Ralf Gabriels" }, + { name = "Lennart Martens" }, + { name = "Sven Degroeve" }, +] +classifiers = [ + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Development Status :: 5 - Production/Stable", +] +requires-python = ">=3.7" +keywords = [ + "DeepLC", + "Proteomics", + "deep learning", + "peptides", + "retention time", + "prediction", +] + +dependencies = [ + "tensorflow>=2.2,<2.13.0", + "numpy>=1.17,<2", + "pandas>=0.25,<2", + "h5py>=2.10.0,<4", + "pygam>=0.8.0,<1", + "deeplcretrainer>=0.1,<1", + "psm_utils>=0.2.3,<1", + "hdf5plugin>=4.1.1", +] + +[project.optional-dependencies] +test = ["pytest", "matplotlib>=3,<4"] +gui = ["gooey>=1.0"] +plot = ["plotly>=5"] +deepcallc = ["scikit-learn<2,>=0.24.0"] + +[project.scripts] +deeplc = "deeplc.__main__:main" +deeplc-gui = "deeplc.gui:start_gui" + +[project.urls] +GitHub = "https://github.com/compomics/deeplc" +PyPi = "https://pypi.org/project/deeplc/" +CompOmics = "https://www.compomics.com" + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = ["deeplc"] +include-package-data = true diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 312e33a..0000000 --- a/setup.cfg +++ /dev/null @@ -1,3 +0,0 @@ -[metadata] -description-file = README.md -license_files = LICENSE diff --git a/setup.py b/setup.py deleted file mode 100644 index ae9cc69..0000000 --- a/setup.py +++ /dev/null @@ -1,65 +0,0 @@ -from setuptools import find_packages, setup - -with open("README.md", "r") as fh: - LONG_DESCRIPTION = fh.read() - - -setup( - name="deeplc", - version="2.2.27", - license="apache-2.0", - description="DeepLC: Retention time prediction for (modified) peptides using Deep Learning.", - long_description=LONG_DESCRIPTION, - long_description_content_type="text/markdown", - author="Robbin Bouwmeester, Niels Hulstaert, Arthur Declercq, Ralf Gabriels, Prof. Lennart Martens, Prof. Sven Degroeve", - author_email="Robbin.Bouwmeester@UGent.be", - url="http://compomics.github.io/projects/DeepLC", - project_urls={ - "Documentation": "http://compomics.github.io/projects/DeepLC", - "Source": "https://github.com/compomics/DeepLC", - "Tracker": "https://github.com/compomics/DeepLC/issues", - }, - packages=find_packages(), - include_package_data=True, - entry_points={ - "console_scripts": [ - "deeplc=deeplc.__main__:main", - "deeplc-gui=deeplc.gui:start_gui", - ] - }, - keywords=[ - "DeepLC", - "Proteomics", - "deep learning", - "peptides", - "retention time", - "prediction", - ], - classifiers=[ - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Topic :: Scientific/Engineering :: Bio-Informatics", - "Development Status :: 4 - Beta", - ], - install_requires=[ - "setuptools>=42.0.1", - "tensorflow>=2.2,<2.13.0", - "scipy>=1.4.1,<2", - "numpy>=1.17,<2", - "pandas>=0.25,<2", - "matplotlib>=3,<4", - "h5py>=2.10.0,<4", - "pygam>=0.8.0,<1", - "scikit-learn>=0.24.0,<2", - "deeplcretrainer>=0.1,<1", - "psm_utils>=0.2.3,<1", - "hdf5plugin>=4.1.1", - ], - extras_require={ - "gui": ["gooey>=1.0"], - "plot": ["plotly>=5"], - }, - python_requires=">=3.7", -) diff --git a/tests/test_deeplc.py b/tests/test_deeplc.py index e88d9b9..4a5af2f 100644 --- a/tests/test_deeplc.py +++ b/tests/test_deeplc.py @@ -1,39 +1,52 @@ "Unit and integration tests for DeepLC." -# Standard library import logging -import pytest import subprocess -# Third party +import numpy as np import pandas as pd -from sklearn.metrics import r2_score +import pytest -# DeepLC import deeplc +def _r2_score(y_true, y_pred): + y_true = np.array(y_true) + y_pred = np.array(y_pred) + numerator = ((y_true - y_pred) ** 2).sum(dtype=np.float64) + denominator = ((y_true - np.average(y_true)) ** 2).sum(dtype=np.float64) + if denominator == 0.0: + return 1.0 + return 1 - numerator / denominator + + def test_cli_basic(): - """ Test command line interface help message. """ - assert subprocess.getstatusoutput('deeplc -h')[0] == 0, "`deeplc -h` \ -returned non-zero status code" + """Test command line interface help message.""" + assert ( + subprocess.getstatusoutput("deeplc -h")[0] == 0 + ), "`deeplc -h` returned non-zero status code" def test_cli_full(): - """" Test command line interface with input files.""" + """ " Test command line interface with input files.""" file_path_pred = "examples/datasets/test_train.csv" file_path_cal = "examples/datasets/test_train.csv" file_path_out = "pytest_cli_out.csv" command = [ - "deeplc", "--file_pred", file_path_pred, "--file_cal", file_path_cal, - "--file_pred_out", file_path_out, + "deeplc", + "--file_pred", + file_path_pred, + "--file_cal", + file_path_cal, + "--file_pred_out", + file_path_out, ] subprocess.run(command, check=True) preds_df = pd.read_csv(file_path_out) train_df = pd.read_csv(file_path_pred) - model_r2 = r2_score(train_df['tr'], preds_df['predicted retention time']) + model_r2 = _r2_score(train_df["tr"], preds_df["predicted retention time"]) logging.info("DeepLC R2 score on %s: %f", file_path_pred, model_r2) assert model_r2 > 0.90, f"DeepLC R2 score on {file_path_pred} below 0.9 \ (was {model_r2})"