From d5a9168b3f86d1fe87841b237e5a710bff3d694b Mon Sep 17 00:00:00 2001 From: RobbinBouwmeester Date: Mon, 8 May 2023 13:38:44 +0200 Subject: [PATCH] Add GUI support for new version --- CHANGELOG.md | 6 +++ deeplc/__main__.py | 101 ++++++++++++++++++++++++------------- deeplc/_argument_parser.py | 30 +++++++---- deeplc/deeplc.py | 40 ++++++--------- setup.py | 2 +- 5 files changed, 108 insertions(+), 71 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a54534..f577e85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +# [2.1.0] - 2023-05-08 + +### Changed +- Support for GUI and PSM utils +- Support for GUI and transfer learning + ## [2.0.4] - 2023-05-06 ### Changed diff --git a/deeplc/__main__.py b/deeplc/__main__.py index 6686c07..ba04e85 100644 --- a/deeplc/__main__.py +++ b/deeplc/__main__.py @@ -18,6 +18,11 @@ from deeplc._argument_parser import parse_arguments from deeplc._exceptions import DeepLCError +from psm_utils.io.peptide_record import peprec_to_proforma +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList +from psm_utils.io import read_file +from psm_utils.io import write_file logger = logging.getLogger(__name__) @@ -72,7 +77,6 @@ def run( file_pred, file_cal=None, file_pred_out=None, - plot_predictions=False, file_model=None, pygam_calibration=True, split_cal=50, @@ -81,6 +85,7 @@ def run( write_library=False, batch_num=50000, n_threads=None, + transfer_learning=False, log_level="info", verbose=True, ): @@ -88,6 +93,11 @@ def run( logger.info("Using DeepLC version %s", __version__) logger.debug("Using %i CPU threads", n_threads) + df_pred = False + df_cal = False + first_line_pred = "" + first_line_cal = "" + if not file_cal and file_model != None: fm_dict = {} sel_group = "" @@ -100,19 +110,49 @@ def run( if m_group == sel_group: fm_dict[m_group] = fm file_model = fm_dict - - # Read input files - df_pred = pd.read_csv(file_pred) - if len(df_pred.columns) < 2: - df_pred = pd.read_csv(file_pred,sep=" ") - df_pred = df_pred.fillna("") - + + with open(file_pred) as f: + first_line_pred = f.readline() if file_cal: + with open(file_cal) as f: + first_line_cal = f.readline() + + if "modifications" in first_line_pred.split(",") and "seq" in first_line_pred.split(","): + # Read input files + df_pred = pd.read_csv(file_pred) + if len(df_pred.columns) < 2: + df_pred = pd.read_csv(file_pred,sep=" ") + df_pred = df_pred.fillna("") + file_pred = "" + + list_of_psms = [] + for seq,mod,ident in zip(df_pred["seq"],df_pred["modifications"],df_pred.index): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident)) + psm_list_pred = PSMList(psm_list=list_of_psms) + df_pred = None + else: + psm_list_pred = read_file(file_pred) + if "msms" in file_pred and ".txt" in file_pred: + mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() + psm_list_pred.rename_modifications(mapper) + + if "modifications" in first_line_cal.split(",") and "seq" in first_line_cal.split(",") and file_cal: df_cal = pd.read_csv(file_cal) if len(df_cal.columns) < 2: df_cal = pd.read_csv(df_cal,sep=" ") df_cal = df_cal.fillna("") - + file_cal = "" + + list_of_psms = [] + for seq,mod,ident,tr in zip(df_cal["seq"],df_cal["modifications"],df_cal.index,df_cal["tr"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) + psm_list_cal = PSMList(psm_list=list_of_psms) + df_cal = None + elif file_cal: + psm_list_cal = read_file(file_cal) + if "msms" in file_cal and ".txt" in file_cal: + mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict() + psm_list_cal.rename_modifications(mapper) # Make a feature extraction object; you can skip this if you do not want to # use the default settings for DeepLC. Here we want to use a model that does # not use RDKit features so we skip the chemical descriptor making @@ -121,7 +161,7 @@ def run( cnn_feats=True, verbose=verbose ) - + # Make the DeepLC object that will handle making predictions and calibration dlc = DeepLC( path_model=file_model, @@ -134,42 +174,31 @@ def run( batch_num=batch_num, n_jobs=n_threads, verbose=verbose, + deeplc_retrain=transfer_learning ) + # Calibrate the original model based on the new retention times - if file_cal: + if len(psm_list_cal) > 0: logger.info("Selecting best model and calibrating predictions...") - dlc.calibrate_preds(seq_df=df_cal) + print(psm_list_cal) + dlc.calibrate_preds(psm_list=psm_list_cal) # Make predictions; calibrated or uncalibrated logger.info("Making predictions using model: %s", dlc.model) if file_cal: - preds = dlc.make_preds(seq_df=df_pred) + preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred) else: - preds = dlc.make_preds(seq_df=df_pred, calibrate=False) - - df_pred["predicted_tr"] = preds + preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred, calibrate=False) + + #df_pred["predicted_tr"] = preds logger.info("Writing predictions to file: %s", file_pred_out) - df_pred.to_csv(file_pred_out) - - if plot_predictions: - if file_cal and "tr" in df_pred.columns: - file_pred_figure = os.path.splitext(file_pred_out)[0] + '.png' - logger.info( - "Saving scatterplot of predictions to file: %s", - file_pred_figure - ) - plt.figure(figsize=(11.5, 9)) - plt.scatter(df_pred["tr"], df_pred["predicted_tr"], s=3) - plt.title("DeepLC predictions") - plt.xlabel("Observed retention times") - plt.ylabel("Predicted retention times") - plt.savefig(file_pred_figure, dpi=300) - else: - logger.warning( - "No observed retention time in input data. Cannot plot " - "predictions." - ) + + file_pred_out = open(file_pred_out,"w") + file_pred_out.write("Sequence proforma,predicted retention time\n") + for psm,tr in zip(psm_list_pred,preds): + file_pred_out.write(f"{psm.peptidoform.proforma},{tr}\n") + file_pred_out.close() logger.info("DeepLC finished!") diff --git a/deeplc/_argument_parser.py b/deeplc/_argument_parser.py index cadbc6b..b8881b4 100644 --- a/deeplc/_argument_parser.py +++ b/deeplc/_argument_parser.py @@ -48,9 +48,9 @@ def parse_arguments(gui=False): "gooey_options": {"checkbox_label": "Use pyGAM calibration"}, "metavar": "Use pyGAM calibration" }, - "legacy_calibration": { - "gooey_options": {"checkbox_label": "Use legacy calibration"}, - "metavar": "Use legacy calibration" + "transfer_learning": { + "gooey_options": {"checkbox_label": "Use transfer learning calibration"}, + "metavar": "Use transfer learning calibration" }, "split_cal": {"gooey_options": {"visible": False}}, "dict_divider": {"gooey_options": {"visible": False}}, @@ -116,13 +116,6 @@ def parse_arguments(gui=False): help="path to write output file with predictions", **gooey_args["file_pred_out"] ) - io_args.add_argument( - "--plot_predictions", - action='store_true', - default=False, - help="save scatter plot of predictions vs observations", - **gooey_args["plot_predictions"] - ) model_cal_args = parser.add_argument_group( "Model and calibration", @@ -153,6 +146,16 @@ def parse_arguments(gui=False): ), **gooey_args["pygam_calibration"] ) + + calibration_group.add_argument( + "--transfer_learning", + dest="transfer_learning", + action="store_false", + help="use transfer learning as calibration method", + **gooey_args["transfer_learning"] + ) + + """ calibration_group.add_argument( "--legacy_calibration", dest="pygam_calibration", @@ -160,6 +163,7 @@ def parse_arguments(gui=False): help="use legacy simple piecewise linear fit as calibration method", **gooey_args["legacy_calibration"] ) + """ model_cal_args.add_argument( "--split_cal", @@ -190,6 +194,7 @@ def parse_arguments(gui=False): advanced_args = parser.add_argument_group( "Advanced configuration", **gooey_args["advanced_args"] ) + """ advanced_args.add_argument( "--use_library", dest="use_library", @@ -202,6 +207,9 @@ def parse_arguments(gui=False): ), **gooey_args["use_library"] ) + """ + + """ advanced_args.add_argument( "--write_library", dest="write_library", @@ -210,6 +218,8 @@ def parse_arguments(gui=False): help="append new predictions to library for faster future results", **gooey_args["write_library"] ) + """ + advanced_args.add_argument( "--batch_num", type=int, diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index 4adb388..21b132f 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -326,11 +326,11 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]): """ list_of_psms = [] if len(charges) > 0: - for seq,mod,id in zip(seqs,mods,identifiers): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id)) + for seq,mod,ident in zip(seqs,mods,identifiers): + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) else: - for seq,mod,id,z in zip(seqs,mods,identifiers,charges): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id)) + for seq,mod,ident,z in zip(seqs,mods,identifiers,charges): + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) psm_list = PSMList(psm_list=list_of_psms) @@ -358,11 +358,11 @@ def do_f_extraction_pd(self, list_of_psms = [] if len(charges) == 0: - for seq,mod,id in zip(df_instances["seq"],df_instances["modifications"],df_instances.index): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id)) + for seq,mod,ident in zip(df_instances["seq"],df_instances["modifications"],df_instances.index): + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) else: - for seq,mod,id,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]): - list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id)) + for seq,mod,ident,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]): + list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident)) psm_list = PSMList(psm_list=list_of_psms) return self.f_extractor.full_feat_extract(psm_list) @@ -680,8 +680,8 @@ def make_preds(self, """ if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] - for seq,mod,id in zip(seq_df["seq"],seq_df["modifications"],seq_df.index): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id)) + for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident)) psm_list = PSMList(psm_list=list_of_psms) if len(infile) > 0: @@ -695,15 +695,7 @@ def make_preds(self, logger.debug("Extracting features for the CNN model ...") X = self.do_f_extraction_psm_list_parallel(psm_list) - #X = self.do_f_extraction_psm_list(psm_list) - X_sum = np.stack(X["matrix_sum"].values()) - #print(np.stack(X["matrix_all"].values())) - #print(X["matrix_all"].values()) - #input("s") - #print(X["pos_matrix"].values()) - #print(np.stack(X["pos_matrix"].values())) - #print("s2") X_global = np.concatenate((np.stack(X["matrix_all"].values()), np.stack(X["pos_matrix"].values())), axis=1) @@ -760,8 +752,8 @@ def calibrate_preds_func_pygam(self, # TODO make sure either psm_list or seq_df is supplied if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] - for seq,mod,id,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id,retention_time=tr)) + for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) psm_list = PSMList(psm_list=list_of_psms) measured_tr = [psm.retention_time for psm in psm_list] @@ -837,8 +829,8 @@ def calibrate_preds_func(self, """ if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] - for seq,mod,id in zip(seq_df["seq"],seq_df["modifications"],seq_df.index): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id)) + for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident)) psm_list = PSMList(psm_list=list_of_psms) predicted_tr = self.make_preds( @@ -982,8 +974,8 @@ def calibrate_preds(self, """ if type(seq_df) == pd.core.frame.DataFrame: list_of_psms = [] - for seq,mod,id,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): - list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id,retention_time=tr)) + for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]): + list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr)) psm_list = PSMList(psm_list=list_of_psms) diff --git a/setup.py b/setup.py index 4044a2a..c975fb5 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='deeplc', - version='2.0.4', + version='2.1.0', license='apache-2.0', description='DeepLC: Retention time prediction for (modified) peptides using Deep Learning.', long_description=LONG_DESCRIPTION,