From d5a9168b3f86d1fe87841b237e5a710bff3d694b Mon Sep 17 00:00:00 2001
From: RobbinBouwmeester <Robbin.bouwmeester@ugent.be>
Date: Mon, 8 May 2023 13:38:44 +0200
Subject: [PATCH] Add GUI support for new version

---
 CHANGELOG.md               |   6 +++
 deeplc/__main__.py         | 101 ++++++++++++++++++++++++-------------
 deeplc/_argument_parser.py |  30 +++++++----
 deeplc/deeplc.py           |  40 ++++++---------
 setup.py                   |   2 +-
 5 files changed, 108 insertions(+), 71 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a54534..f577e85 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to
 [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [2.1.0] - 2023-05-08
+
+### Changed
+- Support for GUI and PSM utils
+- Support for GUI and transfer learning
+
 ## [2.0.4] - 2023-05-06
 
 ### Changed
diff --git a/deeplc/__main__.py b/deeplc/__main__.py
index 6686c07..ba04e85 100644
--- a/deeplc/__main__.py
+++ b/deeplc/__main__.py
@@ -18,6 +18,11 @@
 from deeplc._argument_parser import parse_arguments
 from deeplc._exceptions import DeepLCError
 
+from psm_utils.io.peptide_record import peprec_to_proforma
+from psm_utils.psm import PSM
+from psm_utils.psm_list import PSMList
+from psm_utils.io import read_file
+from psm_utils.io import write_file
 
 logger = logging.getLogger(__name__)
 
@@ -72,7 +77,6 @@ def run(
     file_pred,
     file_cal=None,
     file_pred_out=None,
-    plot_predictions=False,
     file_model=None,
     pygam_calibration=True,
     split_cal=50,
@@ -81,6 +85,7 @@ def run(
     write_library=False,
     batch_num=50000,
     n_threads=None,
+    transfer_learning=False,
     log_level="info",
     verbose=True,
 ):
@@ -88,6 +93,11 @@ def run(
     logger.info("Using DeepLC version %s", __version__)
     logger.debug("Using %i CPU threads", n_threads)
 
+    df_pred = False
+    df_cal = False
+    first_line_pred = ""
+    first_line_cal = ""
+
     if not file_cal and file_model != None:
         fm_dict = {}
         sel_group = ""
@@ -100,19 +110,49 @@ def run(
             if m_group == sel_group:
                 fm_dict[m_group] = fm
         file_model = fm_dict
-
-    # Read input files
-    df_pred = pd.read_csv(file_pred)
-    if len(df_pred.columns) < 2:
-        df_pred = pd.read_csv(file_pred,sep=" ")
-    df_pred = df_pred.fillna("")
-
+    
+    with open(file_pred) as f:
+        first_line_pred = f.readline()
     if file_cal:
+        with open(file_cal) as f:
+            first_line_cal = f.readline()
+
+    if "modifications" in first_line_pred.split(",") and "seq" in first_line_pred.split(","):
+        # Read input files
+        df_pred = pd.read_csv(file_pred)
+        if len(df_pred.columns) < 2:
+            df_pred = pd.read_csv(file_pred,sep=" ")
+        df_pred = df_pred.fillna("")
+        file_pred = ""
+
+        list_of_psms = []
+        for seq,mod,ident in zip(df_pred["seq"],df_pred["modifications"],df_pred.index):
+            list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
+        psm_list_pred = PSMList(psm_list=list_of_psms)
+        df_pred = None
+    else:
+        psm_list_pred = read_file(file_pred)
+        if "msms" in file_pred and ".txt" in file_pred:
+            mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
+            psm_list_pred.rename_modifications(mapper)
+
+    if "modifications" in first_line_cal.split(",") and "seq" in first_line_cal.split(",") and file_cal:
         df_cal = pd.read_csv(file_cal)
         if len(df_cal.columns) < 2:
             df_cal = pd.read_csv(df_cal,sep=" ")
         df_cal = df_cal.fillna("")
-
+        file_cal = ""
+
+        list_of_psms = []
+        for seq,mod,ident,tr in zip(df_cal["seq"],df_cal["modifications"],df_cal.index,df_cal["tr"]):
+            list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
+        psm_list_cal = PSMList(psm_list=list_of_psms)
+        df_cal = None
+    elif file_cal:
+        psm_list_cal = read_file(file_cal)
+        if "msms" in file_cal and ".txt" in file_cal:
+            mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
+            psm_list_cal.rename_modifications(mapper)
     # Make a feature extraction object; you can skip this if you do not want to
     # use the default settings for DeepLC. Here we want to use a model that does
     # not use RDKit features so we skip the chemical descriptor making
@@ -121,7 +161,7 @@ def run(
         cnn_feats=True,
         verbose=verbose
     )
-
+    
     # Make the DeepLC object that will handle making predictions and calibration
     dlc = DeepLC(
         path_model=file_model,
@@ -134,42 +174,31 @@ def run(
         batch_num=batch_num,
         n_jobs=n_threads,
         verbose=verbose,
+        deeplc_retrain=transfer_learning
     )
 
+    
     # Calibrate the original model based on the new retention times
-    if file_cal:
+    if len(psm_list_cal) > 0:
         logger.info("Selecting best model and calibrating predictions...")
-        dlc.calibrate_preds(seq_df=df_cal)
+        print(psm_list_cal)
+        dlc.calibrate_preds(psm_list=psm_list_cal)
 
     # Make predictions; calibrated or uncalibrated
     logger.info("Making predictions using model: %s", dlc.model)
     if file_cal:
-        preds = dlc.make_preds(seq_df=df_pred)
+        preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred)
     else:
-        preds = dlc.make_preds(seq_df=df_pred, calibrate=False)
-
-    df_pred["predicted_tr"] = preds
+        preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred, calibrate=False)
+    
+    #df_pred["predicted_tr"] = preds
     logger.info("Writing predictions to file: %s", file_pred_out)
-    df_pred.to_csv(file_pred_out)
-
-    if plot_predictions:
-        if file_cal and "tr" in df_pred.columns:
-            file_pred_figure = os.path.splitext(file_pred_out)[0] + '.png'
-            logger.info(
-                "Saving scatterplot of predictions to file: %s",
-                file_pred_figure
-            )
-            plt.figure(figsize=(11.5, 9))
-            plt.scatter(df_pred["tr"], df_pred["predicted_tr"], s=3)
-            plt.title("DeepLC predictions")
-            plt.xlabel("Observed retention times")
-            plt.ylabel("Predicted retention times")
-            plt.savefig(file_pred_figure, dpi=300)
-        else:
-            logger.warning(
-                "No observed retention time in input data. Cannot plot "
-                "predictions."
-            )
+    
+    file_pred_out = open(file_pred_out,"w")
+    file_pred_out.write("Sequence proforma,predicted retention time\n")
+    for psm,tr in zip(psm_list_pred,preds):
+        file_pred_out.write(f"{psm.peptidoform.proforma},{tr}\n")
+    file_pred_out.close()
 
     logger.info("DeepLC finished!")
 
diff --git a/deeplc/_argument_parser.py b/deeplc/_argument_parser.py
index cadbc6b..b8881b4 100644
--- a/deeplc/_argument_parser.py
+++ b/deeplc/_argument_parser.py
@@ -48,9 +48,9 @@ def parse_arguments(gui=False):
             "gooey_options": {"checkbox_label": "Use pyGAM calibration"},
             "metavar": "Use pyGAM calibration"
         },
-        "legacy_calibration": {
-            "gooey_options": {"checkbox_label": "Use legacy calibration"},
-            "metavar": "Use legacy calibration"
+        "transfer_learning": {
+            "gooey_options": {"checkbox_label": "Use transfer learning calibration"},
+            "metavar": "Use transfer learning calibration"
         },
         "split_cal": {"gooey_options": {"visible": False}},
         "dict_divider": {"gooey_options": {"visible": False}},
@@ -116,13 +116,6 @@ def parse_arguments(gui=False):
         help="path to write output file with predictions",
         **gooey_args["file_pred_out"]
     )
-    io_args.add_argument(
-        "--plot_predictions",
-        action='store_true',
-        default=False,
-        help="save scatter plot of predictions vs observations",
-        **gooey_args["plot_predictions"]
-    )
 
     model_cal_args = parser.add_argument_group(
         "Model and calibration",
@@ -153,6 +146,16 @@ def parse_arguments(gui=False):
         ),
         **gooey_args["pygam_calibration"]
     )
+    
+    calibration_group.add_argument(
+        "--transfer_learning",
+        dest="transfer_learning",
+        action="store_false",
+        help="use transfer learning as calibration method",
+        **gooey_args["transfer_learning"]
+    )
+
+    """
     calibration_group.add_argument(
         "--legacy_calibration",
         dest="pygam_calibration",
@@ -160,6 +163,7 @@ def parse_arguments(gui=False):
         help="use legacy simple piecewise linear fit as calibration method",
         **gooey_args["legacy_calibration"]
     )
+    """
 
     model_cal_args.add_argument(
         "--split_cal",
@@ -190,6 +194,7 @@ def parse_arguments(gui=False):
     advanced_args = parser.add_argument_group(
         "Advanced configuration", **gooey_args["advanced_args"]
     )
+    """
     advanced_args.add_argument(
         "--use_library",
         dest="use_library",
@@ -202,6 +207,9 @@ def parse_arguments(gui=False):
         ),
         **gooey_args["use_library"]
     )
+    """
+
+    """
     advanced_args.add_argument(
         "--write_library",
         dest="write_library",
@@ -210,6 +218,8 @@ def parse_arguments(gui=False):
         help="append new predictions to library for faster future results",
         **gooey_args["write_library"]
     )
+    """
+
     advanced_args.add_argument(
         "--batch_num",
         type=int,
diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py
index 4adb388..21b132f 100644
--- a/deeplc/deeplc.py
+++ b/deeplc/deeplc.py
@@ -326,11 +326,11 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]):
         """
         list_of_psms = []
         if len(charges) > 0:
-            for seq,mod,id in zip(seqs,mods,identifiers):
-                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id))
+            for seq,mod,ident in zip(seqs,mods,identifiers):
+                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
         else:
-            for seq,mod,id,z in zip(seqs,mods,identifiers,charges):
-                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id))
+            for seq,mod,ident,z in zip(seqs,mods,identifiers,charges):
+                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
 
         psm_list = PSMList(psm_list=list_of_psms)
 
@@ -358,11 +358,11 @@ def do_f_extraction_pd(self,
 
         list_of_psms = []
         if len(charges) == 0:
-            for seq,mod,id in zip(df_instances["seq"],df_instances["modifications"],df_instances.index):
-                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id))
+            for seq,mod,ident in zip(df_instances["seq"],df_instances["modifications"],df_instances.index):
+                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
         else:
-            for seq,mod,id,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]):
-                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id))
+            for seq,mod,ident,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]):
+                list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
         psm_list = PSMList(psm_list=list_of_psms)
 
         return self.f_extractor.full_feat_extract(psm_list)
@@ -680,8 +680,8 @@ def make_preds(self,
         """
         if type(seq_df) == pd.core.frame.DataFrame:
             list_of_psms = []
-            for seq,mod,id in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
-                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id))
+            for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
+                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
             psm_list = PSMList(psm_list=list_of_psms)
         
         if len(infile) > 0:
@@ -695,15 +695,7 @@ def make_preds(self,
                 logger.debug("Extracting features for the CNN model ...")
 
             X = self.do_f_extraction_psm_list_parallel(psm_list)
-            #X = self.do_f_extraction_psm_list(psm_list)
-
             X_sum = np.stack(X["matrix_sum"].values())
-            #print(np.stack(X["matrix_all"].values()))
-            #print(X["matrix_all"].values())
-            #input("s")
-            #print(X["pos_matrix"].values())
-            #print(np.stack(X["pos_matrix"].values()))
-            #print("s2")
             X_global = np.concatenate((np.stack(X["matrix_all"].values()),
                                     np.stack(X["pos_matrix"].values())),
                                     axis=1)
@@ -760,8 +752,8 @@ def calibrate_preds_func_pygam(self,
         # TODO make sure either psm_list or seq_df is supplied
         if type(seq_df) == pd.core.frame.DataFrame:
             list_of_psms = []
-            for seq,mod,id,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
-                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id,retention_time=tr))
+            for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
+                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
             psm_list = PSMList(psm_list=list_of_psms)
 
             measured_tr = [psm.retention_time for psm in psm_list]
@@ -837,8 +829,8 @@ def calibrate_preds_func(self,
         """
         if type(seq_df) == pd.core.frame.DataFrame:
             list_of_psms = []
-            for seq,mod,id in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
-                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id))
+            for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
+                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
             psm_list = PSMList(psm_list=list_of_psms)
 
         predicted_tr = self.make_preds(
@@ -982,8 +974,8 @@ def calibrate_preds(self,
         """
         if type(seq_df) == pd.core.frame.DataFrame:
             list_of_psms = []
-            for seq,mod,id,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
-                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id,retention_time=tr))
+            for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
+                list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
             psm_list = PSMList(psm_list=list_of_psms)
             
 
diff --git a/setup.py b/setup.py
index 4044a2a..c975fb5 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
     name='deeplc',
-    version='2.0.4',
+    version='2.1.0',
     license='apache-2.0',
     description='DeepLC: Retention time prediction for (modified) peptides using Deep Learning.',
     long_description=LONG_DESCRIPTION,