From 0363a9bada031b7652badb5f6f6b0008e4a1efeb Mon Sep 17 00:00:00 2001
From: vikramalva <vikram.alva@gmail.com>
Date: Sun, 9 Jul 2023 17:55:42 +0200
Subject: [PATCH] pLM-BLAST: merge output alignments (#1521)

---
 conf/tools.conf                               |  4 +-
 .../lang/extras/modals/toolHelp/plmblast.ts   | 56 ++++++++++++-------
 .../tools/parameters/ParamAccess.scala        |  1 +
 tel/runscripts/plmblast.sh                    |  8 +++
 4 files changed, 47 insertions(+), 22 deletions(-)
diff --git a/conf/tools.conf b/conf/tools.conf
index 26cd1f4af..bfc606a05 100644
--- a/conf/tools.conf
+++ b/conf/tools.conf
@@ -6,7 +6,7 @@
 #Specify hardruntime in seconds
 
 # A new version indicates that the frontend should reload the configuration
-version: "1.0.4"
+version: "1.1.0"
 
 Tools {
 
@@ -1102,7 +1102,7 @@ Tools {
     threads: 16
     hardruntime: 3600
     language: "Python"
-    parameter: ["ALIGNMENT", "PLMBLASTDB", "COSINE_PERCENTILE_CUTOFF", "ALIGNMENT_CUTOFF", "WIN_LEN", "DESC", "PLM_ALN_MODE", "SPAN", "SIGMA_FACTOR"]
+    parameter: ["ALIGNMENT", "PLMBLASTDB", "COSINE_PERCENTILE_CUTOFF", "ALIGNMENT_CUTOFF", "WIN_LEN", "MERGE_HITS", "DESC", "PLM_ALN_MODE", "SPAN", "SIGMA_FACTOR"]
     result_views: [{
       title: "results"
       component: "plmblastResults"
diff --git a/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts b/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts
index bfd20a6ec..3d6a0c7e1 100644
--- a/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts
+++ b/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts
@@ -21,45 +21,61 @@ export default {
                     {
                         title: 'Select target database',
                         content: `<p>Select domain database(s) of template embeddings against which you want to compare the query.</p>
-                    <em>ECOD30, ECOD50, ECOD70</em>
+                    <em>ECOD30, ECOD70</em>
                     <p> These databases are versions of the <a href = http://prodata.swmed.edu/ecod/ target="_blank" rel="noopener">
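-                        Evolutionary Classification of Protein Domains (ECOD) database</a> filtered for a maximum of 30%,
-                        50%, and 70% sequence identity, respectively.</p>`,
+                        Evolutionary Classification of Protein Domains (ECOD) database</a> filtered for a maximum of 30%
+                        and 70% sequence identity, respectively.</p>`,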
                     },
                     {
                         title: 'Cosine similarity percentile cut-off',
-                        content: `A pre-screening procedure is used to improve the performance of database searches. 
-                        First, the database of flattened (per-protein) embeddings is searched using the flattened query
-                         embedding and the cosine similarity metric (this is much faster than the comparison of 
-                         per-residue embeddings), and then the actual pLM-BLAST comparisons are performed only for 
-                         matches above the user-provided cut-off. The cut-off is expressed as the n-th percentile of all 
-                         cosine similarity scores. The higher the pre-screening cut-off, the faster and less sensitive 
-                         the search will be, and vice versa.`,
+                        content: `A pre-screening procedure is used to improve the performance of the database search. 
+                        First, the database of partially flattened embeddings is searched using the partially flattened 
+                        query embedding and the cosine similarity metric, and then the actual pLM-BLAST comparisons are 
+                        performed only for matches above the user-specified cut-off. The cut-off is expressed as the n-th 
+                        percentile of all cosine similarity scores. The higher the pre-screening cut-off, the faster and 
+                        less sensitive the search will be, and vice versa.`,
                     },
                     {
                         title: 'Alignment score cut-off',
-                        content: `Each local alignment is assigned a score calculated as the mean of substitution matrix 
-                        values at coordinates defined by its subpath. The alignment cut-off defines the minimal score for 
-                        reporting a match. The larger the cut-off, the stricter is search. Also, note that only matches 
-                        that passed the pre-filtering step (see "Cosine similarity percentile cut-off") are considered.`,
+                        content: `Each alignment is assigned a score from 0 to 1. The alignment cut-off defines the 
+                        minimum score for reporting a match. The higher the cut-off, the more stringent the search. 
+                        Also note that only matches that have passed the pre-filtering step (see "Cosine similarity
+                        percentile cut-off") are considered.
+`,
                     },
                     {
                         title: 'Window length',
-                        content: `A moving average is used to identify subpaths, i.e., local alignments, in the full 
-                        paths defined by the traceback procedure. The window size values greater than one tends to 
-                        generate longer local alignments yet may result in reduced sensitivity.`,
+                        content: `A moving average is used to detect local alignments within the full paths determined 
+                        by the traceback procedure. Increasing the window size results in longer local alignments, but 
+                        may decrease sensitivity. This parameter is not used in Global Alignment mode.`,
                     },
                     {
                         title: 'Merge hits',
-                        content: `Since pLM-BLAST tends to return rather short alignments, an optional procedure may be 
-                        applied in which alignments to a single database entry are merged. Such a merged match can 
-                        comprise two or more un-merged hits and its score, similarity, and identity are defined as a 
-                        mean of the values from the individual sub-hits.`,
+                        content: `Since pLM-BLAST tends to return rather short alignments, an optional procedure can be 
+                        used to merge matches to a single database entry. Such a merged match can consist of two or more 
+                        unmerged hits, and its score, similarity, and identity are defined as the average of the values 
+                        from the individual sub-hits.`,
                     },
                     {
                         title: 'Max target hits',
-                        content: `This parameter controls how many matches will be displayed in the results.`,
+                        content: `This parameter controls how many matches are displayed in the results.`,
                     },
+                    {
+                        title: 'Alignment mode',
+                        content: `Specifies whether to return local or global alignments.`,
+                    },
+                    {
+                        title: 'Minimal hit span',
+                        content: `Specifies the minimum length of matches returned.`,
+                    },
+                    {
+                        title: 'Sigma factor',
+                        content: `The sigma factor defines the cut-off at which the background signal is discarded when
+                        searching for significant local alignments. Increasing (>2) or decreasing (<2) the cut-off makes
+                        the algorithm stricter or more permissive, respectively. This parameter is not used in Global 
+                        Alignment mode.`,
+                    },
+
                 ],
                 references: `<p>pLM-BLAST – distant homology detection based on direct comparison of sequence 
                                 representations from protein language models.<br>
diff --git a/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala b/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala
index ddb32a375..10e88669f 100644
--- a/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala
+++ b/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala
@@ -93,6 +93,7 @@ class ParamAccess @Inject() (tel: TEL) {
     case "COSINE_PERCENTILE_CUTOFF" => select("cosine_percentile_cutoff", default = Some("95"))
     case "ALIGNMENT_CUTOFF"         => select("alignment_cutoff", default = Some("0.30"))
     case "WIN_LEN"                  => select("win_len", default = Some("10"))
+    case "MERGE_HITS" => select("merge_hits", default = Some("1"))
     case "PLM_ALN_MODE" => select("plm_aln_mode", default = Some("False"))
     case "SPAN" => select("span", default = Some("25"))
     case "SIGMA_FACTOR" => select("sigma_factor", default = Some("2"))
diff --git a/tel/runscripts/plmblast.sh b/tel/runscripts/plmblast.sh
index 78045b40f..21808a1aa 100644
--- a/tel/runscripts/plmblast.sh
+++ b/tel/runscripts/plmblast.sh
@@ -72,6 +72,14 @@ echo "done" >> ../results/process.log
 
 echo "#Preparing output." >> ../results/process.log
 
+if [[ %merge_hits.content = "1" ]] ; then
+  # pLM-BLAST tends to yield rather short hits; it is therefore beneficial to merge those associated
+  # with a single database sequence. Additionally, a stricter score cut-off is used.
+  python3.9 $PLMBLASTPATH/scripts/merge.py ../results/${JOBID}.hits.csv \
+                                         ../results/${JOBID}.hits_merged.csv
+  mv ../results/${JOBID}.hits_merged.csv ../results/${JOBID}.hits.csv
+fi
+
 python3.9 $PLMBLASTPATH/scripts/csv2nice.py ../results/${JOBID}.hits.csv > ../results/${JOBID}.hits.txt
 
 plmblast_csv_to_json.py ../results/${JOBID}.hits.csv ../results/results.json