From 97dfc040206edae42bce2b88a40511ccb12b2550 Mon Sep 17 00:00:00 2001
From: Vikram Alva <vikram.alva@gmail.com>
Date: Fri, 19 Apr 2024 22:36:32 +0200
Subject: [PATCH] Update pLM-BLAST: add Bfactor and gap extension parameters.

---
 conf/tools.conf                               |  4 +-
 .../lang/extras/modals/toolHelp/plmblast.ts   | 43 ++++++++++++-------
 frontend/src/i18n/lang/modules/tools-en.ts    |  2 +
 .../tools/parameters/ParamAccess.scala        | 10 +++--
 tel/paramspec/PARAMS                          |  2 +
 tel/paramspec/bfactor.prop                    |  3 ++
 tel/paramspec/cosine_percentile_cutoff.prop   |  3 +-
 tel/paramspec/plm_aln_mode.prop               |  4 +-
 tel/paramspec/plm_gap_ext.prop                |  3 ++
 tel/runscripts/plmblast.sh                    | 42 +++++++++++-------
 10 files changed, 76 insertions(+), 40 deletions(-)
 create mode 100644 tel/paramspec/bfactor.prop
 create mode 100644 tel/paramspec/plm_gap_ext.prop

diff --git a/conf/tools.conf b/conf/tools.conf
index e6f2e7524..c08da681e 100644
--- a/conf/tools.conf
+++ b/conf/tools.conf
@@ -6,7 +6,7 @@
 #Specify hardruntime in seconds
 
 # A new version indicates that the frontend should reload the configuration
-version: "1.2.6"
+version: "1.2.7"
 
 Tools {
 
@@ -1102,7 +1102,7 @@ Tools {
     threads: 16
     hardruntime: 3600
     language: "Python"
-    parameter: ["ALIGNMENT", "PLMBLASTDB", "COSINE_PERCENTILE_CUTOFF", "ALIGNMENT_CUTOFF", "WIN_LEN", "MERGE_HITS", "DESC", "PLM_ALN_MODE", "SPAN", "SIGMA_FACTOR"]
+    parameter: ["ALIGNMENT", "PLMBLASTDB", "COSINE_PERCENTILE_CUTOFF", "ALIGNMENT_CUTOFF", "WIN_LEN", "MERGE_HITS", "DESC", "PLM_ALN_MODE", "SPAN", "SIGMA_FACTOR", "BFACTOR", "PLM_GAP_EXT"]
     result_views: [{
       title: "results"
       component: "plmblastResults"
diff --git a/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts b/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts
index 3d6a0c7e1..eab647e93 100644
--- a/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts
+++ b/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts
@@ -28,26 +28,25 @@ export default {
                     },
                     {
                         title: 'Cosine similarity percentile cut-off',
-                        content: `A pre-screening procedure is used to improve the performance of the database search. 
-                        First, the database of partially flattened embeddings is searched using the partially flattened 
-                        query embedding and the cosine similarity metric, and then the actual pLM-BLAST comparisons are 
-                        performed only for matches above the user-specified cut-off. The cut-off is expressed as the n-th 
-                        percentile of all cosine similarity scores. The higher the pre-screening cut-off, the faster and 
-                        less sensitive the search will be, and vice versa.`,
+                        content: `A pre-screening procedure can be used to improve the performance of the database search.
+                        First, the database of flattened embeddings is searched using the flattened query embedding and the
+                        cosine similarity metric, and then the actual pLM-BLAST comparisons are performed only for matches
+                        above the user-specified cut-off. The cut-off is expressed as the n-th percentile of all cosine
+                        similarity scores. The higher the pre-screening cut-off, the faster and less sensitive the search
+                        will be, and vice versa.`,
                     },
                     {
                         title: 'Alignment score cut-off',
-                        content: `Each alignment is assigned a score from 0 to 1. The alignment cut-off defines the 
-                        minimum score for reporting a match. The higher the cut-off, the more stringent the search. 
-                        Also note that only matches that have passed the pre-filtering step (see "Cosine Similarity 
-                        Percentile Cutoff") are considered.
-`,
+                        content: `Each query-target alignment is assigned a score from 0 to 1. The alignment cut-off defines
+                        the minimum score for reporting a match. The higher the cut-off, the more stringent the search. Also
+                        note that only matches that have passed the pre-filtering step (see "Cosine Similarity Percentile Cutoff")
+                        are considered.`,
                     },
                     {
                         title: 'Window length',
-                        content: `A moving average is used to detect local alignments within the full paths determined 
-                        by the traceback procedure. Increasing the window size results in longer local alignments, but 
-                        may decrease sensitivity. This parameter is not used in Global Alignment mode.`,
+                        content: `A moving average is used to detect local alignments within the paths determined by the traceback
+                        procedure. Increasing the window size results in longer local alignments, but may decrease sensitivity.
+                        This parameter is not used in Global Alignment mode.`,
                     },
                     {
                         title: 'Merge hits',
@@ -58,7 +57,8 @@ export default {
                     },
                     {
                         title: 'Max target hits',
-                        content: `This parameter controls how many matches are displayed in the results.`,
+                        content: `This parameter controls how many matches are displayed in the results. This filter is
+                        applied after the alignment score cutoff filter.`,
                     },
                     {
                         title: 'Alignment mode',
@@ -66,7 +66,8 @@ export default {
                     },
                     {
                         title: 'Minimal hit span',
-                        content: `Specifies the minimum length of matches returned.`,
+                        content: `Specifies the minimum length of matches returned. The minimum match length cannot be
+                        less than the window length.`,
                     },
                     {
                         title: 'Sigma factor',
@@ -75,6 +76,16 @@ export default {
                         the algorithm stricter or more permissive, respectively. This parameter is not used in Global 
                         Alignment mode.`,
                     },
+                    {
+                        title: 'Bfactor',
+                        content: `Using a Bfactor value greater than 1 will reduce the initial number of alignments by
+                        ignoring alignments that are very close to each other, thus increasing the search speed. This
+                        parameter is not used in Global Alignment mode.`,
+                    },
+                    {
+                        title: 'Gap extension penalty',
+                        content: 'The penalty for extending a gap.',
+                    },
 
                 ],
                 references: `<p>pLM-BLAST – distant homology detection based on direct comparison of sequence 
diff --git a/frontend/src/i18n/lang/modules/tools-en.ts b/frontend/src/i18n/lang/modules/tools-en.ts
index e9673c259..ddda1d4d7 100644
--- a/frontend/src/i18n/lang/modules/tools-en.ts
+++ b/frontend/src/i18n/lang/modules/tools-en.ts
@@ -136,6 +136,8 @@ export default {
             alignment_cutoff: 'Alignment score cut-off',
             win_len: 'Window length',
             merge_hits: 'Merge hits',
+            bfactor: 'Bfactor',
+            plm_gap_ext: 'Gap extension penalty',
         },
     },
     inputPlaceholder: {
diff --git a/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala b/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala
index 10e88669f..5d7215acb 100644
--- a/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala
+++ b/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala
@@ -90,13 +90,17 @@ class ParamAccess @Inject() (tel: TEL) {
 
     // pLM-BLAST
     case "PLMBLASTDB"               => select("plmblastdb", default = Some("ECOD30"))
-    case "COSINE_PERCENTILE_CUTOFF" => select("cosine_percentile_cutoff", default = Some("95"))
+    case "COSINE_PERCENTILE_CUTOFF" => select("cosine_percentile_cutoff", default = Some("70"))
     case "ALIGNMENT_CUTOFF"         => select("alignment_cutoff", default = Some("0.30"))
-    case "WIN_LEN"                  => select("win_len", default = Some("10"))
+    case "WIN_LEN"                  => select("win_len", default = Some("15"))
     case "MERGE_HITS" => select("merge_hits", default = Some("1"))
-    case "PLM_ALN_MODE" => select("plm_aln_mode", default = Some("False"))
+    case "PLM_ALN_MODE" => select("plm_aln_mode", default = Some("loc"))
     case "SPAN" => select("span", default = Some("25"))
     case "SIGMA_FACTOR" => select("sigma_factor", default = Some("2"))
+    case "BFACTOR" => select("bfactor", default = Some("2"))
+    case "PLM_GAP_EXT" => select("plm_gap_ext", default = Some("0.5"))
+
+
 
     // HHpred
     case "TWOTEXTALIGNMENT" =>
diff --git a/tel/paramspec/PARAMS b/tel/paramspec/PARAMS
index 93dd77acf..abc04732a 100644
--- a/tel/paramspec/PARAMS
+++ b/tel/paramspec/PARAMS
@@ -67,3 +67,5 @@ diamond_min_aln_cov GEN diamond_min_aln_cov.prop
 plm_aln_mode GEN plm_aln_mode.prop
 span GEN span.prop
 sigma_factor GEN sigma_factor.prop
+plm_gap_ext GEN plm_gap_ext.prop
+bfactor GEN bfactor.prop
diff --git a/tel/paramspec/bfactor.prop b/tel/paramspec/bfactor.prop
new file mode 100644
index 000000000..d2d6b61b2
--- /dev/null
+++ b/tel/paramspec/bfactor.prop
@@ -0,0 +1,3 @@
+1 1
+2 2
+5 5
diff --git a/tel/paramspec/cosine_percentile_cutoff.prop b/tel/paramspec/cosine_percentile_cutoff.prop
index cb3e597dc..345b02b2d 100644
--- a/tel/paramspec/cosine_percentile_cutoff.prop
+++ b/tel/paramspec/cosine_percentile_cutoff.prop
@@ -1,3 +1,4 @@
+70 70
 80 80
 85 85
 90 90
@@ -5,4 +6,4 @@
 96 96
 97 97
 98 98
-99 99
\ No newline at end of file
+99 99
diff --git a/tel/paramspec/plm_aln_mode.prop b/tel/paramspec/plm_aln_mode.prop
index e2470c57d..fa5abfa1c 100644
--- a/tel/paramspec/plm_aln_mode.prop
+++ b/tel/paramspec/plm_aln_mode.prop
@@ -1,2 +1,2 @@
-True Global
-False Local
\ No newline at end of file
+glob Global
+loc Local
diff --git a/tel/paramspec/plm_gap_ext.prop b/tel/paramspec/plm_gap_ext.prop
new file mode 100644
index 000000000..134be7f9f
--- /dev/null
+++ b/tel/paramspec/plm_gap_ext.prop
@@ -0,0 +1,3 @@
+0 0
+0.5 0.5
+1 1
diff --git a/tel/runscripts/plmblast.sh b/tel/runscripts/plmblast.sh
index 21808a1aa..4952e42d8 100644
--- a/tel/runscripts/plmblast.sh
+++ b/tel/runscripts/plmblast.sh
@@ -29,22 +29,23 @@ if [[ ${SEQ_COUNT} = "0" ]] ; then
       fi
 fi
 source ${BIOPROGS}/dependencies/anaconda3/etc/profile.d/conda.sh
-conda activate plmblast
+conda activate plm_blast
+
+set -e
+export HF_HOME=$PLMBLASTPATH/cache
 
 echo "#Calculating embedding for query sequence." >> ../results/process.log
 # calculate index
-python3.9 $PLMBLASTPATH/scripts/makeindex.py ../results/${JOBID}.fas ../results/${JOBID}.csv
+#python $PLMBLASTPATH/scripts/makeindex.py ../results/${JOBID}.fas ../results/${JOBID}.csv
 
 # calculate query embedding
-python3.9 $PLMBLASTPATH/embeddings.py \
+python3.10 $PLMBLASTPATH/embeddings.py start \
           ../results/${JOBID}.fas \
           ../results/${JOBID}.pt
 echo "done" >> ../results/process.log
 
-set -e
-export MKL_NUM_THREADS=1
-export NUMEXPR_NUM_THREADS=1
-export OMP_NUM_THREADS=1
+
+
 
 echo "#Searching %plmblastdb.content." >> ../results/process.log
 
@@ -55,32 +56,41 @@ else
   adjusted_span="%span.content"
 fi
 
-python3.9 $PLMBLASTPATH/scripts/run_plm_blast.py %PLMBLAST/%plmblastdb.content \
+# Compare as strings: inside [[ ]], -eq is arithmetic, so "loc" -eq "glob" is 0 -eq 0 and always true.
+if [[ "%plm_aln_mode.content" == "glob" ]] ; then
+  aln_mode="--global_aln"
+else
+  aln_mode=""
+fi
+
+python3.10 $PLMBLASTPATH/scripts/plmblast.py %PLMBLAST/%plmblastdb.content \
                                              ../results/${JOBID} \
                                              ../results/${JOBID}.hits.csv \
-                                             -cosine_percentile_cutoff %cosine_percentile_cutoff.content \
+                                             -cpc %cosine_percentile_cutoff.content \
                                              -alignment_cutoff %alignment_cutoff.content \
-                                             -max_targets %desc.content \
                                              -workers %THREADS \
                                              -sigma_factor %sigma_factor.content \
-                                             -use_chunks \
                                              -win %win_len.content \
                                              -span ${adjusted_span} \
-                                             --global_aln %plm_aln_mode.content
+                                             -gap_ext %plm_gap_ext.content \
+                                             -bfactor %bfactor.content \
+                                             ${aln_mode}
 
 echo "done" >> ../results/process.log
 
+#-max_targets %desc.content \
+
 echo "#Preparing output." >> ../results/process.log
 
 if [[ %merge_hits.content = "1" ]] ; then
 # pLM-BLAST tends to yield rather short hits therefore it is beneficial to merge those associated
 # with a single database sequence; additionally, a more strict score cut-off is used
-  python3.9 $PLMBLASTPATH/scripts/merge.py ../results/${JOBID}.hits.csv \
-                                         ../results/${JOBID}.hits_merged.csv
+  python3.10 $PLMBLASTPATH/scripts/merge.py ../results/${JOBID}.hits.csv \
+                                         ../results/${JOBID}.hits_merged.csv -max_hits %desc.content
   mv ../results/${JOBID}.hits_merged.csv ../results/${JOBID}.hits.csv
 fi
 
-python3.9 $PLMBLASTPATH/scripts/csv2nice.py ../results/${JOBID}.hits.csv > ../results/${JOBID}.hits.txt
+python3.10 $PLMBLASTPATH/scripts/csv2nice.py ../results/${JOBID}.hits.csv > ../results/${JOBID}.hits.txt
 
 plmblast_csv_to_json.py ../results/${JOBID}.hits.csv ../results/results.json
 
@@ -93,4 +103,4 @@ plmblastviz.pl ${JOBID} ../results/ ../results/
 sed 's/[\.\-]//g' ../results/${JOBID}.fas > ../results/query.fas
 fasta2json.py ../results/query.fas ../results/query.json
 
-echo "done" >> ../results/process.log
\ No newline at end of file
+echo "done" >> ../results/process.log