From 0363a9bada031b7652badb5f6f6b0008e4a1efeb Mon Sep 17 00:00:00 2001 From: vikramalva Date: Sun, 9 Jul 2023 17:55:42 +0200 Subject: [PATCH] pLM-BLAST: merge output alignments (#1521) --- conf/tools.conf | 4 +- .../lang/extras/modals/toolHelp/plmblast.ts | 56 ++++++++++++------- .../tools/parameters/ParamAccess.scala | 1 + tel/runscripts/plmblast.sh | 8 +++ 4 files changed, 47 insertions(+), 22 deletions(-) diff --git a/conf/tools.conf b/conf/tools.conf index 26cd1f4af..bfc606a05 100644 --- a/conf/tools.conf +++ b/conf/tools.conf @@ -6,7 +6,7 @@ #Specify hardruntime in seconds # A new version indicates that the frontend should reload the configuration -version: "1.0.4" +version: "1.1.0" Tools { @@ -1102,7 +1102,7 @@ Tools { threads: 16 hardruntime: 3600 language: "Python" - parameter: ["ALIGNMENT", "PLMBLASTDB", "COSINE_PERCENTILE_CUTOFF", "ALIGNMENT_CUTOFF", "WIN_LEN", "DESC", "PLM_ALN_MODE", "SPAN", "SIGMA_FACTOR"] + parameter: ["ALIGNMENT", "PLMBLASTDB", "COSINE_PERCENTILE_CUTOFF", "ALIGNMENT_CUTOFF", "WIN_LEN", "MERGE_HITS", "DESC", "PLM_ALN_MODE", "SPAN", "SIGMA_FACTOR"] result_views: [{ title: "results" component: "plmblastResults" diff --git a/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts b/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts index bfd20a6ec..3d6a0c7e1 100644 --- a/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts +++ b/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts @@ -21,45 +21,61 @@ export default { { title: 'Select target database', content: `

Select domain database(s) of template embeddings against which you want to compare the query.

- ECOD30, ECOD50, ECOD70 + ECOD30, ECOD70

These databases are versions of the Evolutionary Classification of Protein Domains (ECOD) database filtered for a maximum of 30% and 70% sequence identity, respectively.

`, }, { title: 'Cosine similarity percentile cut-off', - content: `A pre-screening procedure is used to improve the performance of database searches. - First, the database of flattened (per-protein) embeddings is searched using the flattened query - embedding and the cosine similarity metric (this is much faster than the comparison of - per-residue embeddings), and then the actual pLM-BLAST comparisons are performed only for - matches above the user-provided cut-off. The cut-off is expressed as the n-th percentile of all - cosine similarity scores. The higher the pre-screening cut-off, the faster and less sensitive - the search will be, and vice versa.`, + content: `A pre-screening procedure is used to improve the performance of the database search. + First, the database of partially flattened embeddings is searched using the partially flattened + query embedding and the cosine similarity metric, and then the actual pLM-BLAST comparisons are + performed only for matches above the user-specified cut-off. The cut-off is expressed as the n-th + percentile of all cosine similarity scores. The higher the pre-screening cut-off, the faster and + less sensitive the search will be, and vice versa.`, }, { title: 'Alignment score cut-off', - content: `Each local alignment is assigned a score calculated as the mean of substitution matrix - values at coordinates defined by its subpath. The alignment cut-off defines the minimal score for - reporting a match. The larger the cut-off, the stricter is search. Also, note that only matches - that passed the pre-filtering step (see "Cosine similarity percentile cut-off") are considered.`, + content: `Each alignment is assigned a score from 0 to 1. The alignment cut-off defines the + minimum score for reporting a match. The higher the cut-off, the more stringent the search. + Also note that only matches that have passed the pre-filtering step (see "Cosine similarity + percentile cut-off") are considered. 
+`, }, { title: 'Window length', - content: `A moving average is used to identify subpaths, i.e., local alignments, in the full - paths defined by the traceback procedure. The window size values greater than one tends to - generate longer local alignments yet may result in reduced sensitivity.`, + content: `A moving average is used to detect local alignments within the full paths determined + by the traceback procedure. Increasing the window size results in longer local alignments, but + may decrease sensitivity. This parameter is not used in Global Alignment mode.`, }, { title: 'Merge hits', - content: `Since pLM-BLAST tends to return rather short alignments, an optional procedure may be - applied in which alignments to a single database entry are merged. Such a merged match can - comprise two or more un-merged hits and its score, similarity, and identity are defined as a - mean of the values from the individual sub-hits.`, + content: `Since pLM-BLAST tends to return rather short alignments, an optional procedure can be + used to merge matches to a single database entry. Such a merged match can consist of two or more + unmerged hits, and its score, similarity, and identity are defined as the average of the values + from the individual sub-hits.`, }, { title: 'Max target hits', - content: `This parameter controls how many matches will be displayed in the results.`, + content: `This parameter controls how many matches are displayed in the results.`, }, + { + title: 'Alignment mode', + content: `Specifies whether to return local or global alignments.`, + }, + { + title: 'Minimal hit span', + content: `Specifies the minimum length of matches returned.`, + }, + { + title: 'Sigma factor', + content: `The sigma factor defines the cutoff at which the background signal is discarded when + searching for significant local alignments. Increasing (>2) or decreasing (<2) the cutoff makes + the algorithm stricter or more permissive, respectively. 
This parameter is not used in Global + Alignment mode.`, + }, + ], references: `

pLM-BLAST – distant homology detection based on direct comparison of sequence representations from protein language models.
diff --git a/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala b/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala index ddb32a375..10e88669f 100644 --- a/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala +++ b/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala @@ -93,6 +93,7 @@ class ParamAccess @Inject() (tel: TEL) { case "COSINE_PERCENTILE_CUTOFF" => select("cosine_percentile_cutoff", default = Some("95")) case "ALIGNMENT_CUTOFF" => select("alignment_cutoff", default = Some("0.30")) case "WIN_LEN" => select("win_len", default = Some("10")) + case "MERGE_HITS" => select("merge_hits", default = Some("1")) case "PLM_ALN_MODE" => select("plm_aln_mode", default = Some("False")) case "SPAN" => select("span", default = Some("25")) case "SIGMA_FACTOR" => select("sigma_factor", default = Some("2")) diff --git a/tel/runscripts/plmblast.sh b/tel/runscripts/plmblast.sh index 78045b40f..21808a1aa 100644 --- a/tel/runscripts/plmblast.sh +++ b/tel/runscripts/plmblast.sh @@ -72,6 +72,14 @@ echo "done" >> ../results/process.log echo "#Preparing output." >> ../results/process.log +if [[ %merge_hits.content = "1" ]] ; then +# pLM-BLAST tends to yield rather short hits therefore it is beneficial to merge those associated +# with a single database sequence; additionally, a more strict score cut-off is used + python3.9 $PLMBLASTPATH/scripts/merge.py ../results/${JOBID}.hits.csv \ + ../results/${JOBID}.hits_merged.csv + mv ../results/${JOBID}.hits_merged.csv ../results/${JOBID}.hits.csv +fi + python3.9 $PLMBLASTPATH/scripts/csv2nice.py ../results/${JOBID}.hits.csv > ../results/${JOBID}.hits.txt plmblast_csv_to_json.py ../results/${JOBID}.hits.csv ../results/results.json