From 97dfc040206edae42bce2b88a40511ccb12b2550 Mon Sep 17 00:00:00 2001 From: Vikram Alva Date: Fri, 19 Apr 2024 22:36:32 +0200 Subject: [PATCH] Update pLM-BLAST. --- conf/tools.conf | 4 +- .../lang/extras/modals/toolHelp/plmblast.ts | 43 ++++++++++++------- frontend/src/i18n/lang/modules/tools-en.ts | 2 + .../tools/parameters/ParamAccess.scala | 10 +++-- tel/paramspec/PARAMS | 2 + tel/paramspec/bfactor.prop | 3 ++ tel/paramspec/cosine_percentile_cutoff.prop | 3 +- tel/paramspec/plm_aln_mode.prop | 4 +- tel/paramspec/plm_gap_ext.prop | 3 ++ tel/runscripts/plmblast.sh | 42 +++++++++++------- 10 files changed, 76 insertions(+), 40 deletions(-) create mode 100644 tel/paramspec/bfactor.prop create mode 100644 tel/paramspec/plm_gap_ext.prop diff --git a/conf/tools.conf b/conf/tools.conf index e6f2e7524..c08da681e 100644 --- a/conf/tools.conf +++ b/conf/tools.conf @@ -6,7 +6,7 @@ #Specify hardruntime in seconds # A new version indicates that the frontend should reload the configuration -version: "1.2.6" +version: "1.2.7" Tools { @@ -1102,7 +1102,7 @@ Tools { threads: 16 hardruntime: 3600 language: "Python" - parameter: ["ALIGNMENT", "PLMBLASTDB", "COSINE_PERCENTILE_CUTOFF", "ALIGNMENT_CUTOFF", "WIN_LEN", "MERGE_HITS", "DESC", "PLM_ALN_MODE", "SPAN", "SIGMA_FACTOR"] + parameter: ["ALIGNMENT", "PLMBLASTDB", "COSINE_PERCENTILE_CUTOFF", "ALIGNMENT_CUTOFF", "WIN_LEN", "MERGE_HITS", "DESC", "PLM_ALN_MODE", "SPAN", "SIGMA_FACTOR", "BFACTOR","PLM_GAP_EXT"] result_views: [{ title: "results" component: "plmblastResults" diff --git a/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts b/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts index 3d6a0c7e1..eab647e93 100644 --- a/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts +++ b/frontend/src/i18n/lang/extras/modals/toolHelp/plmblast.ts @@ -28,26 +28,25 @@ export default { }, { title: 'Cosine similarity percentile cut-off', - content: `A pre-screening procedure is used to improve the performance of the database 
search. - First, the database of partially flattened embeddings is searched using the partially flattened - query embedding and the cosine similarity metric, and then the actual pLM-BLAST comparisons are - performed only for matches above the user-specified cut-off. The cut-off is expressed as the n-th - percentile of all cosine similarity scores. The higher the pre-screening cut-off, the faster and - less sensitive the search will be, and vice versa.`, + content: `A pre-screening procedure can be used to improve the performance of the database search. + First, the database of flattened embeddings is searched using the flattened query embedding and the + cosine similarity metric, and then the actual pLM-BLAST comparisons are performed only for matches + above the user-specified cut-off. The cut-off is expressed as the n-th percentile of all cosine + similarity scores. The higher the pre-screening cut-off, the faster and less sensitive the search + will be, and vice versa.`, }, { title: 'Alignment score cut-off', - content: `Each alignment is assigned a score from 0 to 1. The alignment cut-off defines the - minimum score for reporting a match. The higher the cut-off, the more stringent the search. - Also note that only matches that have passed the pre-filtering step (see "Cosine Similarity - Percentile Cutoff") are considered. -`, + content: `Each query-target alignment is assigned a score from 0 to 1. The alignment cut-off defines + the minimum score for reporting a match. The higher the cut-off, the more stringent the search. Also + note that only matches that have passed the pre-filtering step (see "Cosine Similarity Percentile Cutoff") + are considered.`, }, { title: 'Window length', - content: `A moving average is used to detect local alignments within the full paths determined - by the traceback procedure. Increasing the window size results in longer local alignments, but - may decrease sensitivity. 
This parameter is not used in Global Alignment mode.`, + content: `A moving average is used to detect local alignments within the paths determined by the traceback + procedure. Increasing the window size results in longer local alignments, but may decrease sensitivity. + This parameter is not used in Global Alignment mode.`, }, { title: 'Merge hits', @@ -58,7 +57,8 @@ export default { }, { title: 'Max target hits', - content: `This parameter controls how many matches are displayed in the results.`, + content: `This parameter controls how many matches are displayed in the results. This filter is + applied after the alignment score cutoff filter.`, }, { title: 'Alignment mode', @@ -66,7 +66,8 @@ export default { }, { title: 'Minimal hit span', - content: `Specifies the minimum length of matches returned.`, + content: `Specifies the minimum length of matches returned. The minimum match length cannot be + less than the window length.`, }, { title: 'Sigma factor', @@ -75,6 +76,16 @@ export default { the algorithm stricter or more permissive, respectively. This parameter is not used in Global Alignment mode.`, }, + { + title: 'Bfactor', + content: `Using a Bfactor value greater than 1 will reduce the initial number of alignments by + ignoring alignments that are very close to each other, thus increasing the search speed. This + parameter is not used in Global Alignment mode.`, + }, + { + title: 'Gap extension penalty', + content: 'The penalty for extending a gap.', + }, ], references: `

pLM-BLAST – distant homology detection based on direct comparison of sequence diff --git a/frontend/src/i18n/lang/modules/tools-en.ts b/frontend/src/i18n/lang/modules/tools-en.ts index e9673c259..ddda1d4d7 100644 --- a/frontend/src/i18n/lang/modules/tools-en.ts +++ b/frontend/src/i18n/lang/modules/tools-en.ts @@ -136,6 +136,8 @@ export default { alignment_cutoff: 'Alignment score cut-off', win_len: 'Window length', merge_hits: 'Merge hits', + bfactor: "Bfactor", + plm_gap_ext: 'Gap extension penalty', }, }, inputPlaceholder: { diff --git a/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala b/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala index 10e88669f..5d7215acb 100644 --- a/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala +++ b/modules/tools/src/main/scala/de/proteinevolution/tools/parameters/ParamAccess.scala @@ -90,13 +90,17 @@ class ParamAccess @Inject() (tel: TEL) { // pLM-BLAST case "PLMBLASTDB" => select("plmblastdb", default = Some("ECOD30")) - case "COSINE_PERCENTILE_CUTOFF" => select("cosine_percentile_cutoff", default = Some("95")) + case "COSINE_PERCENTILE_CUTOFF" => select("cosine_percentile_cutoff", default = Some("70")) case "ALIGNMENT_CUTOFF" => select("alignment_cutoff", default = Some("0.30")) - case "WIN_LEN" => select("win_len", default = Some("10")) + case "WIN_LEN" => select("win_len", default = Some("15")) case "MERGE_HITS" => select("merge_hits", default = Some("1")) - case "PLM_ALN_MODE" => select("plm_aln_mode", default = Some("False")) + case "PLM_ALN_MODE" => select("plm_aln_mode", default = Some("loc")) case "SPAN" => select("span", default = Some("25")) case "SIGMA_FACTOR" => select("sigma_factor", default = Some("2")) + case "BFACTOR" => select("bfactor", default = Some("2")) + case "PLM_GAP_EXT" => select("plm_gap_ext", default = Some("0.5")) + + // HHpred case "TWOTEXTALIGNMENT" => diff --git a/tel/paramspec/PARAMS 
b/tel/paramspec/PARAMS index 93dd77acf..abc04732a 100644 --- a/tel/paramspec/PARAMS +++ b/tel/paramspec/PARAMS @@ -67,3 +67,5 @@ diamond_min_aln_cov GEN diamond_min_aln_cov.prop plm_aln_mode GEN plm_aln_mode.prop span GEN span.prop sigma_factor GEN sigma_factor.prop +plm_gap_ext GEN plm_gap_ext.prop +bfactor GEN bfactor.prop diff --git a/tel/paramspec/bfactor.prop b/tel/paramspec/bfactor.prop new file mode 100644 index 000000000..d2d6b61b2 --- /dev/null +++ b/tel/paramspec/bfactor.prop @@ -0,0 +1,3 @@ +1 1 +2 2 +5 5 diff --git a/tel/paramspec/cosine_percentile_cutoff.prop b/tel/paramspec/cosine_percentile_cutoff.prop index cb3e597dc..345b02b2d 100644 --- a/tel/paramspec/cosine_percentile_cutoff.prop +++ b/tel/paramspec/cosine_percentile_cutoff.prop @@ -1,3 +1,4 @@ +70 70 80 80 85 85 90 90 @@ -5,4 +6,4 @@ 96 96 97 97 98 98 -99 99 \ No newline at end of file +99 99 diff --git a/tel/paramspec/plm_aln_mode.prop b/tel/paramspec/plm_aln_mode.prop index e2470c57d..fa5abfa1c 100644 --- a/tel/paramspec/plm_aln_mode.prop +++ b/tel/paramspec/plm_aln_mode.prop @@ -1,2 +1,2 @@ -True Global -False Local \ No newline at end of file +glob Global +loc Local diff --git a/tel/paramspec/plm_gap_ext.prop b/tel/paramspec/plm_gap_ext.prop new file mode 100644 index 000000000..134be7f9f --- /dev/null +++ b/tel/paramspec/plm_gap_ext.prop @@ -0,0 +1,3 @@ +0 0 +0.5 0.5 +1 1 diff --git a/tel/runscripts/plmblast.sh b/tel/runscripts/plmblast.sh index 21808a1aa..4952e42d8 100644 --- a/tel/runscripts/plmblast.sh +++ b/tel/runscripts/plmblast.sh @@ -29,22 +29,23 @@ if [[ ${SEQ_COUNT} = "0" ]] ; then fi fi source ${BIOPROGS}/dependencies/anaconda3/etc/profile.d/conda.sh -conda activate plmblast +conda activate plm_blast + +set -e +export HF_HOME=$PLMBLASTPATH/cache echo "#Calculating embedding for query sequence." 
>> ../results/process.log # calculate index -python3.9 $PLMBLASTPATH/scripts/makeindex.py ../results/${JOBID}.fas ../results/${JOBID}.csv +#python $PLMBLASTPATH/scripts/makeindex.py ../results/${JOBID}.fas ../results/${JOBID}.csv # calculate query embedding -python3.9 $PLMBLASTPATH/embeddings.py \ +python3.10 $PLMBLASTPATH/embeddings.py start\ ../results/${JOBID}.fas \ ../results/${JOBID}.pt echo "done" >> ../results/process.log -set -e -export MKL_NUM_THREADS=1 -export NUMEXPR_NUM_THREADS=1 -export OMP_NUM_THREADS=1 + + echo "#Searching %plmblastdb.content." >> ../results/process.log @@ -55,32 +56,41 @@ else adjusted_span="%span.content" fi -python3.9 $PLMBLASTPATH/scripts/run_plm_blast.py %PLMBLAST/%plmblastdb.content \ +if [[ "%plm_aln_mode.content" = "glob" ]] +then + aln_mode="--global_aln" +else + aln_mode="" +fi + +python3.10 $PLMBLASTPATH/scripts/plmblast.py %PLMBLAST/%plmblastdb.content \ ../results/${JOBID} \ ../results/${JOBID}.hits.csv \ - -cosine_percentile_cutoff %cosine_percentile_cutoff.content \ + -cpc %cosine_percentile_cutoff.content \ -alignment_cutoff %alignment_cutoff.content \ - -max_targets %desc.content \ -workers %THREADS \ -sigma_factor %sigma_factor.content \ - -use_chunks \ -win %win_len.content \ -span ${adjusted_span} \ - --global_aln %plm_aln_mode.content + -gap_ext %plm_gap_ext.content \ + -bfactor %bfactor.content \ + ${aln_mode} echo "done" >> ../results/process.log +#-max_targets %desc.content \ + echo "#Preparing output." 
>> ../results/process.log if [[ %merge_hits.content = "1" ]] ; then # pLM-BLAST tends to yield rather short hits therefore it is beneficial to merge those associated # with a single database sequence; additionally, a more strict score cut-off is used - python3.9 $PLMBLASTPATH/scripts/merge.py ../results/${JOBID}.hits.csv \ - ../results/${JOBID}.hits_merged.csv + python3.10 $PLMBLASTPATH/scripts/merge.py ../results/${JOBID}.hits.csv \ + ../results/${JOBID}.hits_merged.csv -max_hits %desc.content mv ../results/${JOBID}.hits_merged.csv ../results/${JOBID}.hits.csv fi -python3.9 $PLMBLASTPATH/scripts/csv2nice.py ../results/${JOBID}.hits.csv > ../results/${JOBID}.hits.txt +python3.10 $PLMBLASTPATH/scripts/csv2nice.py ../results/${JOBID}.hits.csv > ../results/${JOBID}.hits.txt plmblast_csv_to_json.py ../results/${JOBID}.hits.csv ../results/results.json @@ -93,4 +103,4 @@ plmblastviz.pl ${JOBID} ../results/ ../results/ sed 's/[\.\-]//g' ../results/${JOBID}.fas > ../results/query.fas fasta2json.py ../results/query.fas ../results/query.json -echo "done" >> ../results/process.log \ No newline at end of file +echo "done" >> ../results/process.log