-
Notifications
You must be signed in to change notification settings - Fork 0
/
blast_template.txt
executable file
·112 lines (100 loc) · 3.06 KB
/
blast_template.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env bash
#PBS -q omp
#PBS -l mem=@MEM@gb
#PBS -l ncpus=@CORE@
#PBS -l walltime=@WTIME@:00:00
#
# PBS job template running a single BLAST search.
# Tokens written as @NAME@ (here @MEM@, @CORE@, @WTIME@) are placeholders;
# presumably a generator replaces them with real values before the job is
# submitted -- confirm against the tool that instantiates this template.
#
# BLAST release to run; it is also used as the last component of WK_DIR.
BLAST_VER='2.2.31'
# Directory receiving the result files (rooted at $DATAWORK, which must be
# provided by the job environment).
WK_DIR="${DATAWORK}/blast/${BLAST_VER}"
# Query selection.
# Q_TYPE is one of: P (protein), N (nucleotide) or M (nucleotide, to be
# searched with Megablast). QUERY is set to the FASTA formatted sequence
# file matching that type; any other value aborts the job with status 1.
Q_TYPE=@QTYPE@
if [[ "$Q_TYPE" == "P" ]]; then
  # protein query set
  QUERY="/home/ref-bioinfo/beedeem/p/Genbank_Yersinia_Angola_prot/current/Genbank_Yersinia_Angola_prot/Genbank_Yersinia_Angola_prot00"
elif [[ "$Q_TYPE" == "N" || "$Q_TYPE" == "M" ]]; then
  # nucleotide query set, shared by the N and M query types
  QUERY="/home/ref-bioinfo/beedeem/n/Genbank_Yersinia_Angola_CDS/current/Genbank_Yersinia_Angola_CDS/Genbank_Yersinia_Angola_CDS00"
else
  echo "ERROR: unknown query type: $Q_TYPE"
  exit 1
fi
# Subject (databank) selection.
# S_TYPE is one of:
#   P or PS : protein, small databank
#   PL      : protein, large databank
#   N or NS : nucleotide, small databank
#   NL      : nucleotide, large databank
# SUBJECT is set to the BLAST databank alias (.pal or .nal file, given
# without its extension) and SUBJECT_NAME to a short label for that
# databank. Any other value aborts the job with status 1.
S_TYPE=@STYPE@
case "$S_TYPE" in
  P | PS)
    SUBJECT_NAME="swiss"
    SUBJECT="/home/ref-bioinfo/beedeem/p/Uniprot_SwissProt/current/Uniprot_SwissProt/Uniprot_SwissProtM"
    ;;
  PL)
    SUBJECT_NAME="trembl"
    SUBJECT="/home/ref-bioinfo/beedeem/p/Uniprot_TrEMBL/current/Uniprot_TrEMBL/Uniprot_TrEMBLM"
    ;;
  N | NS)
    SUBJECT_NAME="gb_bact"
    SUBJECT="/home/ref-bioinfo/beedeem/n/Genbank_Bacteria/current/Genbank_Bacteria/Genbank_BacteriaM"
    ;;
  NL)
    SUBJECT_NAME="nt"
    SUBJECT="/home/ref-bioinfo/beedeem/n/NCBI_nt/current/NCBI_nt/NCBI_ntM"
    ;;
  *)
    echo "ERROR: unknown subject type: $S_TYPE"
    exit 1
    ;;
esac
# BLAST output format, passed to BLAST as its -outfmt value.
# 5: XML ; 6: tabular ; 9: binary ASN.1
# (full list: https://www.ncbi.nlm.nih.gov/books/NBK279684/ , Table C1)
# ASN.1 was chosen with the intent of deriving the other formats afterwards
# with the blast_formatter tool.
# NOTE(review): the NCBI manual describes blast_formatter as reading the
# BLAST archive format (-outfmt 11); confirm that format 9 output can really
# be converted as intended.
FORMAT=9
# File extension for the result file: "out" unless the format is a known one.
FORMAT_EXT="out"
case "$FORMAT" in
  5) FORMAT_EXT="xml" ;;
  6) FORMAT_EXT="tab" ;;
  9) FORMAT_EXT="asn1" ;;
esac
# BLAST command to use.
# The first letter of the query type (P, N or M) and of the subject type
# (P or N) selects the program:
#   P vs P -> blastp
#   N vs P -> blastx
#   M vs N -> blastn with its default task, i.e. megablast
#   N vs N -> blastn forced to the classic algorithm (-task blastn)
# Results:
#   BLAST_CMD      : program name, optionally followed by options
#   BLAST_CMD_NAME : short label naming the algorithm actually run
# Any other pair (for instance a protein query against a nucleotide
# databank) has no program configured here and aborts the job with status 1
# instead of leaving a placeholder command to be executed later.
QT=${Q_TYPE:0:1}
ST=${S_TYPE:0:1}
case "${QT}${ST}" in
  PP)
    BLAST_CMD="blastp"
    BLAST_CMD_NAME="blastp"
    ;;
  NP)
    BLAST_CMD="blastx"
    BLAST_CMD_NAME="blastx"
    ;;
  MN)
    # Megablast: blastn without -task runs the megablast algorithm
    BLAST_CMD="blastn"
    BLAST_CMD_NAME="megablast"
    ;;
  NN)
    # classic blastn has to be requested explicitly
    BLAST_CMD="blastn -task blastn"
    BLAST_CMD_NAME="blastn"
    ;;
  *)
    echo "ERROR: unsupported query/subject combination: $Q_TYPE/$S_TYPE"
    exit 1
    ;;
esac
# Result file names. Both live in WK_DIR and share one stem built from the
# query type, the subject type and the requested memory / core count:
#   TIMEFNAME : resource usage report, "<stem>.time"
#   OUTFNAME  : BLAST result, "<stem>.<FORMAT_EXT>"
TIMEFNAME="${WK_DIR}/${Q_TYPE}-${S_TYPE}-@MEM@gb-@CORE@c.time"
# same stem as TIMEFNAME, with ".time" swapped for the BLAST output extension
OUTFNAME="${TIMEFNAME%.time}.${FORMAT_EXT}"
# Format string for the time command, exported as TIME; the %-codes are
# expanded by time itself (see its manual for the meaning of each one).
TIME="U=%U;S=%S;E=%E;P=%P;M=%M;K=%K;F=%F;W=%W;I=%I;O=%O"
export TIME
# Load BLAST Conda environment
. "/appli/bioinfo/blast/${BLAST_VER}/env.sh"
# Make sure the directory receiving the result and timing files exists;
# otherwise neither time (-o) nor BLAST (-out) can write its output.
if ! mkdir -p -- "$(dirname -- "$OUTFNAME")"; then
  echo "ERROR: cannot create output directory for: $OUTFNAME"
  exit 1
fi
# BLAST_CMD may hold a program name followed by options
# (e.g. "blastn -task blastn"): split it into separate words once, here.
read -r -a blast_words <<< "$BLAST_CMD"
# Prepare the BLAST command as an argument vector rather than a string, so
# that every value is passed verbatim (no re-parsing through eval, no word
# splitting or globbing of paths). BLAST runs under /usr/bin/time, which
# writes its resource usage report to TIMEFNAME using the TIME format.
CMD=(
  /usr/bin/time -o "$TIMEFNAME" --format="$TIME"
  "${blast_words[@]}"
  -query "$QUERY"
  -db "$SUBJECT"
  -out "$OUTFNAME"
  -outfmt "$FORMAT"
  -max_target_seqs 1
  -evalue 1e-3
  -num_threads @CORE@
)
# dump the command to be executed in the PBS log for this job
# (%q prints each word shell-quoted, so the line can be pasted back as is)
printf '%q ' "${CMD[@]}"
printf '\n'
# eventually, run BLAST! Its exit status becomes the status of the job script.
"${CMD[@]}"