-
Notifications
You must be signed in to change notification settings - Fork 0
/
v-annotate.pl
executable file
·13875 lines (12812 loc) · 773 KB
/
v-annotate.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
# EPN, Wed May 1 10:18:55 2019 [renamed to v-annotate.pl]
# EPN, Thu Feb 18 12:48:16 2016 [dnaorg_annotate.pl split off from dnaorg_annotate_genomes.pl]
# EPN, Mon Aug 10 10:39:33 2015 [development began on dnaorg_annotate_genomes.pl]
#
use strict;
use warnings;
use Getopt::Long qw(:config no_auto_abbrev);
use Time::HiRes qw(gettimeofday);
use Bio::Easel::MSA;
use Bio::Easel::SqFile;
require "vadr.pm";
require "vadr_seed.pm";
require "sqp_opts.pm";
require "sqp_ofile.pm";
require "sqp_seq.pm";
require "sqp_seqfile.pm";
require "sqp_utils.pm";
#######################################################################################
# What this script does:
#
# Given an input sequence file, each sequence is compared against a
# library of homology models (CMs) and classified and annotated and
# determined to PASS or FAIL. Certain types of unexpected results are
# detected and reported in the output as 'alerts'. There are two
# classes of alerts: those that cause a sequence to FAIL and those
# that do not.
#
# The script proceeds through four main stages:
#
# (1) classification: each sequence is compared against the CM library
# using a relatively fast HMM scoring algorithm, and the highest
# scoring model in the library is defined as the winning CM for
# that sequence.
#
# (2) coverage determination: each sequence is compared against its
# winning model (only) for a second time using a more expensive HMM
# scoring algorithm that is local with respect to the model and
# sequence. This stage allows statistics related to the coverage of
# the sequence and model to be determined, and some alerts can be
# reported based on those statisics.
#
# (3) alignment/annotation: each sequence is aligned to its winning
# model using a still more expensive CM algorithm that takes into
# account secondary structure in the model (if any). This algorithm
# is aligns the full sequence either locally or globally with
# respect to the model. Features are then annotated based on the
# alignment coordinates and the known feature coordinates in the
# model (supplied via the modelinfo file).
#
# (4) protein validation: CDS features are then validated via
# blastx or hmmer by comparing predicted feature spans from (3) to
# pre-computed BLAST or HMMER databases for the model. Alerts can
# be reported based on the blast/hmmer results.
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Important options that change this behavior:
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
# Seeded alignment for faster processing (originally developed for
# SARS-CoV-2 sequences), enabled with the -s option:
#
# Stage 1 and 2 are performed by a single blastn search instead of two
# rounds of cmsearch. The top scoring HSP is identified and used
# (after potentially trimming) to 'seed' the alignment of that
# sequence by cmalign. This blastn alignmetn seed region is considered
# fixed and only the sequence before and after it (plus 100nt of
# overlap on each side, controllable with --s_overhang) is aligned
# separately by cmalign. The up to three alignments are then joined to
# get the final alignment.
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
# Replacement of stretches of Ns with the -r option:
#
# Stage 1 and 2 are run as pre-processing steps to identify stretches
# of each sequence that are N-rich and replace the Ns in them with the
# expected nucleotide from the best-matching model, where possible.
# After Ns are replaced, the new sequence with Ns replaced is analyzed
# by all four stages a new (so stages 1 and 2 are performed twice, once
# on the input sequences and once on those same sequences but with Ns
# replaced.
#
# By default, stages 1 and 2 are performed with blastn which in
# anecdotal (but not systematic) testing seems less likely then cmsearch
# to extend alignments through stretches of Ns (although this is
# probably controllable to an extent with command line options).
# However, with --r_prof, preprocessing stages 1 and 2 are performed
# with cmsearch.
#
#######################################################################################
#
# Alert identification:
#
# This script identifies and outputs a list of all alerts in each of
# the sequences it annotates. Each type of alert has an associated
# five letter alert 'code'. The list of alert codes is in the
# vadr.pm perl module file in the subroutine: vdr_AlertInfoInitialize().
#
# A table of alerts is output by v-annotate.pl when the --alt_list
# option is used.
#
# List of subroutines in which alerts are detected and added:
# 1. alert_add_ambgnt5s_ambgnt3s()
# ambgnt5s, ambgnt3s (2)
#
# 2. add_classification_alerts()
# noannotn, lowscore, indfclas, qstsbgrp, qstgroup, incsbgrp, incgroup, revcompl, lowcovrg, biasdseq (10)
#
# 3. alert_add_unexdivg()
# unexdivg (1)
#
# 4. parse_stk_and_add_alignment_cds_and_mp_alerts()
# indf5gap, indf5lcc, indf5lcn, indf3gap, indf3lcc, indf3lcn, deletinf, deletins, deletina (9)
#
# 5. fetch_features_and_add_cds_and_mp_alerts_for_one_sequence()
# mutstart, unexleng, mutendcd, mutendex, mutendns, cdsstopn, ambgnt5c, ambgnt3c, ambgnt5f, ambgnt3f (10)
#
# 6. add_protein_validation_alerts()
# indfantn, indfstrp, indf5plg, indf5pst, indf3plg, indf3pst, insertnp, deletinp, cdsstopp, indfantp (10)
#
# 7. alert_add_parent_based()
# peptrans (1)
#
# 8. add_low_similarity_alerts_for_one_sequence()
# lowsim5c, lowsim3c, lowsimic, lowsim5n, lowsim3n, lowsimin, lowsim5s, lowsim3s, lowsimis (9)
#
# 9. add_frameshift_alerts_for_one_sequence()
# fsthicft, fsthicfi, fstloft, fstlocfi, fstukcft, fstukcfi (6)
#
# 10. join_alignments_and_add_unjoinbl_alerts()
# unjoinbl (1)
#
# 12. output_feature_table()
# noftrann, noftrant, ftskipfl (1)
#
#######################################################################################
# make sure required environment variables are set
my $env_vadr_scripts_dir = utl_DirEnvVarValid("VADRSCRIPTSDIR");
my $env_vadr_model_dir = utl_DirEnvVarValid("VADRMODELDIR");
my $env_vadr_blast_dir = utl_DirEnvVarValid("VADRBLASTDIR");
my $env_vadr_infernal_dir = utl_DirEnvVarValid("VADRINFERNALDIR");
my $env_vadr_hmmer_dir = utl_DirEnvVarValid("VADRHMMERDIR");
my $env_vadr_easel_dir = utl_DirEnvVarValid("VADREASELDIR");
my $env_vadr_bioeasel_dir = utl_DirEnvVarValid("VADRBIOEASELDIR");
my $env_vadr_fasta_dir = utl_DirEnvVarValid("VADRFASTADIR");
my %execs_H = (); # hash with paths to all required executables
$execs_H{"cmalign"} = $env_vadr_infernal_dir . "/cmalign";
$execs_H{"cmemit"} = $env_vadr_infernal_dir . "/cmemit";
$execs_H{"cmfetch"} = $env_vadr_infernal_dir . "/cmfetch";
$execs_H{"cmsearch"} = $env_vadr_infernal_dir . "/cmsearch";
$execs_H{"hmmfetch"} = $env_vadr_hmmer_dir . "/hmmfetch";
$execs_H{"hmmscan"} = $env_vadr_hmmer_dir . "/hmmscan";
$execs_H{"hmmsearch"} = $env_vadr_hmmer_dir . "/hmmsearch";
$execs_H{"esl-alimerge"} = $env_vadr_easel_dir . "/esl-alimerge";
$execs_H{"esl-alimanip"} = $env_vadr_easel_dir . "/esl-alimanip";
$execs_H{"esl-reformat"} = $env_vadr_easel_dir . "/esl-reformat";
$execs_H{"esl-seqstat"} = $env_vadr_easel_dir . "/esl-seqstat";
$execs_H{"esl-translate"} = $env_vadr_easel_dir . "/esl-translate";
$execs_H{"esl-ssplit"} = $env_vadr_bioeasel_dir . "/scripts/esl-ssplit.pl";
$execs_H{"blastx"} = $env_vadr_blast_dir . "/blastx";
$execs_H{"blastn"} = $env_vadr_blast_dir . "/blastn";
$execs_H{"makeblastdb"} = $env_vadr_blast_dir . "/makeblastdb";
$execs_H{"parse_blast"} = $env_vadr_scripts_dir . "/parse_blast.pl";
$execs_H{"glsearch"} = $env_vadr_fasta_dir . "/glsearch36";
utl_ExecHValidate(\%execs_H, undef);
#########################################################
# Command line and option processing using sqp_opts.pm
#
# opt_HH: 2D hash:
# 1D key: option name (e.g. "-h")
# 2D key: string denoting type of information
# (one of "type", "default", "group", "requires", "incompatible", "preamble", "help")
# value: string explaining 2D key:
# "type": "boolean", "string", "integer" or "real"
# "default": default value for option
# "group": integer denoting group number this option belongs to
# "requires": string of 0 or more other options this option requires to work, each separated by a ','
# "incompatible": string of 0 or more other options this option is incompatible with, each separated by a ','
# "preamble": string describing option for preamble section (beginning of output from script)
# "help": string describing option for help section (printed if -h used)
# "setby": '1' if option set by user, else 'undef'
# "value": value for option, can be undef if default is undef
#
# opt_order_A: array of options in the order they should be processed
#
# opt_group_desc_H: key: group number (integer), value: description of group for help output
my %opt_HH = ();
my @opt_order_A = ();
my %opt_group_desc_H = ();
my $g = 0; # option group
# Add all options to %opt_HH and @opt_order_A.
# This section needs to be kept in sync (manually) with the &GetOptions call below
# option type default group requires incompat preamble-output help-output
opt_Add("-h", "boolean", 0, 0, undef, undef, undef, "display this help", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "basic options";
# option type default group requires incompat preamble-output help-output
opt_Add("-f", "boolean", 0, $g, undef, undef, "force directory overwrite", "force; if output dir exists, overwrite it", \%opt_HH, \@opt_order_A);
opt_Add("-v", "boolean", 0, $g, undef, undef, "be verbose", "be verbose; output commands to stdout as they're run", \%opt_HH, \@opt_order_A);
opt_Add("--atgonly", "boolean", 0, $g, undef, undef, "only consider ATG a valid start codon", "only consider ATG a valid start codon", \%opt_HH, \@opt_order_A);
opt_Add("--minpvlen", "integer", 30, $g, undef, undef, "min CDS/mat_peptide/gene length for feature table output and protein validation is <n>", "min CDS/mat_peptide/gene length for feature table output and protein validation is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--nkb", "integer", 300, $g, undef, undef, "number of KB of sequence for each alignment job and/or chunk is <n>", "number of KB of sequence for each alignment job and/or chunk is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--keep", "boolean", 0, $g, undef, undef, "leaving intermediate files on disk", "do not remove intermediate files, keep them all on disk", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for specifying classification";
# option type default group requires incompat preamble-output help-output
opt_Add("--group", "string", undef, $g, undef, undef, "set expected classification of all seqs to group <s>", "set expected classification of all seqs to group <s>", \%opt_HH, \@opt_order_A);
opt_Add("--subgroup", "string", undef, $g, "--group", undef, "set expected classification of all seqs to subgroup <s>", "set expected classification of all seqs to subgroup <s>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling severity of alerts";
# option type default group requires incompat preamble-output help-output
opt_Add("--alt_list", "boolean", 0, $g, undef, undef, "output summary of all alerts and exit", "output summary of all alerts and exit", \%opt_HH, \@opt_order_A);
opt_Add("--alt_pass", "string", undef, $g, undef, undef, "specify that alert codes in <s> do not cause FAILure", "specify that alert codes in comma-separated <s> do not cause FAILure", \%opt_HH, \@opt_order_A);
opt_Add("--alt_fail", "string", undef, $g, undef, undef, "specify that alert codes in <s> cause FAILure", "specify that alert codes in comma-separated <s> do cause FAILure", \%opt_HH, \@opt_order_A);
opt_Add("--alt_mnf_yes", "string", undef, $g, undef,"--ignore_mnf", "alert codes in <s> for 'misc_not_failure' features cause misc_feature-ization, not failure", "alert codes in <s> for 'misc_not_failure' features cause misc_feature-ization, not failure", \%opt_HH, \@opt_order_A);
opt_Add("--alt_mnf_no", "string", undef, $g, undef,"--ignore_mnf", "alert codes in <s> for 'misc_not_failure' features cause failure, not misc_feature-ization", "alert codes in <s> for 'misc_not_failure' features cause failure, not misc-feature-ization", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for ignoring specific keys in the input model info (.minfo) file";
# option type default group requires incompat preamble-output help-output
opt_Add("--ignore_mnf", "boolean", 0, $g, undef, undef, "ignore non-zero 'misc_not_failure' values in .minfo file, set to 0 for all features/models", "ignore non-zero 'misc_not_feature' values in .minfo file, set to 0 for all features/models", \%opt_HH, \@opt_order_A);
opt_Add("--ignore_isdel", "boolean", 0, $g, undef, undef, "ignore non-zero 'is_deletable' values in .minfo file, set to 0 for all features/models", "ignore non-zero 'is_deletable' values in .minfo file, set to 0 for all features/models", \%opt_HH, \@opt_order_A);
opt_Add("--ignore_afset", "boolean", 0, $g, undef, undef, "ignore 'alternative_ftr_set' and 'alternative_ftr_set_subn' values in .minfo file", "ignore 'alternative_ftr_set' and 'alternative_ftr_set_subn' values in .minfo file", \%opt_HH, \@opt_order_A);
opt_Add("--ignore_afsetsubn", "boolean", 0, $g, undef, undef, "ignore 'alternative_ftr_set_subn' values in .minfo file", "ignore 'alternative_ftr_set_subn' values in .minfo file", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options related to model files";
# option type default group requires incompat preamble-output help-output
opt_Add("-m", "string", undef, $g, undef, undef, "use CM file <s> instead of default", "use CM file <s> instead of default", \%opt_HH, \@opt_order_A);
opt_Add("-a", "string", undef, $g, "--pv_hmmer",undef, "use protein HMM file <s> instead of default", "use protein HMM file <s> instead of default", \%opt_HH, \@opt_order_A);
opt_Add("-i", "string", undef, $g, undef, undef, "use model info file <s> instead of default", "use model info file <s> instead of default", \%opt_HH, \@opt_order_A);
opt_Add("-n", "string", undef, $g, undef, undef, "use blastn db file <s> instead of default", "use blastn db file <s> instead of default", \%opt_HH, \@opt_order_A);
opt_Add("-x", "string", undef, $g, undef, undef, "blastx dbs are in dir <s>, instead of default", "blastx dbs are in dir <s>, instead of default", \%opt_HH, \@opt_order_A);
opt_Add("--mkey", "string","calici", $g, undef,"-m,-i,-a", ".cm, .minfo, blastn .fa files in \$VADRMODELDIR start with key <s>, not 'vadr'", ".cm, .minfo, blastn .fa files in \$VADRMODELDIR start with key <s>, not 'vadr'", \%opt_HH, \@opt_order_A);
opt_Add("--mdir", "string", undef, $g, undef, undef, "model files are in directory <s>, not in \$VADRMODELDIR", "model files are in directory <s>, not in \$VADRMODELDIR", \%opt_HH, \@opt_order_A);
opt_Add("--mlist", "string", undef, $g, undef, "-s", "only use models listed in file <s>", "only use models listed in file <s>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling output feature table";
# option type default group requires incompat preamble-output help-output
opt_Add("--nomisc", "boolean", 0, $g, undef, undef, "in feature table for failed seqs, never change feature type to misc_feature", "in feature table for failed seqs, never change feature type to misc_feature", \%opt_HH, \@opt_order_A);
opt_Add("--notrim", "boolean", 0, $g, undef, undef, "in feature table, don't trim coords due to ambiguities (for any feature types)", "in feature table, don't trim coords due to ambiguities (for any feature types)", \%opt_HH, \@opt_order_A);
opt_Add("--noftrtrim", "string", undef, $g, undef,"--notrim", "in feature table, don't trim coords due to ambiguities for ftr types in comma-delimited <s>", "in feature table, don't trim coords due to ambiguities for feature types in comma-delmited <s>", \%opt_HH, \@opt_order_A);
opt_Add("--noprotid", "boolean", 0, $g, undef, undef, "in feature table, don't add protein_id for CDS and mat_peptides", "in feature table, don't add protein_id for CDS and mat_peptides", \%opt_HH, \@opt_order_A);
opt_Add("--forceprotid", "boolean", 0, $g, undef,"--noprotid", "in feature table, force protein_id value to be sequence name, then idx", "in feature table, force protein_id value to be sequence name, then idx", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling thresholds related to alerts";
# option type default group requires incompat preamble-output help-output
opt_Add("--lowsc", "real", 0.3, $g, undef, undef, "lowscore/LOW_SCORE bits per nucleotide threshold is <x>", "lowscore/LOW_SCORE bits per nucleotide threshold is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--indefclass", "real", 0.03, $g, undef, undef, "indfclas/INDEFINITE_CLASSIFICATION bits per nucleotide diff threshold is <x>", "indfcls/INDEFINITE_CLASSIFICATION bits per nucleotide diff threshold is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--incspec", "real", 0.2, $g, undef, undef, "inc{group,subgrp}/INCORRECT_{GROUP,SUBGROUP}' bits/nt threshold is <x>", "inc{group,subgrp}/INCORRECT_{GROUP,SUBGROUP} bits/nt threshold is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--lowcov", "real", 0.9, $g, undef, undef, "lowcovrg/LOW_COVERAGE fractional coverage threshold is <x>", "lowcovrg/LOW_COVERAGE fractional coverage threshold is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--dupregolp", "integer", 20, $g, undef, undef, "dupregin/DUPLICATE_REGIONS minimum model overlap is <n>", "dupregin/DUPLICATE_REGIONS minimum model overlap is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--dupregsc", "real", 10, $g, undef, undef, "dupregin/DUPLICATE_REGIONS minimum bit score is <x>", "dupregin/DUPLICATE_REGIONS minimum bit score is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--indefstr", "real", 25, $g, undef, undef, "indfstrn/INDEFINITE_STRAND minimum weaker strand bit score is <x>", "indfstrn/INDEFINITE_STRAND minimum weaker strand bit score is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--lowsim5seq", "integer", 15, $g, undef, undef, "lowsim5s/LOW_SIMILARITY_START minimum length is <n>", "lowsim5s/LOW_SIMILARITY_START minimum length is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--lowsim3seq", "integer", 15, $g, undef, undef, "lowsim3s/LOW_SIMILARITY_END minimum length is <n>", "lowsim3s/LOW_SIMILARITY_END minimum length is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--lowsimiseq", "integer", 1, $g, undef, undef, "lowsimis/LOW_SIMILARITY (internal) minimum length is <n>", "lowsimi/LOW_SIMILARITY (internal) minimum length is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--lowsim5ftr", "integer", 5, $g, undef, undef, "lowsim5{c,n}/LOW_FEATURE_SIMILARITY_START minimum length is <n>", "lowsim5{c,n}/LOW_FEATURE_SIMILARITY_START minimum length is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--lowsim3ftr", "integer", 5, $g, undef, undef, "lowsim3{c,n}/LOW_FEATURE_SIMILARITY_END minimum length is <n>", "lowsim3{c,n}/LOW_FEATURE_SIMILARITY_END minimum length is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--lowsimiftr", "integer", 1, $g, undef, undef, "lowsimi{c,n}/LOW_FEATURE_SIMILARITY (internal) minimum length is <n>", "lowsimi{c,n}/LOW_FEATURE_SIMILARITY (internal) minimum length is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--biasfract", "real", 0.25, $g, undef, undef, "biasdseq/BIASED_SEQUENCE fractional threshold is <x>", "biasdseq/BIASED_SEQUENCE fractional threshold is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--nmiscftrthr","integer", 4, $g, undef, undef, "nmiscftr/TOO_MANY_MISC_FEATURES reported if <n> or more misc_features", "nmiscftr/TOO_MANY_MISC_FEATURES reported if <n> or more misc_features", \%opt_HH, \@opt_order_A);
opt_Add("--indefann", "real", 0.8, $g, undef, undef, "indf{5,3}lc{c,n}/INDEFINITE_ANNOTATION_{START,END} non-mat_peptide min allowed post probability is <x>", "indf{5,3}lc{c,n}/'INDEFINITE_ANNOTATION_{START,END} non-mat_peptide min allowed post probability is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--indefann_mp","real", 0.6, $g, undef, undef, "indf{5,3}lc{c,n}/INDEFINITE_ANNOTATION_{START,END} mat_peptide min allowed post probability is <x>", "indf{5,3}lc{c,n}/'INDEFINITE_ANNOTATION_{START,END} mat_peptide min allowed post probability is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--fstminntt", "integer", 4, $g, undef, undef, "fst{hi,lo,uk}cft/POSSIBLE_FRAMESHIFT{_{HIGH,LOW}_CONF,} max allowed terminal nt length w/o alert is <n>", "fst{hi,lo,uk}cft/POSSIBLE_FRAMESHIFT{_{HIGH,LOW}_CONF,} max allowed terminal nt length w/o alert is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--fstminnti", "integer", 6, $g, undef, undef, "fst{hi,lo,uk}cfi/POSSIBLE_FRAMESHIFT{_{HIGH,LOW}_CONF,} max allowed internal nt length w/o alert is <n>", "fst{hi,lo,uk}cfi/POSSIBLE_FRAMESHIFT{_{HIGH,LOW}_CONF,} max allowed internal nt length w/o alert is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--fsthighthr", "real", 0.8, $g, undef,"--glsearch", "fsthicf{t,i}/POSSIBLE_FRAMESHIFT_HIGH_CONF minimum average probability for alert is <x>", "fsthicf{t,i}/POSSIBLE_FRAMESHIFT_HIGH_CONF minimum average probability for alert is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--fstlowthr", "real", 0.0, $g, undef,"--glsearch", "fstlocf{t,i}/POSSIBLE_FRAMESHIFT_LOW_CONF minimum average probability for alert is <x>", "fstlocf{t,i}/POSSIBLE_FRAMESHIFT_LOW_CONF minimum average probability for alert is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--xalntol", "integer", 5, $g, undef, undef, "indf{5,3}{st,lg}/INDEFINITE_ANNOTATION_{START,END} max allowed nt diff blastx start/end is <n>", "indf{5,3}{st,lg}/INDEFINITE_ANNOTATION_{START,END} max allowed nt diff blastx start/end is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--xmaxins", "integer", 27, $g, undef,"--pv_skip,--pv_hmmer", "insertnp/INSERTION_OF_NT max allowed nucleotide insertion length in blastx validation is <n>", "insertnp/INSERTION_OF_NT max allowed nucleotide insertion length in blastx validation is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--xmaxdel", "integer", 27, $g, undef,"--pv_skip,--pv_hmmer", "deletinp/DELETION_OF_NT max allowed nucleotide deletion length in blastx validation is <n>", "deletinp/DELETION_OF_NT max allowed nucleotide deletion length in blastx validation is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--nmaxins", "integer", 27, $g, undef, undef, "insertnn/INSERTION_OF_NT max allowed nucleotide (nt) insertion length in CDS nt alignment is <n>", "insertnn/INSERTION_OF_NT max allowed nucleotide (nt) insertion length in CDS nt alignment is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--nmaxdel", "integer", 27, $g, undef, undef, "deletinn/DELETION_OF_NT max allowed nucleotide (nt) deletion length in CDS nt alignment is <n>", "deletinn/DELETION_OF_NT max allowed nucleotide (nt) deletion length in CDS nt alignment is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--xlonescore", "integer", 80, $g, undef,"--pv_skip,--pv_hmmer", "indfantp/INDEFINITE_ANNOTATION min score for a blastx hit not supported by CM analysis is <n>", "indfantp/INDEFINITE_ANNOTATION min score for a blastx hit not supported by CM analysis is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--hlonescore", "integer", 10, $g,"--pv_hmmer","--pv_skip", "indfantp/INDEFINITE_ANNOTATION min score for a hmmer hit not supported by CM analysis is <n>", "indfantp/INDEFINITE_ANNOTATION min score for a hmmer hit not supported by CM analysis is <n>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling cmalign alignment stage";
# option type default group requires incompat preamble-output help-output
opt_Add("--mxsize", "integer", 16000, $g, undef,"--glsearch", "set max allowed memory for cmalign to <n> Mb", "set max allowed memory for cmalign to <n> Mb", \%opt_HH, \@opt_order_A);
opt_Add("--tau", "real", 1E-3, $g, undef,"--glsearch", "set the initial tau value for cmalign to <x>", "set the initial tau value for cmalign to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--nofixedtau", "boolean", 0, $g, undef,"--glsearch", "do not fix the tau value when running cmalign, allow it to increase if nec", "do not fix the tau value when running cmalign, allow it to decrease if nec", \%opt_HH, \@opt_order_A);
opt_Add("--nosub", "boolean", 0, $g, undef,"--glsearch", "use alternative alignment strategy for truncated sequences", "use alternative alignment strategy for truncated sequences", \%opt_HH, \@opt_order_A);
opt_Add("--noglocal", "boolean", 0, $g,"--nosub","--glsearch", "do not run cmalign in glocal mode (run in local mode)", "do not run cmalign in glocal mode (run in local mode)", \%opt_HH, \@opt_order_A);
opt_Add("--cmindi", "boolean", 0, $g, undef, "--nkb,--glsearch", "force cmalign to align one seq at a time", "force cmalign to align on seq at a time", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling glsearch alignment stage as alternative to cmalign";
# option type default group requires incompat preamble-output help-output
opt_Add("--glsearch", "boolean", 0, $g,"--glsearch", undef, "align with glsearch from the FASTA package, not to a cm with cmalign", "align with glsearch from the FASTA package, not to a cm with cmalign", \%opt_HH, \@opt_order_A);
opt_Add("--gls_match", "integer", 5, $g,"--glsearch", undef, "set glsearch match score to <n> > 0 with glsearch -r option", "set glsearch match score to <n> > 0 with glsearch -r option", \%opt_HH, \@opt_order_A);
opt_Add("--gls_mismatch", "integer", -3, $g,"--glsearch", undef, "set glsearch mismatch score to <n> < 0 with glsearch -r option", "set glsearch mismatch score to <n> < 0 with glsearch -r option", \%opt_HH, \@opt_order_A);
opt_Add("--gls_gapopen", "integer", -17, $g,"--glsearch", undef, "set glsearch gap open score to <n> < 0 with glsearch -f option", "set glsearch gap open score to <n> < 0 with glsearch -f option", \%opt_HH, \@opt_order_A);
opt_Add("--gls_gapextend","integer", -4, $g,"--glsearch", undef, "set glsearch gap extend score to <n> < 0 with glsearch -g option", "set glsearch gap extend score to <n> < 0 with glsearch -g option", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for controlling blastx protein validation stage";
# option type default group requires incompat preamble-output help-output
opt_Add("--xmatrix", "string", undef, $g, undef,"--pv_skip,--pv_hmmer", "use the matrix <s> with blastx (e.g. BLOSUM45)", "use the matrix <s> with blastx (e.g. BLOSUM45)", \%opt_HH, \@opt_order_A);
opt_Add("--xdrop", "integer", 25, $g, undef,"--pv_skip,--pv_hmmer", "set the xdrop value for blastx to <n>", "set the xdrop value for blastx to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--xnumali", "integer", 20, $g, undef,"--pv_skip,--pv_hmmer", "number of alignments to keep in blastx output and consider if --xlongest is <n>", "number of alignments to keep in blastx output and consider if --xlongest is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--xlongest", "boolean", 0, $g, undef,"--pv_skip,--pv_hmmer", "keep the longest blastx hit, not the highest scoring one", "keep the longest blastx hit, not the highest scoring one", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for using hmmer instead of blastx for protein validation";
# option type default group requires incompat preamble-output help-output
opt_Add("--pv_hmmer", "boolean", 0, $g, undef, "--pv_skip", "use hmmer for protein validation, not blastx", "use hmmer for protein validation, not blastx", \%opt_HH, \@opt_order_A);
opt_Add("--h_max", "boolean", 0, $g, "--pv_hmmer", "--pv_skip", "use --max option with hmmsearch", "use --max option with hmmsearch", \%opt_HH, \@opt_order_A);
opt_Add("--h_minbit", "real", -10, $g, "--pv_hmmer", "--pv_skip", "set minimum hmmsearch bit score threshold to <x>", "set minimum hmmsearch bit score threshold to <x>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options related to blastn-derived seeded alignment acceleration";
# option type default group requires incompat preamble-output help-output
opt_Add("-s", "boolean", 0, $g, undef, undef, "use top-scoring HSP from blastn to seed the alignment", "use top-scoring HSP from blastn to seed the alignment", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastnws", "integer", 7, $g, "-s", undef, "for -s, set blastn -word_size <n> to <n>", "for -s, set blastn -word_size <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastnrw", "integer", 1, $g, "-s", undef, "for -s, set blastn -reward <n> to <n>", "for -s, set blastn -reward <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastnpn", "integer", -2, $g, "-s", undef, "for -s, set blastn -penalty <n> to <n>", "for -s, set blastn -penalty <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastngo", "integer", 2, $g, "-s","--s_blastngdf", "for -s, set blastn -gapopen <n> to <n>", "for -s, set blastn -gapopen <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastnge", "real", 1, $g, "-s","--s_blastngdf", "for -s, set blastn -gapextend <x> to <x>", "for -s, set blastn -gapextend <x> to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastngdf", "boolean", 0, $g, "-s", undef, "for -s, don't use -gapopen/-gapextend w/blastn (use default values)", "for -s, don't use -gapopen/-gapextend w/blastn (use default values)", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastnsc", "real", 50.0, $g, "-s", undef, "for -s, set blastn minimum HSP score to consider to <x>", "for -s, set blastn minimum HSP score to consider to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastntk", "boolean", 0, $g, "-s", undef, "for -s, set blastn option -task blastn", "for -s, set blastn option -task blastn", \%opt_HH, \@opt_order_A);
opt_Add("--s_blastnxd", "integer", 110, $g, "-s", undef, "for -s, set blastn option -xdrop_gap_final <n> to <n>", "for -s, set blastn -xdrop_gap_final <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--s_minsgmlen", "integer", 10, $g, "-s", undef, "for -s, set minimum length of ungapped region in HSP seed to <n>", "for -s, set minimum length of ungapped region in HSP seed to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--s_allsgm", "boolean", 0, $g, "-s", "--s_minsgmlen", "for -s, keep full HSP seed, do not enforce minimum segment length", "for -s, keep full HSP seed, do not enforce minimum segment length", \%opt_HH, \@opt_order_A);
opt_Add("--s_ungapsgm", "boolean", 0, $g, "-s", "--s_minsgmlen,--s_allsgm", "for -s, only keep max length ungapped segment of HSP", "for -s, only keep max length ungapped segment of HSP", \%opt_HH, \@opt_order_A);
opt_Add("--s_startstop", "boolean", 0, $g, "-s", "--s_ungapsgm", "for -s, allow seed to include gaps in start/stop codons", "for -s, allow seed to include gaps in start/stop codons", \%opt_HH, \@opt_order_A);
opt_Add("--s_overhang", "integer", 100, $g, "-s", undef, "for -s, set length of nt overhang for subseqs to align to <n>", "for -s, set length of nt overhang for subseqs to align to <n>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options related to replacing Ns with expected nucleotides";
# option type default group requires incompat preamble-output help-output
opt_Add("-r", "boolean", 0, $g, undef, undef, "replace stretches of Ns with expected nts, where possible", "replace stretches of Ns with expected nts, where possible", \%opt_HH, \@opt_order_A);
opt_Add("--r_minlen", "integer", 5, $g, "-r", undef, "minimum length subsequence to replace Ns in is <n>", "minimum length subsequence to replace Ns in is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--r_minfract5", "real", 0.25, $g, "-r", undef, "minimum fraction of Ns in subseq at 5' end to trigger replacement is <x>", "minimum fraction of Ns in subseq at 5' end to trigger replacement is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--r_minfract3", "real", 0.25, $g, "-r", undef, "minimum fraction of Ns in subseq at 3' end to trigger replacement is <x>", "minimum fraction of Ns in subseq at 3' end to trigger replacement is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--r_minfracti", "real", 0.5, $g, "-r", undef, "minimum fraction of Ns in internal subseq to trigger replacement is <x>", "minimum fraction of Ns in internal subseq to trigger replacement is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--r_diffno", "boolean", 0, $g, "-r", undef, "do not try replacement if seq and mdl regions are of different lengths", "do not try replacement if seq and mdl regions are of different lengths", \%opt_HH, \@opt_order_A);
opt_Add("--r_diffmaxdel", "integer", 10, $g, "-r","--r_diffno","max difference b/t seq/mdl regions (mdllen>seqlen) to try replacement is <n>", "max allowed length difference b/t seq/mdl regions (mdllen>seqlen) to try replacement is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--r_diffmaxins", "integer", 10, $g, "-r","--r_diffno","max difference b/t seq/mdl regions (seqlen>mdllen) to try replacement is <n>", "max allowed length difference b/t seq/mdl regions (seqlen>mdllen) to try replacement is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--r_diffminnonn","integer", 1, $g, "-r","--r_diffno","min number non-Ns in diff len replacement region to try replacement is <n>", "min number non-Ns in diff len replacement region to try replacement is <n>", \%opt_HH, \@opt_order_A);
opt_Add("--r_diffminfract", "real", 0.75, $g, "-r","--r_diffno","min allowed fraction of non-N matches in diff len replacement region is <x>", "min allowed fraction of non-N matches in diff len replacement region is <x>", \%opt_HH, \@opt_order_A);
opt_Add("--r_fetchr", "boolean", 0, $g, "-r", undef, "fetch features for output fastas from seqs w/Ns replaced, not originals", "fetch features for output fastas from seqs w/Ns replaced, not originals", \%opt_HH, \@opt_order_A);
opt_Add("--r_cdsmpr", "boolean", 0, $g, "-r", undef, "detect CDS and MP alerts in sequences w/Ns replaced, not originals", "detect CDS and MP alerts in sequences w/Ns replaced, not originals", \%opt_HH, \@opt_order_A);
opt_Add("--r_pvorig", "boolean", 0, $g, "-r", undef, "use original sequences for protein validation step, not replaced seqs", "use original sequences for protein validation, not replaced seqs", \%opt_HH, \@opt_order_A);
opt_Add("--r_prof", "boolean", 0, $g, "-r", undef, "use slower profile methods, not blastn, to identify Ns to replace", "use slower profile methods, not blastn, to identify Ns to replace", \%opt_HH, \@opt_order_A);
opt_Add("--r_list", "string", undef, $g, "-r", undef, "with -r, only use models listed in file <s> for N replacement stage", "with -r, only use models listed in file <s> for N replacement stage", \%opt_HH, \@opt_order_A);
opt_Add("--r_only", "string", undef, $g, "-r","--r_list","with -r, only use model named <s> for N replacement stage", "with -r, only use model named <s> for N replacement stage", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastnws", "integer", 7, $g, "-r", undef, "for -r, set blastn -word_size <n> to <n>", "for -r, set blastn -word_size <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastnrw", "integer", 1, $g, "-r", undef, "for -r, set blastn -reward <n> to <n>", "for -r, set blastn -reward <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastnpn", "integer", -2, $g, "-r", undef, "for -r, set blastn -penalty <n> to <n>", "for -r, set blastn -penalty <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastngo", "integer", 2, $g, "-r","--r_blastngdf", "for -r, set blastn -gapopen <n> to <n>", "for -r, set blastn -gapopen <n> to <n>", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastnge", "real", 1, $g, "-r","--r_blastngdf", "for -r, set blastn -gapextend <x> to <x>", "for -r, set blastn -gapextend <x> to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastngdf", "boolean", 0, $g, "-r", undef, "for -r, don't use -gapopen/-gapextend w/blastn (use default values)", "for -r, don't use -gapopen/-gapextend w/blastn (use default values)", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastnsc", "real", 50.0, $g, "-r", undef, "for -r, set blastn minimum HSP score to consider to <x>", "for -r, set blastn minimum HSP score to consider to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastntk", "boolean", 0, $g, "-r", undef, "for -r, set blastn option -task blastn", "for -r, set blastn option -task blastn", \%opt_HH, \@opt_order_A);
opt_Add("--r_blastnxd", "integer", 110, $g, "-r", undef, "for -r, set blastn option -xdrop_gap_final <n> to <n>", "for -r, set blastn -xdrop_gap_final <n> to <n>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options related to splitting input file into chunks and processing each chunk separately";
# option type default group requires incompat preamble-output help-output
opt_Add("--split", "boolean", 0, $g, undef, "-p", "split input file into chunks, run each chunk separately", "split input file into chunks, run each chunk separately", \%opt_HH, \@opt_order_A);
opt_Add("--cpu", "integer", 1, $g, undef, undef, "parallelize across <n> CPU workers (requires --split or --glsearch)", "parallelize across <n> CPU workers (requires --split or --glsearch)", \%opt_HH, \@opt_order_A);
opt_Add("--sidx", "integer", 1, $g, undef,"--split", "start sequence indexing at <n> in tabular output files", "start sequence indexing at <n> in tabular output files", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options related to parallelization on compute farm";
# option type default group requires incompat preamble-output help-output
opt_Add("-p", "boolean", 0, $g, undef, undef, "parallelize cmsearch/cmalign on a compute farm", "parallelize cmsearch/cmalign on a compute farm", \%opt_HH, \@opt_order_A);
opt_Add("-q", "string", undef, $g, "-p", undef, "use qsub info file <s> instead of default", "use qsub info file <s> instead of default", \%opt_HH, \@opt_order_A);
opt_Add("--errcheck", "boolean", 0, $g, "-p", undef, "consider any farm stderr output as indicating a job failure", "consider any farm stderr output as indicating a job failure", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options related to splitting input and parallelization on compute farm";
opt_Add("--wait", "integer", 500, $g, undef, undef, "allow <n> minutes for jobs on farm", "allow <n> wall-clock minutes for jobs on farm to finish, including queueing time", \%opt_HH, \@opt_order_A);
opt_Add("--maxnjobs", "integer", 2500, $g, undef, undef, "maximum allowed number of jobs for compute farm", "set max number of jobs to submit to compute farm to <n>", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "options for skipping stages";
# option type default group requires incompat preamble-output help-output
opt_Add("--pv_skip", "boolean", 0, $g, undef, undef, "do not perform blastx-based protein validation", "do not perform blastx-based protein validation", \%opt_HH, \@opt_order_A);
opt_Add("--align_skip", "boolean", 0, $g, undef, "-f", "skip the alignment step, use existing results", "skip the alignment step, use results from an earlier run of the script", \%opt_HH, \@opt_order_A);
opt_Add("--val_only", "boolean", 0, $g, undef, undef, "validate CM and other input files and exit", "validate CM and other input files and exit", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "optional output files";
# option type default group requires incompat preamble-output help-output
opt_Add("--out_stk", "boolean", 0, $g, undef,"--keep", "output per-model full length stockholm alignments (.stk)", "output per-model full length stockholm alignments (.stk)", \%opt_HH, \@opt_order_A);
opt_Add("--out_afa", "boolean", 0, $g, undef,"--keep", "output per-model full length fasta alignments (.afa)", "output per-model full length fasta alignments (.afa)", \%opt_HH, \@opt_order_A);
opt_Add("--out_rpstk", "boolean", 0, $g, "-r","--keep", "with -r, output stockholm alignments of seqs with Ns replaced", "with -r, output stockholm alignments of seqs with Ns replaced", \%opt_HH, \@opt_order_A);
opt_Add("--out_rpafa", "boolean", 0, $g, "-r","--keep", "with -r, output fasta alignments of seqs with Ns replaced", "with -r, output fasta alignments of seqs with Ns replaced", \%opt_HH, \@opt_order_A);
opt_Add("--out_fsstk", "boolean", 0, $g, undef,"--keep", "additionally output frameshift stockholm alignment files", "additionally output frameshift stockholm alignment files", \%opt_HH, \@opt_order_A);
opt_Add("--out_allfasta", "boolean", 0, $g, undef,"--keep", "additionally output fasta files of features", "additionally output fasta files of features", \%opt_HH, \@opt_order_A);
opt_Add("--out_nofasta", "boolean", 0, $g, undef,"--keep,--out_allfasta", "do not output fasta files of passing/failing seqs", "do not output fasta files of passing/failing seqs", \%opt_HH, \@opt_order_A);
opt_Add("--out_debug", "boolean", 0, $g, undef,"--split","dump voluminous info from various data structures to output files", "dump voluminous info from various data structures to output files", \%opt_HH, \@opt_order_A);
$opt_group_desc_H{++$g} = "other expert options";
# option type default group requires incompat preamble-output help-output
opt_Add("--execname", "string", undef, $g, undef, undef, "define executable name of this script as <s>", "define executable name of this script as <s>", \%opt_HH, \@opt_order_A);
opt_Add("--alicheck", "boolean", 0, $g, undef, undef, "for debugging, check aligned sequence vs input sequence for identity", "for debugging, check aligned sequence vs input sequence for identity", \%opt_HH, \@opt_order_A);
opt_Add("--noseqnamemax", "boolean", 0, $g, undef, undef, "do not enforce a maximum length of 50 for sequence names (GenBank max)", "do not enforce a maximum length of 50 for sequence names (GenBank max)", \%opt_HH, \@opt_order_A);
opt_Add("--minbit", "real", -10, $g, undef, undef, "set minimum cmsearch bit score threshold to <x>", "set minimum cmsearch bit score threshold to <x>", \%opt_HH, \@opt_order_A);
opt_Add("--origfa", "boolean", 0, $g, undef, undef, "do not copy fasta file prior to analysis, use original", "do not copy fasta file prior to analysis, use original", \%opt_HH, \@opt_order_A);
opt_Add("--msub", "string", undef, $g, undef, undef, "read model substitution file from <s>", "read model substitution file from <s>", \%opt_HH, \@opt_order_A);
opt_Add("--xsub", "string", undef, $g, undef, undef, "read blastx db substitution file from <s>", "read blastx db substitution file from <s>", \%opt_HH, \@opt_order_A);
opt_Add("--nodcr", "boolean", 0, $g, undef, undef, "do not doctor alignments to shift gaps in start/stop codons", "do not doctor alignments to shift gaps in start/stop codons", \%opt_HH, \@opt_order_A);
opt_Add("--forcedcrins", "boolean", 0, $g,"--cmindi", undef, "force insert type alignment doctoring, requires --cmindi", "force insert type alignment doctoring, requires --cmindi", \%opt_HH, \@opt_order_A);
opt_Add("--xnoid", "boolean", 0, $g, undef,"--pv_hmmer,--pv_skip", "ignore blastx hits that are full length and 100% identical", "ignore blastx hits that are full length and 100% identical", \%opt_HH, \@opt_order_A);
# This section needs to be kept in sync (manually) with the opt_Add() section above
my %GetOptions_H = ();
my $options_okay =
&GetOptions('h' => \$GetOptions_H{"-h"},
# basic options
'f' => \$GetOptions_H{"-f"},
'v' => \$GetOptions_H{"-v"},
'atgonly' => \$GetOptions_H{"--atgonly"},
'minpvlen=s' => \$GetOptions_H{"--minpvlen"},
'nkb=s' => \$GetOptions_H{"--nkb"},
'keep' => \$GetOptions_H{"--keep"},
# options for specifiying classification
'group=s' => \$GetOptions_H{"--group"},
'subgroup=s' => \$GetOptions_H{"--subgroup"},
# options for controlling which alerts cause failure
"alt_list" => \$GetOptions_H{"--alt_list"},
"alt_pass=s" => \$GetOptions_H{"--alt_pass"},
"alt_fail=s" => \$GetOptions_H{"--alt_fail"},
"alt_mnf_yes=s" => \$GetOptions_H{"--alt_mnf_yes"},
"alt_mnf_no=s" => \$GetOptions_H{"--alt_mnf_no"},
"ignore_mnf" => \$GetOptions_H{"--ignore_mnf"},
"ignore_isdel" => \$GetOptions_H{"--ignore_isdel"},
"ignore_afset" => \$GetOptions_H{"--ignore_afset"},
"ignore_afsetsubn" => \$GetOptions_H{"--ignore_afsetsubn"},
# options related to model files
'm=s' => \$GetOptions_H{"-m"},
'a=s' => \$GetOptions_H{"-a"},
'i=s' => \$GetOptions_H{"-i"},
'n=s' => \$GetOptions_H{"-n"},
'x=s' => \$GetOptions_H{"-x"},
'mkey=s' => \$GetOptions_H{"--mkey"},
'mdir=s' => \$GetOptions_H{"--mdir"},
'mlist=s' => \$GetOptions_H{"--mlist"},
# options for controlling output feature tables
"nomisc" => \$GetOptions_H{"--nomisc"},
"notrim" => \$GetOptions_H{"--notrim"},
"noftrtrim=s" => \$GetOptions_H{"--noftrtrim"},
"noprotid" => \$GetOptions_H{"--noprotid"},
"forceprotid" => \$GetOptions_H{"--forceprotid"},
# options for controlling alert thresholds
"lowsc=s" => \$GetOptions_H{"--lowsc"},
'indefclass=s' => \$GetOptions_H{"--indefclass"},
'incspec=s' => \$GetOptions_H{"--incspec"},
"lowcov=s" => \$GetOptions_H{"--lowcov"},
'dupregolp=s' => \$GetOptions_H{"--dupregolp"},
'dupregsc=s' => \$GetOptions_H{"--dupregsc"},
'indefstr=s' => \$GetOptions_H{"--indefstr"},
'lowsim5seq=s' => \$GetOptions_H{"--lowsim5seq"},
'lowsim3seq=s' => \$GetOptions_H{"--lowsim3seq"},
'lowsimiseq=s' => \$GetOptions_H{"--lowsimiseq"},
'lowsim5ftr=s' => \$GetOptions_H{"--lowsim5ftr"},
'lowsim3ftr=s' => \$GetOptions_H{"--lowsim3ftr"},
'lowsimiftr=s' => \$GetOptions_H{"--lowsimiftr"},
'biasfract=s' => \$GetOptions_H{"--biasfract"},
'nmiscftrthr=s' => \$GetOptions_H{"--nmiscftrthr"},
'indefann=s' => \$GetOptions_H{"--indefann"},
'indefann_mp=s' => \$GetOptions_H{"--indefann_mp"},
'fstminntt=s' => \$GetOptions_H{"--fstminntt"},
'fstminnti=s' => \$GetOptions_H{"--fstminnti"},
'fsthighthr=s' => \$GetOptions_H{"--fsthighthr"},
'fstlowthr=s' => \$GetOptions_H{"--fstlowthr"},
'xalntol=s' => \$GetOptions_H{"--xalntol"},
'xmaxins=s' => \$GetOptions_H{"--xmaxins"},
'xmaxdel=s' => \$GetOptions_H{"--xmaxdel"},
'nmaxins=s' => \$GetOptions_H{"--nmaxins"},
'nmaxdel=s' => \$GetOptions_H{"--nmaxdel"},
'xlonescore=s' => \$GetOptions_H{"--xlonescore"},
'hlonescore=s' => \$GetOptions_H{"--hlonescore"},
# options for controlling cmalign alignment stage
'mxsize=s' => \$GetOptions_H{"--mxsize"},
'tau=s' => \$GetOptions_H{"--tau"},
'nofixedtau' => \$GetOptions_H{"--nofixedtau"},
'nosub' => \$GetOptions_H{"--nosub"},
'noglocal' => \$GetOptions_H{"--noglocal"},
'cmindi' => \$GetOptions_H{"--cmindi"},
# options for controlling glsearch alignment stage
'glsearch' => \$GetOptions_H{"--glsearch"},
'gls_match=s' => \$GetOptions_H{"--gls_match"},
'gls_mismatch=s' => \$GetOptions_H{"--gls_mismatch"},
'gls_gapopen=s' => \$GetOptions_H{"--gls_gapopen"},
'gls_gapextend=s'=> \$GetOptions_H{"--gls_gapextend"},
# options for controlling protein blastx protein validation stage
'xmatrix=s' => \$GetOptions_H{"--xmatrix"},
'xdrop=s' => \$GetOptions_H{"--xdrop"},
'xnumali=s' => \$GetOptions_H{"--xnumali"},
'xlongest' => \$GetOptions_H{"--xlongest"},
# options for using hmmer instead of blastx for protein validation
'pv_hmmer' => \$GetOptions_H{"--pv_hmmer"},
'h_max' => \$GetOptions_H{"--h_max"},
'h_minbit=s' => \$GetOptions_H{"--h_minbit"},
# options related to blastn-based acceleration
's' => \$GetOptions_H{"-s"},
's_blastnws=s' => \$GetOptions_H{"--s_blastnws"},
's_blastnrw=s' => \$GetOptions_H{"--s_blastnrw"},
's_blastnpn=s' => \$GetOptions_H{"--s_blastnpn"},
's_blastngo=s' => \$GetOptions_H{"--s_blastngo"},
's_blastnge=s' => \$GetOptions_H{"--s_blastnge"},
's_blastngdf' => \$GetOptions_H{"--s_blastngdf"},
's_blastnsc=s' => \$GetOptions_H{"--s_blastnsc"},
's_blastntk' => \$GetOptions_H{"--s_blastntk"},
's_blastnxd=s' => \$GetOptions_H{"--s_blastnxd"},
's_minsgmlen=s' => \$GetOptions_H{"--s_minsgmlen"},
's_allsgm' => \$GetOptions_H{"--s_allsgm"},
's_ungapsgm' => \$GetOptions_H{"--s_ungapsgm"},
's_startstop' => \$GetOptions_H{"--s_startstop"},
's_overhang=s' => \$GetOptions_H{"--s_overhang"},
# options related to replacing Ns with expected nucleotides
'r' => \$GetOptions_H{"-r"},
'r_minlen=s' => \$GetOptions_H{"--r_minlen"},
'r_minfract5=s' => \$GetOptions_H{"--r_minfract5"},
'r_minfract3=s' => \$GetOptions_H{"--r_minfract3"},
'r_minfracti=s' => \$GetOptions_H{"--r_minfracti"},
'r_diffno' => \$GetOptions_H{"--r_diffno"},
'r_diffmaxdel=s' => \$GetOptions_H{"--r_diffmaxdel"},
'r_diffmaxins=s' => \$GetOptions_H{"--r_diffmaxins"},
'r_diffminnonn=s' => \$GetOptions_H{"--r_diffminnonn"},
'r_diffminfract=s' => \$GetOptions_H{"--r_diffminfract"},
'r_fetchr' => \$GetOptions_H{"--r_fetchr"},
'r_cdsmpr' => \$GetOptions_H{"--r_cdsmpr"},
'r_pvorig' => \$GetOptions_H{"--r_pvorig"},
'r_prof' => \$GetOptions_H{"--r_prof"},
'r_list=s' => \$GetOptions_H{"--r_list"},
'r_only=s' => \$GetOptions_H{"--r_only"},
'r_blastnws=s' => \$GetOptions_H{"--r_blastnws"},
'r_blastnrw=s' => \$GetOptions_H{"--r_blastnrw"},
'r_blastnpn=s' => \$GetOptions_H{"--r_blastnpn"},
'r_blastngo=s' => \$GetOptions_H{"--r_blastngo"},
'r_blastnge=s' => \$GetOptions_H{"--r_blastnge"},
'r_blastngdf' => \$GetOptions_H{"--r_blastngdf"},
'r_blastnsc=s' => \$GetOptions_H{"--r_blastnsc"},
'r_blastntk' => \$GetOptions_H{"--r_blastntk"},
'r_blastnxd=s' => \$GetOptions_H{"--r_blastnxd"},
# options related to splitting
'split' => \$GetOptions_H{"--split"},
'cpu=s' => \$GetOptions_H{"--cpu"},
'sidx=s' => \$GetOptions_H{"--sidx"},
# options related to parallelization
'p' => \$GetOptions_H{"-p"},
'q=s' => \$GetOptions_H{"-q"},
'errcheck' => \$GetOptions_H{"--errcheck"},
# options related to -p or --split
'wait=s' => \$GetOptions_H{"--wait"},
'maxnjobs=s' => \$GetOptions_H{"--maxnjobs"},
# options for skipping stages
'pv_skip' => \$GetOptions_H{"--pv_skip"},
'align_skip' => \$GetOptions_H{"--align_skip"},
'val_only' => \$GetOptions_H{"--val_only"},
# optional output files
'out_stk' => \$GetOptions_H{"--out_stk"},
'out_afa' => \$GetOptions_H{"--out_afa"},
'out_rpstk' => \$GetOptions_H{"--out_rpstk"},
'out_rpafa' => \$GetOptions_H{"--out_rpafa"},
'out_fsstk' => \$GetOptions_H{"--out_fsstk"},
'out_allfasta' => \$GetOptions_H{"--out_allfasta"},
'out_nofasta' => \$GetOptions_H{"--out_nofasta"},
'out_debug' => \$GetOptions_H{"--out_debug"},
# other expert options
'execname=s' => \$GetOptions_H{"--execname"},
'alicheck' => \$GetOptions_H{"--alicheck"},
'noseqnamemax' => \$GetOptions_H{"--noseqnamemax"},
'minbit=s' => \$GetOptions_H{"--minbit"},
'origfa' => \$GetOptions_H{"--origfa"},
'msub=s' => \$GetOptions_H{"--msub"},
'xsub=s' => \$GetOptions_H{"--xsub"},
'nodcr' => \$GetOptions_H{"--nodcr"},
'forcedcrins' => \$GetOptions_H{"--forcedcrins"},
'xnoid' => \$GetOptions_H{"--xnoid"});
my $total_seconds = -1 * ofile_SecondsSinceEpoch(); # by multiplying by -1, we can just add another secondsSinceEpoch call at end to get total time
my $execname_opt = $GetOptions_H{"--execname"};
my $executable = (defined $execname_opt) ? $execname_opt : "v-annotate.pl";
my $usage = "Usage: $executable [-options] <fasta file to annotate> <output directory to create>\n";
my $synopsis = "$executable :: classify and annotate sequences using a model library";
my $date = scalar localtime();
my $version = "1.4.1";
my $releasedate = "Jan 2022";
my $pkgname = "VADR";
# make *STDOUT file handle 'hot' so it automatically flushes whenever we print to it
# it is printed to
select *STDOUT;
$| = 1;
# print help and exit if necessary
if((! $options_okay) || ($GetOptions_H{"-h"})) {
ofile_OutputBanner(*STDOUT, $pkgname, $version, $releasedate, $synopsis, $date, undef);
opt_OutputHelp(*STDOUT, $usage, \%opt_HH, \@opt_order_A, \%opt_group_desc_H);
if(! $options_okay) { die "ERROR, unrecognized option;"; }
else { exit 0; } # -h, exit with 0 status
}
# set options in opt_HH
opt_SetFromUserHash(\%GetOptions_H, \%opt_HH);
# validate options (check for conflicts)
opt_ValidateSet(\%opt_HH, \@opt_order_A);
my $do_keep = opt_Get("--keep", \%opt_HH);
my $do_replace_ns = opt_Get("-r", \%opt_HH);
my $do_nofasta = opt_Get("--out_nofasta", \%opt_HH);
#######################################
# deal with --alt_list option, if used
#######################################
# initialize error related data structures, we have to do this early, so we can deal with --alt_list opition
my %alt_info_HH = ();
vdr_AlertInfoInitialize(\%alt_info_HH, undef);
if(opt_IsUsed("--alt_list",\%opt_HH)) {
alert_list_option(\%alt_info_HH, $pkgname, $version, $releasedate);
exit 0;
}
# check that number of command line args is correct
if(scalar(@ARGV) != 2) {
print "Incorrect number of command line arguments.\n";
print $usage;
print "\nTo see more help on available options, do $executable -h\n\n";
exit(1);
}
my ($orig_in_fa_file, $dir) = (@ARGV);
# enforce that --alt_pass and --alt_fail options are valid
if((opt_IsUsed("--alt_pass", \%opt_HH)) || (opt_IsUsed("--alt_fail", \%opt_HH))) {
alert_pass_fail_options(\%alt_info_HH, \%opt_HH);
}
# enforce that --alt_mnf_yes and --alt_mnf_no options are valid
if((opt_IsUsed("--alt_mnf_yes", \%opt_HH)) || (opt_IsUsed("--alt_mnf_no", \%opt_HH))) {
alert_misc_not_failure_options(\%alt_info_HH, \%opt_HH);
}
# enforce that --fsthighthr and --fstlowthr values make sense
if(opt_Get("--fsthighthr", \%opt_HH) < opt_Get("--fstlowthr", \%opt_HH)) {
if((opt_IsUsed("--fsthighthr", \%opt_HH)) && (opt_IsUsed("--fstlowthr", \%opt_HH))) {
die "ERROR if using --fsthighthr <x1> and --fstlowthr <x2>, <x1> must be > <x2>";
}
elsif(opt_IsUsed("--fsthighthr", \%opt_HH)) {
die "ERROR if using --fsthighthr <x>, <x> must be > " . opt_Get("--fstlowthr", \%opt_HH);
}
elsif(opt_IsUsed("--fstlowthr", \%opt_HH)) {
die "ERROR if using --fstlowthr <x>, <x> must be < " . opt_Get("--fsthighthr", \%opt_HH);
}
else {
die "ERROR, default value for --fsthighthr (" . opt_Get("--fsthighthr", \%opt_HH) . ") is less than default value for --fstlowthr (" . opt_Get("--fstlowthr", \%opt_HH) . ")";
}
}
# check for option requirements that sqp_opts is not sophisticated enough
# to check for:
if(opt_IsUsed("--wait", \%opt_HH)) {
if((! opt_IsUsed("-p", \%opt_HH)) && (! opt_IsUsed("--split", \%opt_HH))) {
die "ERROR, --wait only makes sense in combination with -p or --split";
}
}
if(opt_IsUsed("--maxnjobs", \%opt_HH)) {
if((! opt_IsUsed("-p", \%opt_HH)) && (! opt_IsUsed("--split", \%opt_HH))) {
die "ERROR, --maxnjobs only makes sense in combination with -p or --split";
}
}
if(opt_IsUsed("--cpu", \%opt_HH)) {
if((! opt_IsUsed("--glsearch", \%opt_HH)) && (! opt_IsUsed("--split", \%opt_HH))) {
die "ERROR, --cpu only makes sense in combination with --glsearch or --split";
}
}
# if --split and --out_afa, we require --out_stk
# if --split and --out_rpafa, we require --out_fpstk
# this is because we can't merge afa alignments, only stockholm alignments
# so if we want to merge afa alignments we need the stockholm equivalents
if(opt_IsUsed("--split", \%opt_HH)) {
if((opt_IsUsed("--out_afa", \%opt_HH)) && (! opt_IsUsed("--out_stk", \%opt_HH))) {
die "ERROR, with --split and --out_afa, --out_stk is also required";
}
if((opt_IsUsed("--out_rpafa", \%opt_HH)) && (! opt_IsUsed("--out_rpstk", \%opt_HH))) {
die "ERROR, with --split and --out_rpafa, --out_rpstk is also required";
}
}
# if--nmiscftrthr <n> is used, <n> must be >= 2
if((opt_IsUsed("--nmiscftrthr", \%opt_HH)) && (opt_Get("--nmiscftrthr", \%opt_HH) < 2)) {
die "ERROR, with --nmiscftrthr <n>, <n> must be >= 2";
}
#######################################################
# determine if we are running blastx, hmmer, and blastn
#######################################################
# set defaults, and change if nec
my $do_pv_blastx = 1;
my $do_pv_hmmer = 0;
if(opt_Get("--pv_skip", \%opt_HH)) {
$do_pv_blastx = 0;
$do_pv_hmmer = 0;
}
elsif(opt_Get("--pv_hmmer", \%opt_HH)) {
$do_pv_blastx = 0;
$do_pv_hmmer = 1;
}
my $do_blastn_rpn = (opt_Get("-r", \%opt_HH) && (! opt_Get("--r_prof", \%opt_HH))) ? 1 : 0;
my $do_blastn_cls = opt_Get("-s", \%opt_HH) ? 1 : 0;
my $do_blastn_cdt = opt_Get("-s", \%opt_HH) ? 1 : 0;
my $do_blastn_ali = opt_Get("-s", \%opt_HH) ? 1 : 0;
my $do_blastn_any = ($do_blastn_rpn || $do_blastn_cls || $do_blastn_cdt || $do_blastn_ali) ? 1 : 0;
# we have separate flags for each blastn stage even though
# they are all turned on/off with -s in case future changes
# only need some but not all
my $do_glsearch = opt_Get("--glsearch", \%opt_HH) ? 1 : 0;
#############################
# create the output directory
#############################
my $cmd; # a command to run with utl_RunCommand()
my @early_cmd_A = (); # array of commands we run before our log file is opened
if($dir !~ m/\/$/) { $dir =~ s/\/$//; } # remove final '/' if it exists
if(-d $dir) {
$cmd = "rm -rf $dir";
if(opt_Get("-f", \%opt_HH)) { utl_RunCommand($cmd, opt_Get("-v", \%opt_HH), 0, undef); push(@early_cmd_A, $cmd); }
else { die "ERROR directory named $dir already exists. Remove it, or use -f to overwrite it."; }
}
if(-e $dir) {
$cmd = "rm $dir";
if(opt_Get("-f", \%opt_HH)) { utl_RunCommand($cmd, opt_Get("-v", \%opt_HH), 0, undef); push(@early_cmd_A, $cmd); }
else { die "ERROR a file named $dir already exists. Remove it, or use -f to overwrite it."; }
}
# create the dir
$cmd = "mkdir $dir";
utl_RunCommand($cmd, opt_Get("-v", \%opt_HH), 0, undef);
push(@early_cmd_A, $cmd);
my $dir_tail = $dir;
$dir_tail =~ s/^.+\///; # remove all but last dir
my $out_root = $dir . "/" . $dir_tail . ".vadr";
#############################################
# output program banner and open output files
#############################################
# output preamble
my @arg_desc_A = ("sequence file", "output directory");
my @arg_A = ($orig_in_fa_file, $dir);
my %extra_H = ();
$extra_H{"\$VADRSCRIPTSDIR"} = $env_vadr_scripts_dir;
$extra_H{"\$VADRMODELDIR"} = $env_vadr_model_dir;
$extra_H{"\$VADRINFERNALDIR"} = $env_vadr_infernal_dir;
$extra_H{"\$VADREASELDIR"} = $env_vadr_easel_dir;
$extra_H{"\$VADRBIOEASELDIR"} = $env_vadr_bioeasel_dir;
if($do_pv_blastx || $do_blastn_any) {
$extra_H{"\$VADRBLASTDIR"} = $env_vadr_blast_dir;
}
if($do_glsearch) {
$extra_H{"\$VADRFASTADIR"} = $env_vadr_fasta_dir;
}
ofile_OutputBanner(*STDOUT, $pkgname, $version, $releasedate, $synopsis, $date, \%extra_H);
opt_OutputPreamble(*STDOUT, \@arg_desc_A, \@arg_A, \%opt_HH, \@opt_order_A);
# open the log and command files:
# set output file names and file handles, and open those file handles
my %ofile_info_HH = (); # hash of information on output files we created,
# 1D keys:
# "fullpath": full path to the file
# "nodirpath": file name, full path minus all directories
# "desc": short description of the file
# "FH": file handle to output to for this file, maybe undef
# 2D keys (at least initially)
# "log": log file of what's output to stdout
# "cmd": command file with list of all commands executed
# "list": file with list of all output files created
# open the log and command files
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "log", $out_root . ".log", 1, 1, "Output printed to screen");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "cmd", $out_root . ".cmd", 1, 1, "List of executed commands");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "list", $out_root . ".filelist", 1, 1, "List and description of all output files");
my $log_FH = $ofile_info_HH{"FH"}{"log"};
my $cmd_FH = $ofile_info_HH{"FH"}{"cmd"};
my $FH_HR = $ofile_info_HH{"FH"};
# output files are all open, if we exit after this point, we'll need
# to close these first.
# open optional output files
if(opt_Get("--out_debug", \%opt_HH)) {
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "ftrinfo", $out_root . ".ftrinfo", 1, 1, "per-model feature ftr_info_HAH data (created due to --out_debug)");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "sgminfo", $out_root . ".sgminfo", 1, 1, "per-model segment sgm_info_HAH data (created due to --out_debug)");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "altinfo", $out_root . ".altinfo", 1, 1, "per-alert-code alt_info_HH data (created due to --out_debug)");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "stgresults", $out_root . ".stgresults", 1, 1, "per-sequence stg_results_HHH data (created due to --out_debug)");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "ftrresults", $out_root . ".ftrresults", 1, 1, "per-sequence, per-feature ftr_results_HHAH data (created due to --out_debug)");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "sgmresults", $out_root . ".sgmresults", 1, 1, "per-sequence, per-segment sgm_results_HHAH data (created due to --out_debug)");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "altseqinstances", $out_root . ".altseqinstances", 1, 1, "per-sequence-alert alt_seq_instances_HH data (created due to --out_debug)");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "altftrinstances", $out_root . ".altftrinstances", 1, 1, "per-feature-alert alt_ftr_instances_HHH data (created due to --out_debug)");
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "clsoutput", $out_root . ".clsoutput", 1, 1, "per-sequence cls_output_HH data (created due to --out_debug)");
if(opt_Get("-s", \%opt_HH)) {
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "sdaoutput", $out_root . ".sdaoutput", 1, 1, "per-sequence sda_output_HH data (created due to --out_debug and -s)");
}
if(opt_Get("-r", \%opt_HH)) {
ofile_OpenAndAddFileToOutputInfo(\%ofile_info_HH, "rpnoutput", $out_root . ".rpnoutput", 1, 1, "per-sequence rpn_output_HH data (created due to --out_debug and -r)");
}
}
# now we have the log file open, output the banner there too
ofile_OutputBanner($log_FH, $pkgname, $version, $releasedate, $synopsis, $date, \%extra_H);
opt_OutputPreamble($log_FH, \@arg_desc_A, \@arg_A, \%opt_HH, \@opt_order_A);
# output any commands we already executed to $log_FH
foreach $cmd (@early_cmd_A) {
print $cmd_FH $cmd . "\n";
}
my $progress_w = opt_Get("--split", \%opt_HH) ? 105 : 87; # the width of the left hand column in our progress output, hard-coded
my $start_secs = ofile_OutputProgressPrior("Validating input", $progress_w, $log_FH, *STDOUT);
my @to_remove_A = (); # list of files to remove at end of subroutine, if --keep not used
###########################################
# Validate that we have all the files we need:
# fasta file
utl_FileValidateExistsAndNonEmpty($orig_in_fa_file, "input fasta sequence file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
my $opt_mdir_used = opt_IsUsed("--mdir", \%opt_HH);
my $opt_mkey_used = opt_IsUsed("--mkey", \%opt_HH);
my $opt_mlist_used = opt_IsUsed("--mlist", \%opt_HH);
my $opt_rlist_used = opt_IsUsed("--r_list", \%opt_HH);
my $opt_m_used = opt_IsUsed("-m", \%opt_HH);
my $opt_a_used = opt_IsUsed("-a", \%opt_HH);
my $opt_i_used = opt_IsUsed("-i", \%opt_HH);
my $opt_n_used = opt_IsUsed("-n", \%opt_HH);
my $opt_x_used = opt_IsUsed("-x", \%opt_HH);
my $opt_q_used = opt_IsUsed("-q", \%opt_HH);
my $opt_msub_used = opt_IsUsed("--msub", \%opt_HH);
my $opt_xsub_used = opt_IsUsed("--xsub", \%opt_HH);
my $model_key = opt_Get("--mkey", \%opt_HH); # special case, default value is set in option definition
my $model_dir = ($opt_mdir_used) ? opt_Get("--mdir", \%opt_HH) : $env_vadr_model_dir;
my $model_list = ($opt_mlist_used) ? opt_Get("--mlist", \%opt_HH) : undef;
my $replace_list = ($opt_rlist_used) ? opt_Get("--r_list", \%opt_HH) : undef;
my $cm_file = ($opt_m_used) ? opt_Get("-m", \%opt_HH) : $model_dir . "/" . $model_key . ".cm";
my $hmm_pt_file = ($opt_a_used) ? opt_Get("-a", \%opt_HH) : $model_dir . "/" . $model_key . ".hmm";
my $minfo_file = ($opt_i_used) ? opt_Get("-i", \%opt_HH) : $model_dir . "/" . $model_key . ".minfo";
my $blastn_db_file = ($opt_n_used) ? opt_Get("-n", \%opt_HH) : $model_dir . "/" . $model_key . ".fa";
my $blastx_db_dir = ($opt_x_used) ? opt_Get("-x", \%opt_HH) : $model_dir;
my $qsubinfo_file = ($opt_q_used) ? opt_Get("-q", \%opt_HH) : $env_vadr_scripts_dir . "/vadr.qsubinfo";
my $msub_file = ($opt_msub_used) ? opt_Get("--msub", \%opt_HH) : undef;
my $xsub_file = ($opt_xsub_used) ? opt_Get("--xsub", \%opt_HH) : undef;
my $cm_extra_string = "";
my $pthmm_extra_string = "";
my $minfo_extra_string = "";
my $blastn_extra_string = "";
my $blastx_extra_string = "";
my $qsubinfo_extra_string = "";
if($opt_mdir_used) { $cm_extra_string .= " --mdir"; $pthmm_extra_string .= " --mdir"; $minfo_extra_string .= " --mdir"; $blastn_extra_string .= " --mdir"; }
if($opt_mkey_used) { $cm_extra_string .= " --mkey"; $pthmm_extra_string .= " --mkey"; $minfo_extra_string .= " --mkey"; $blastn_extra_string .= " --mkey"; }
if($opt_m_used) { $cm_extra_string .= " -m"; }
if($opt_a_used) { $pthmm_extra_string .= " -a"; }
if($opt_i_used) { $minfo_extra_string .= " -i"; }
if($opt_n_used) { $blastn_extra_string .= " -n"; }
if($opt_x_used) { $blastx_extra_string .= " -x"; }
if($opt_q_used) { $qsubinfo_extra_string .= " -q"; }
# check for minfo file which we always need
utl_FileValidateExistsAndNonEmpty($minfo_file, sprintf("model info file%s", ($minfo_extra_string eq "") ? "" : ", due to $minfo_extra_string"), undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
# only check for cm file if we need it
if((! $do_glsearch) || (opt_Get("--r_prof", \%opt_HH)) || (opt_Get("--val_only", \%opt_HH))) {
utl_FileValidateExistsAndNonEmpty($cm_file, sprintf("CM file%s", ($cm_extra_string eq "") ? "" : ", due to $cm_extra_string"), undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
for my $sfx (".i1f", ".i1i", ".i1m", ".i1p") {
utl_FileValidateExistsAndNonEmpty($cm_file . $sfx, "cmpress created $sfx file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
}
# cm file must end in .cm, it's how cmalign_or_glsearch*() subroutines
# determine if they should run cmalign or glsearch.
if($cm_file !~ m/\.cm$/) {
ofile_FAIL("ERROR, CM file name must end in '.cm', but $cm_file does not", $cm_file, 1, $FH_HR);
}
}
# only check for blastn db file if we need it
if(($do_blastn_any) || ($do_replace_ns) || ($do_glsearch)) { # we always need this file if $do_replace_ns (-r) because we fetch the consensus model sequence from it
utl_FileValidateExistsAndNonEmpty($blastn_db_file, sprintf("blastn db file%s", ($blastn_extra_string eq "") ? "" : ", due to $blastn_extra_string"), undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
foreach my $sfx (".nhr", ".nin", ".nsq", ".ndb", ".not", ".nto", ".ntf") {
utl_FileValidateExistsAndNonEmpty($blastn_db_file . $sfx, "blastn $sfx file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
}
if(($do_glsearch) || (defined $replace_list) || (opt_IsUsed("--r_only", \%opt_HH))) {
foreach my $sfx (".ssi") { # for fetching seqs from
utl_FileValidateExistsAndNonEmpty($blastn_db_file . $sfx, "easel $sfx file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
}
}
}
# only check for blastx db if we need it
if($do_pv_blastx) {
$blastx_db_dir =~ s/\/$//; # remove trailing '/'
if(! -d $blastx_db_dir) {
ofile_FAIL(sprintf("ERROR, blast db directory $blastx_db_dir%s does not exist", $blastx_extra_string), 1, $FH_HR);
}
}
# only check for protein hmm file if we need it
if($do_pv_hmmer) {
utl_FileValidateExistsAndNonEmpty($hmm_pt_file, sprintf("HMM file%s", ($pthmm_extra_string eq "") ? "" : ", due to $cm_extra_string"), undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
for my $sfx (".h3f", ".h3i", ".h3m", ".h3p") {
utl_FileValidateExistsAndNonEmpty($hmm_pt_file . $sfx, "hmmpress created $sfx file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
}
}
# only check for qsubinfo file if we need it
my ($qsub_prefix, $qsub_suffix) = (undef, undef);
if(opt_IsUsed("-p", \%opt_HH)) {
utl_FileValidateExistsAndNonEmpty($qsubinfo_file, sprintf("qsub info file%s", ($qsubinfo_extra_string eq "") ? "" : ", specified with $qsubinfo_extra_string"), undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
# parse the qsubinfo file
($qsub_prefix, $qsub_suffix) = vdr_ParseQsubFile($qsubinfo_file, $ofile_info_HH{"FH"});
}
# only check for model list file if --mlist used
if(defined $model_list) {
utl_FileValidateExistsAndNonEmpty($model_list, "model list file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
}
# only check for model substitution file if --msub used
if(defined $msub_file) {
utl_FileValidateExistsAndNonEmpty($msub_file, "model substitution file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
}
# only check for blastx db substitution file if --xsub used
if(defined $xsub_file) {
utl_FileValidateExistsAndNonEmpty($xsub_file, "blastx db substitution file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
}
# only check for -r model list file if --r_list used
if(defined $replace_list) {
utl_FileValidateExistsAndNonEmpty($replace_list, "replacement model list file", undef, 1, \%{$ofile_info_HH{"FH"}}); # '1' says: die if it doesn't exist or is empty
}
###########################
# Parse the model info file
###########################
my @mdl_info_AH = (); # array of hashes with model info
my %ftr_info_HAH = (); # hash of array of hashes with feature info
my %sgm_info_HAH = (); # hash of array of hashes with segment info
my @reqd_mdl_keys_A = ("name", "length");
my @reqd_ftr_keys_A = ("type", "coords");
utl_FileValidateExistsAndNonEmpty($minfo_file, "model info file", undef, 1, $FH_HR);
vdr_ModelInfoFileParse($minfo_file, \@reqd_mdl_keys_A, \@reqd_ftr_keys_A, \@mdl_info_AH, \%ftr_info_HAH, $FH_HR);
# validate %mdl_info_AH
my $nmdl = utl_AHValidate(\@mdl_info_AH, \@reqd_mdl_keys_A, "ERROR reading model info from $minfo_file", $FH_HR);
my $mdl_idx;
# verify feature coords make sense and parent_idx_str is valid
for($mdl_idx = 0; $mdl_idx < $nmdl; $mdl_idx++) {
my $mdl_name = $mdl_info_AH[$mdl_idx]{"name"};
vdr_FeatureInfoValidateCoords(\@{$ftr_info_HAH{$mdl_name}}, $mdl_info_AH[$mdl_idx]{"length"}, $FH_HR);
}
# if --group or --subgroup used, make sure at least one model has that group/subgroup
my $exp_group = opt_Get("--group", \%opt_HH);
my $exp_subgroup = opt_Get("--subgroup", \%opt_HH);
if(opt_IsUsed("--group", \%opt_HH)) {
if(utl_AHCountKeyValue(\@mdl_info_AH, "group", $exp_group) == 0) {
ofile_FAIL("ERROR with --group $exp_group, did not read any models with group defined as $exp_group in model info file:\n$minfo_file", 1, $FH_HR);
}
}
if(opt_IsUsed("--subgroup", \%opt_HH)) {
if(! defined $exp_group) {
# opt_ValidateSet() will have enforced --subgroup requires --group, but we check again
ofile_FAIL("ERROR with --subgroup, the --group option must also be used", 1, $FH_HR);
}
if(utl_AHCountKeyValue(\@mdl_info_AH, "subgroup", $exp_subgroup) == 0) {
ofile_FAIL("ERROR with --group $exp_group and --subgroup $exp_subgroup,\ndid not read any models with group defined as $exp_group and subgroup defined as $exp_subgroup in model info file:\n$minfo_file", 1, $FH_HR);
}
}
# if --mlist used ($model_list will be defined) validate all models listed
# in $model_list are in model info file
if(defined $model_list) {
my $err_msg = "";
my %mdl_name_H = (); # key is a model name, value is 1
for($mdl_idx = 0; $mdl_idx < $nmdl; $mdl_idx++) {
$mdl_name_H{$mdl_info_AH[$mdl_idx]{"name"}} = 1;
}
my @mdl_name_list_A = (); # all models read from $model_list file
utl_FileLinesToArray($model_list, 1, \@mdl_name_list_A, $FH_HR);
foreach my $mdl (@mdl_name_list_A) {
if(! defined $mdl_name_H{$mdl}) {
$err_msg .= "$mdl\n";
}
}
if($err_msg ne "") {
ofile_FAIL("ERROR, the following models listed in $model_list do not exist in model info file $minfo_file:\n$err_msg\n", 1, $FH_HR);
}
}
# if --msub used ($msub_file) validate all models listed
# in $msub_file are in model info file
my %mdl_sub_H = ();
if(defined $msub_file) {
my $err_msg = validate_and_parse_sub_file($msub_file, \@mdl_info_AH, \%mdl_sub_H, $FH_HR);