-
Notifications
You must be signed in to change notification settings - Fork 6
/
validate_gff3.pl
231 lines (183 loc) · 8.62 KB
/
validate_gff3.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/perl
=head1 NAME
validate_gff3.pl
=head1 SYNOPSIS
validate_gff3.pl -gff3_file <gff3_file> [-ontology_file <ontology_file1> -ontology_file <ontology_file2> ...]
-out <out_file_prefix> -config <config_file>
[-db_type <db_type>] [-db_dir <db_dir>] [-dbname <dbname>] [-username <username>] [-password <password>]
[-verbose <1|2|3>] [-silent <0|1>] [-max_messages <number>]
=head1 DESCRIPTION
This script analyzes a gff3 file and validates a number of points. It uses the GFF3::Validator module for analysis. For
further information on analysis steps, please refer to validate_gff3.pod.
=head1 USAGE
The script uses a MySQL or SQLite database to analyze the gff3 file. The gff3 file is parsed and
content relevant to the analysis is loaded into the database. Using a database (as opposed to
performing the analysis in memory) makes processing of large files feasible and significantly
increases overall processing speed. At the end of the analysis, a
report is generated that lists errors and warnings ordered by line number. The report file
can be easily processed using grep and other Unix text processing tools.
The usage of the script follows with descriptions of command-line parameters:
validate_gff3.pl -gff3_file <gff3_file> [-ontology_file <ontology_file1> -ontology_file <ontology_file2> ...]
-out <out_file_prefix> -config <config_file>
[-db_type <db_type>] [-db_dir <db_dir>]
[-dbname <dbname>] [-username <username>] [-password <password>]
[-verbose <1|2|3>] [-silent <0|1>] [-max_messages <number>]
-gff3_file : (Required) Name of gff3 file to process.
-ontology_file : (Optional) Name of ontology file; multiple files can be specified.
Ontology files given on the command line and ontology files provided as directives
are merged and used for analysis. If none is provided, or none is accessible,
the default ontology file is retrieved and used.
-out : (Required) Prefix to name log and report files, these become <out>.log and <out>.report
-config : (Required) Name of config file (see documentation in validate_gff3.cfg provided in the package
for further details).
-db_type : (Optional) Type of database ('mysql' or 'sqlite').
Defaults to 'mysql'.
-db_dir : (Optional) Directory to store temp sqlite database files
If not available, retrieved from config file (temp_dir param)
-dbname : (Optional) Name of MySQL database/SQLite db file to use for analysis.
If not available, retrieved from config file.
If db_type is 'sqlite' and no dbname is specified and none available in config file, a temp db is used
-username : (Optional) Username for analysis database (must have write privileges).
If not available, retrieved from config file.
If not available, defaults to "".
-password : (Optional) Password for analysis database.
If not available, retrieved from config file.
If not available, defaults to "".
-verbose : (Optional) Verbosity of logging.
Values:
1: Initialization information
2: + Progress information
3: + Error messages
If not available, defaults to 2.
-silent : (Optional) Whether to suppress logging to screen
Values:
0: Log to screen
1: Don't log to screen
If not available, defaults to 0.
-max_messages : (Optional) Whether to report all errors/warnings
Values:
0: Report all messages
<number>: Exit and report after <number> messages
If not available, defaults to 0.
=cut
use strict;
use FindBin::Real qw(Bin);
use lib Bin() . "/lib";
use GFF3::Validator;
use Carp;
use Getopt::Long;
# Usage
# Message printed when option parsing fails or a required option is missing.
# Keep it in sync with the option table passed to GetOptions below and with
# the USAGE section of the POD (verbosity levels are documented there as 1-3).
my $usage = qq[$0 -gff3_file <gff3_file> [-ontology_file <ontology_file1> -ontology_file <ontology_file2> ...]
-out <out_file_prefix> -config <config_file>
[-db_type <db_type>] [-db_dir <db_dir>]
[-dbname <dbname>] [-username <username>] [-password <password>]
[-verbose <1|2|3>] [-silent <0|1>] [-max_messages <number>]];

# Parse command-line params
my $gff3_file;
my @ontology_files;
my $out;
my $config;
my $db_type;
my $db_dir;
my $dbname;
my $username;
my $password;
my $verbose;
my $silent;
my $max_messages;

GetOptions(
    "gff3_file=s" => \$gff3_file,

    # The documented flag is -ontology_file (repeatable, values accumulate in
    # the array). The plural spelling used previously is kept as an alias so
    # existing invocations continue to work.
    "ontology_file|ontology_files=s" => \@ontology_files,

    "out=s"          => \$out,
    "config=s"       => \$config,
    "db_type=s"      => \$db_type,
    "db_dir=s"       => \$db_dir,
    "dbname=s"       => \$dbname,
    "username=s"     => \$username,
    "password=s"     => \$password,
    "verbose=s"      => \$verbose,
    "silent=s"       => \$silent,
    "max_messages=s" => \$max_messages,
) or die("Usage: $usage\n");

# Check command-line params
# -gff3_file, -out and -config are the only required options.
if (!$gff3_file or !$out or !$config) {
    die("Usage: $usage\n");
}
# Parse config file
# Config::General is not imported at the top of this script, so load it on
# demand; require is a no-op when another module has already loaded it.
require Config::General;
my $config_obj = Config::General->new(-ConfigFile => $config, -CComments => 0);
my %config = $config_obj->getall;

# Populate defaults from config
# Database type: command-line value (case-insensitive), else 'mysql'.
$db_type = lc($db_type || '') || 'mysql';
croak("Unrecognized database type ($db_type)!") unless $db_type =~ /\A(?:mysql|sqlite)\z/;

# Directory for the temporary SQLite file: command line first, then the
# temp_dir config param. It is only required when a temp db is actually
# created (see the last branch below).
$db_dir ||= $config{temp_dir};

# Resolve the DBI datasource string:
#   - explicit -dbname        : build the DSN for the selected driver
#   - mysql without -dbname   : take the datasource from the config file
#   - sqlite without -dbname  : create a temporary db file in $db_dir
my $datasource;
if ($dbname) {
    $datasource = $db_type eq 'mysql'
        ? "DBI:mysql:dbname=$dbname"
        : "DBI:SQLite:dbname=$dbname";
}
elsif ($db_type eq 'mysql') {
    $datasource = $config{datasource};
}
else {
    croak("Cannot determine db dir!") unless $db_dir;

    # File::Temp is not imported at the top of this script; load it on demand.
    require File::Temp;

    # UNLINK => 1 asks File::Temp to remove the file when the program exits.
    my ($temp_fh, $temp_file) = File::Temp::tempfile("validate_gff3_sqlite_XXXXX",
                                                     DIR    => $db_dir,
                                                     SUFFIX => '.db',
                                                     UNLINK => 1);
    $datasource = "DBI:SQLite:dbname=$temp_file";
}
croak("Cannot determine database name!") unless $datasource;
# Prepare params
# Log and report files share the -out prefix and differ only in extension.
my $log_file    = $out . ".log";
my $report_file = $out . ".report";

# Create validator object
# Arguments are collected in an ordered list so they reach the constructor in
# the same sequence as before.
my @validator_args = (
    -config         => $config,
    -gff3_file      => $gff3_file,
    -datasource     => $datasource,
    -username       => $username,
    -password       => $password,
    -verbose        => $verbose,
    -silent         => $silent,
    -max_messages   => $max_messages,
    -log_file       => $log_file,
    -report_file    => $report_file,
    -ontology_files => \@ontology_files,
    -table_id       => "", # Currently do not use table id feature within the command-line version
);
my $validator = GFF3::Validator->new(@validator_args);
# Run the validation pipeline. Each entry names a method of the validator
# object; the methods are invoked without arguments, in exactly this order.
my @pipeline_steps = (
    'create_tables',            # Create/Reset tables to store the data
    'load_analysis_dbs',        # Load gff3 analysis database
    'validate_unique_ids',      # Validate unique ids
    'load_ontology',            # Load ontology(s) into memory
    'validate_ontology_terms',  # Validate ontology terms
    'validate_parentage',       # Validate parentage
    'validate_derives_from',    # Validate derives_from
    'dump_report',              # Dump an error report
);
for my $step (@pipeline_steps) {
    $validator->$step();
}

# Cleanup
# $validator->cleanup; # Currently, do not clean up within the command-line version
$validator->log("# [END]");
=head1 SEE ALSO
=head1 AUTHOR
Payan Canaran <canaran@cshl.edu>
=head1 VERSION
$Id: validate_gff3.pl,v 1.1.1.1 2010-01-25 15:46:09 tharris Exp $
=head1 CREDITS
- SQLite support adapted from patch contributed by Robert Buels <rmb32@cornell.edu>.
=head1 COPYRIGHT AND LICENSE
Copyright (c) 2006-2007 Cold Spring Harbor Laboratory
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself. See DISCLAIMER.txt for
disclaimers of warranty.
=cut
1;