-
Notifications
You must be signed in to change notification settings - Fork 34
/
order_fastx.pl
390 lines (279 loc) · 13.5 KB
/
order_fastx.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
#!/usr/bin/perl
#######
# POD #
#######
=pod
=head1 NAME
C<order_fastx.pl> - order sequences in FASTA or FASTQ files
=head1 SYNOPSIS
C<perl order_fastx.pl -i infile.fasta -l order_id_list.txt
E<gt> ordered.fasta>
=head1 DESCRIPTION
Order sequence entries in FASTA or FASTQ sequence files according to
an ID list with a given order. Beware, the IDs in the order list
have to be B<identical> to the entire IDs in the sequence file.
However, the ">" or "@" ID identifiers of FASTA or FASTQ files,
respectively, can be omitted in the ID list.
The file type is detected automatically. But, you can set the file
type manually with option B<-f>. FASTQ format assumes B<four> lines
per read, if this is not the case run the FASTQ file through
L<C<fastx_fix.pl>|/fastx_fix> or use Heng Li's L<C<seqtk
seq>|https://github.com/lh3/seqtk>:
C<seqtk seq -l 0 infile.fq E<gt> outfile.fq>
The script can also be used to pull a subset of sequences in the ID
list from the sequence file. Probably best to set option flag B<-s>
in this case, see L<"Optional options"> below. But, rather use
L<C<filter_fastx.pl>|/filter_fastx>.
=head1 OPTIONS
=head2 Mandatory options
=over 20
=item B<-i>=I<str>, B<-input>=I<str>
Input FASTA or FASTQ file
=item B<-l>=I<str>, B<-list>=I<str>
List with sequence IDs in specified order
=back
=head2 Optional options
=over 20
=item B<-h>, B<-help>
Help (perldoc POD)
=item B<-f>=I<fasta|fastq>, B<-file_type>=I<fasta|fastq>
Set the file type manually
=item B<-e>, B<-error_files>
Write missing IDs in the seq file or the order ID list without an
equivalent in the other to error files instead of C<STDERR> (see
L<"OUTPUT"> below)
=item B<-s>, B<-skip_errors>
Skip missing ID error statements, excludes option B<-e>
=item B<-v>, B<-version>
Print version number to C<STDERR>
=back
=head1 OUTPUT
=over 20
=item C<STDOUT>
The newly ordered sequences are printed to C<STDOUT>. Redirect or
pipe into another tool as needed.
=item (F<order_ids_missing.txt>)
If IDs in the order list are missing in the sequence file with
option B<-e>
=item (F<seq_ids_missing.txt>)
If IDs in the sequence file are missing in the order ID list with
option B<-e>
=back
=head1 EXAMPLES
=over
=item C<perl order_fastx.pl -i infile.fq -l order_id_list.txt -s -f
fastq E<gt> ordered.fq>
=item C<perl order_fastx.pl -i infile.fasta -l order_id_list.txt -e
E<gt> ordered.fasta>
=back
=head1 VERSION
0.1 20-11-2014
=head1 AUTHOR
Andreas Leimbach aleimba[at]gmx[dot]de
=head1 LICENSE
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 (GPLv3) of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see L<http://www.gnu.org/licenses/>.
=cut
########
# MAIN #
########
use strict;
use warnings;
use autodie;
use Getopt::Long;
use Pod::Usage;
### Get the options with Getopt::Long
my $Seq_File; # sequence file to order sequences in
my $Order_List; # order ID list for seq file
my $File_Type; # set file type; otherwise detect file type by file extension
my $Opt_Error_Files; # print missing IDs not found in order list or seq file to error files instead of STDERR
my $Opt_Skip_Errors; # skip missing IDs error statements/files
my $VERSION = 0.1;
my ($Opt_Version, $Opt_Help);
GetOptions ('input=s' => \$Seq_File,
'list=s' => \$Order_List,
'file_type=s' => \$File_Type,
'error_files' => \$Opt_Error_Files,
'skip_errors' => \$Opt_Skip_Errors,
'version' => \$Opt_Version,
'help|?' => \$Opt_Help);
### Run perldoc on POD
pod2usage(-verbose => 2) if ($Opt_Help);
die "$0 $VERSION\n" if ($Opt_Version);
if (!$Seq_File || !$Order_List) {
my $warning = "\n### Fatal error: Options '-i' or '-l' or their arguments are missing!\n";
pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
}
### Enforce mandatory or optional options
die "\n### Fatal error:\nUnknown file type '$File_Type' given with option '-f'. Please choose from either 'fasta' or 'fastq'!\n" if ($File_Type && $File_Type !~ /(fasta|fastq)/i);
warn "\n### Warning:\nIgnoring option flag '-e', because option '-s' set at the same time!\n\n" if ($Opt_Error_Files && $Opt_Skip_Errors);
### Order input FASTA/FASTQ file according to a given list
open (my $Order_List_Fh, "<", "$Order_List");
open (my $Input_Fh, "<", "$Seq_File"); # pipe from STDIN not working because of 'seek' on filehandle
get_file_type() if (!$File_Type); # subroutine to determine file type by file extension
my %Order_List_IDs; # store order IDs and indicate if found in seq file
my %Seq_File_IDs; # store seq file IDs and indicate if present in order list
my $Next_Fasta_ID; # for multi-line FASTA input files to store next entry header/ID line while parsing in subroutine 'get_fastx_entry'
my $Parse_Run = 1; # indicate FIRST parsing cycle through seq file to collect all seq IDs
while (my $ord_id = <$Order_List_Fh>) {
chomp $ord_id;
next if ($ord_id =~ /^\s*$/); # skip emtpy lines in order list
$ord_id =~ s/^(>|@)//; # remove ">/@" for WHOLE string regex match -> ID in order list can be given with ">/@" or without (will be appended again in print)
if ($Order_List_IDs{$ord_id}) {
die "\n### Fatal error:\n'$ord_id' exists several times in '$Order_List' and IDs should be unique!\n";
} else {
$Order_List_IDs{$ord_id} = 1; # changes to 2 if ID was found in seq file
}
while (<$Input_Fh>) {
if (/^\s*$/) { # skip empty lines in input
warn "\n### Warning:\nFASTQ file includes empty lines, which is unusual. Parsing the FASTQ reads might fail so check the output file afterwards if the script didn't quit with a fatal error. However, consider running the input FASTQ file through 'fix_fastx.pl'!\n\n" if ($File_Type =~ /fastq/i);
next;
}
chomp;
# FASTA file
if ($File_Type =~ /fasta/i) {
$_ = get_fastx_entry($_); # subroutine to read one FASTA sequence entry (seq in multi-line or not), returns anonymous array
# FASTQ file
} elsif ($File_Type =~ /fastq/i) {
$_ = get_fastx_entry($_); # subroutine to read one FASTQ read composed of FOUR mandatory lines, returns reference to array
}
if ($Seq_File_IDs{$_->[0]} && $Parse_Run == 1) { # only for first parse cycle, subsequent parsings of course will find the same IDs
die "\n### Fatal error:\n'$_->[0]' exists several times in '$Seq_File' and IDs should be unique!\n";
} elsif (!$Seq_File_IDs{$_->[0]}) {
$Seq_File_IDs{$_->[0]} = 1; # changes to 2 if present in order list
}
if ($ord_id =~ /^$_->[0]$/) { # order ID hit in seq file with the WHOLE string; de-reference array
$Order_List_IDs{$ord_id} = 2; # set to ID found
$Seq_File_IDs{$_->[0]} = 2;
print ">$_->[0]\n$_->[1]\n\n" if ($File_Type =~ /fasta/i); # print seq entry to STDOUT
print "\@$_->[0]\n$_->[1]\n$_->[2]\n$_->[3]\n" if ($File_Type =~ /fastq/i);
next if ($Parse_Run == 1); # parse the complete seq file once (skip 'last' below) to collect all seq IDs
last; # jump out of seq file 'while'
}
}
# rewind seq file for next order list ID
$Next_Fasta_ID = '';
seek $Input_Fh, 0, 0;
$. = 0; # set line number of seq file to 0 (seek doesn't do it automatically)
$Parse_Run = 0;
}
close $Input_Fh;
close $Order_List_Fh;
### Print order and seq IDs that were not found in seq file or in order list, resp.
if (!$Opt_Skip_Errors) {
# order IDs not found in seq file
missing_IDs(\%Order_List_IDs, 'order_ids_missing.txt', 'order'); # subroutine to identify and print missing IDs
# seq file IDs not found in order list
missing_IDs(\%Seq_File_IDs, 'seq_ids_missing.txt', 'sequence'); # subroutine
}
exit;
#############
#Subroutines#
#############
### Test for output file existence and give warning to STDERR
sub file_exist {
my $file = shift;
if (-e $file) {
warn "\n### Warning:\nThe error file '$file' exists already, the current errors will be appended to the existing file!\n";
return 1;
}
return 0;
}
### Get sequence entries from FASTA/Q file
sub get_fastx_entry {
my $line = shift;
# possible multi-line seq in FASTA
if ($File_Type =~ /fasta/i) {
my ($seq, $header);
if ($. == 1) { # first line of file
die "\n### Fatal error:\nNot a FASTA input file, first line of file should be a FASTA ID/header line and start with a '>':\n$line\n" if ($line !~ /^>/);
$header = $line;
} elsif ($Next_Fasta_ID) {
$header = $Next_Fasta_ID;
$seq = $line;
}
while (<$Input_Fh>) {
chomp;
$Next_Fasta_ID = $_ if (/^>/); # store ID/header for next seq entry
$header =~ s/^>//; # remove '>' for WHOLE string regex match in MAIN
return [$header, $seq] if (/^>/); # return anonymous array with current header and seq
$seq .= $_; # concatenate multi-line seq
}
$header =~ s/^>//; # see above
return [$header, $seq] if eof;
# FASTQ: FOUR lines for each FASTQ read (seq-ID, sequence, qual-ID [optional], qual)
} elsif ($File_Type =~ /fastq/i) {
my @fastq_read;
# read name/ID, line 1
my $seq_id = $line;
die "\n### Fatal error:\nThis read doesn't have a sequence identifier/read name according to FASTQ specs, it should begin with a '\@':\n$seq_id\n" if ($seq_id !~ /^@.+/);
$seq_id =~ s/^@//; # remove '@' to make comparable to $qual_id and for WHOLE string regex match in MAIN
push(@fastq_read, $seq_id);
# sequence, line 2
chomp (my $seq = <$Input_Fh>);
die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /\s+/);
die "\n### Fatal error:\nRead '$seq_id' has a IUPAC degenerate base (except for 'N') or non-nucleotide character in its sequence, which is not allowed according to FASTQ specs:\n$seq\n" if ($seq =~ /[^acgtun]/i);
push(@fastq_read, $seq);
# optional quality ID, line 3
chomp (my $qual_id = <$Input_Fh>);
die "\n### Fatal error:\nThe optional sequence identifier/read name for the quality line of read '$seq_id' is not according to FASTQ specs, it should begin with a '+':\n$qual_id\n" if ($qual_id !~ /^\+/);
push(@fastq_read, $qual_id);
$qual_id =~ s/^\+//; # if optional ID is present check if equal to $seq_id in line 1
die "\n### Fatal error:\nThe sequence identifier/read name of read '$seq_id' doesn't fit to the optional ID in the quality line:\n$qual_id\n" if ($qual_id && $qual_id ne $seq_id);
# quality, line 4
chomp (my $qual = <$Input_Fh>);
die "\n### Fatal error:\nRead '$seq_id' has a whitespace in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /\s+/);
die "\n### Fatal error:\nRead '$seq_id' has a non-ASCII character in its quality values, which is not allowed according to FASTQ specs:\n$qual\n" if ($qual =~ /[^[:ascii]]/);
die "\n### Fatal error:\nThe quality line of read '$seq_id' doesn't have the same number of symbols as letters in the sequence:\n$seq\n$qual\n" if (length $qual != length $seq);
push(@fastq_read, $qual);
return \@fastq_read; # return array-ref
}
return 0;
}
### Determine file type via file extension (FASTA or FASTQ)
sub get_file_type {
if ($Seq_File =~ /.+\.(fa|fas|fasta|ffn|fna|frn|fsa)$/) { # use "|fsa)(\.gz)*$" if unzip inside script
$File_Type = 'fasta';
} elsif ($Seq_File =~ /.+\.(fastq|fq)$/) {
$File_Type = 'fastq';
}
die "\n### Fatal error:\nFile type could not be automatically detected. Sure this is a FASTA/Q file? If yes, you can force the file type by setting option '-f' to either 'fasta' or 'fastq'!\n" if (!$File_Type);
print STDERR "Detected file type: $File_Type\n";
return 1;
}
### Identify and print IDs with no hit in order list or seq file
sub missing_IDs {
my ($hash_ref, $error_file, $mode) = @_;
my @missed = grep ($hash_ref->{$_} == 1, keys %$hash_ref); # set to 2 if hit, 1 if "only" present
if (@missed) {
file_exist($error_file) if ($Opt_Error_Files); # subroutine
open (my $error_fh, ">>", $error_file) if ($Opt_Error_Files);
print STDERR "\n### Warning:\nSome $mode IDs were not found in '";
if ($mode eq 'order') {
print STDERR "$Seq_File";
} elsif ($mode eq 'sequence') {
print STDERR "$Order_List";
}
print STDERR "', listed ";
print STDERR "below:\n" if (!$Opt_Error_Files);
print STDERR "in error file '$error_file'!\n" if ($Opt_Error_Files);
foreach (sort @missed) {
if (!$Opt_Error_Files) {
print STDERR "$_\t"; # separated by tab
} elsif ($Opt_Error_Files) {
print $error_fh "$_\n"; # separated by newline
}
}
print STDERR "\n" if (!$Opt_Error_Files); # final newline for STDERR print
close $error_fh if ($Opt_Error_Files);
}
return 1;
}