-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrnaml2off.pl
executable file
·629 lines (539 loc) · 20.5 KB
/
rnaml2off.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
#!/usr/bin/env perl
#===============================================================================
#
# FILE: rnaml2off.pl
#
# USAGE: ./rnaml2off.pl
#
# DESCRIPTION:
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Christoph Kaempf (CK), kaempf@bioinf.uni-leipzig.de
# ORGANIZATION:
# VERSION: 1.0
# CREATED: 05.08.2012 15:28:44
# REVISION: ---
#===============================================================================
## Loading modules and initializing variables ##
use strict;
use warnings;
use XML::Simple;
use Data::Dumper;
use Getopt::Long;
use File::Basename;
use File::Find;
use File::Spec qw(rel2abs);
use Log::Log4perl qw(get_logger :levels);
use Pod::Usage;
use Path::Class;
my $module_dir = dirname(__FILE__);
push(@INC, $module_dir);
## Configure Getopt::Long ##
Getopt::Long::Configure ("bundling");
my $dumper = 0;
my $nota = 'LW';
my $help = '';
my $man = 0;
my $verbose = 0;
my @files = ();
my @directories = ();
GetOptions(
"dumper" => \$dumper,
"file|f=s" => \@files,
"directory|d=s" => \@directories,
"notation|n:s" => \$nota,
"verbose|v+" => \$verbose,
"help|h" => \$help,
"man|m" => \$man) or pod2usage(-verbose => 1) && exit;
pod2usage(-verbose => 1) && exit if ( $help );
pod2usage(-verbose => 2) && exit if ( $man );
pod2usage(-verbose => 1) && exit if ( scalar(@files) == 0 && scalar(@directories) == 0 );
# allow coma seperated values as input
@files = split(/,/,join(',',@files));
@directories = split(/,/,join(',',@directories));
###############################################################################
#
# Logger initiation
#
###############################################################################
my $log4perl_conf = file(dirname(__FILE__), "RNAprobing.log.conf");
# Apply configuration to the logger
Log::Log4perl->init("$log4perl_conf");
# Get the logger
my $logger_name = "RNAprobing";
my $logger = &configureLogger($verbose, $logger_name);
$logger->info("++++ ".__FILE__." has been started. ++++");
## Lookup @files and @directories for rna1.xml files and insert the found in @rnaml_files
## - find all rna1.xml files in the @directories given and add them to @files
if ( scalar(@directories ) != 0 ){
my @checked_directories;
foreach (@directories) {
if ( -d $_ ) {
$logger->info("$_ is a directory");
push( @checked_directories, $_ );
} else {
$logger->info("$_ isn't a directory");
}
}
$logger->error("No valid directory given.") && exit 0
if ( scalar(@checked_directories) == 0 && scalar(@files) == 0 );
$logger->info("Looking for rna1.xml files in directories:\n".
join("\n", @checked_directories));
find(\&wanted, @checked_directories);
}
## - check found files and add them to @rnaml_files if they passed the checks
my $rnaml_files = &checkFiles(@files);
## Check for correct notation selection ##
my $notation = "";
if ($nota eq "LW") {
$logger->info("Leontis-Westhof notation selected.");
$notation = "LW";
} elsif ($nota eq "BP" ) {
$logger->info("Base pair notation selected.");
$notation = "BP";
} elsif ($nota eq "LW+" ) {
$logger->info("Leontis-Westhof notation with nucleotide/amino acid interactions selected.");
$notation = "LW+";
} elsif ($nota eq "LG" ) {
$logger->info("Lee-Guttel notation selected.");
$notation = "LG";
} else {
$logger->info("Notation set to Leontis-Westhof.");
$notation = "LW";
}
###############################################################
##
## Application section
##
###############################################################
# create XML::Simple object
my $xml = new XML::Simple;
# parse XML files
foreach my $rnaml_file ( @{$rnaml_files} ){
## $data - is the XML::Simple object that holds the representation of the RNAML file
my $data = $xml->XMLin($rnaml_file, ForceArray => 1);
my($filename, $directories, $suffix) = fileparse($rnaml_file);
## $off_files_list name of the file that holds a list of all .off files in a certain directory
my $off_files_list = $directories ."off_files.list";
## $fasta_files_list name of the file that holds a list of all .fasta files in a certain directory
my $fasta_files_list = $directories ."fasta_files.list";
foreach my $key ( keys %{$data->{'molecule'}}){
## $off_file = file name for the output file (off = our file format)
$filename =~ s/-rna1\.xml$//g;
my $ndb_id = $filename;
$logger->debug("NDB ID of $rnaml_file is $ndb_id.");
$logger->debug("Key of actual molecule is $key.");
my $fasta_file = $directories . $ndb_id .".". $key .".fa";
my $off_file = $directories . $ndb_id .".". $key .".off";
## try to open the file $off_file
open(my $off_fh, ">", $off_file) or die("Can't open $off_file.");
$logger->debug("Create output file $off_file.");
## append $off_file to $off_files_list
open( my $off_list_fh ,">>", $off_files_list) or die("Can't open $off_files_list.");
print $off_list_fh $off_file ."\n";
close($off_list_fh);
## try to open the file $fasta_file
open(my $fasta_out_fh, ">", $fasta_file) or die("Can't open $fasta_file.");
$logger->debug("Create output file $fasta_file.");
## append $fasta_file to $fasta_files_list
open( my $fasta_list_fh ,">>", $fasta_files_list) or die("Can't open $fasta_files_list.");
print $fasta_list_fh $fasta_file ."\n";
close($fasta_list_fh);
## $ndb_id - holds the id of the molecule
print $off_fh "> $ndb_id.$key\n" ;
print $fasta_out_fh "> $ndb_id.$key\n";
$logger->debug("Header: > $ndb_id");
## $sequence - holds the RNA sequence
my $sequence = join("", @{$data->{'molecule'}->{$key}->{'sequence'}[0]->{'seq-data'}});
$sequence = &trimString($sequence);
print $off_fh "$sequence\n";
print $fasta_out_fh "$sequence\n";
$logger->debug("Sequence: $sequence");
## @dotBracket - holds all Watson-Crick base pairs in Dot-Bracket form
my @dotBracket;
# fill @dotBracket with length($sequence) dots
@dotBracket = split('', '.' x length($sequence) );
$logger->debug(@dotBracket);
## $basePairs - is an array reference on an array which contains all information about all base pairs
my $basePairs = $data->{'molecule'}->{$key}->{'structure'}[0]->{'model'}->{'?'}->{'str-annotation'}->[0]->{'base-pair'};
if ($dumper) {
$logger->level($DEBUG);
$logger->debug(Dumper($basePairs));
}
## %bp - is a hash holding every base-base contact as array under a unique hash-key
my %bp = ();
## every 'base pair' is inserted into %bp
## and in case of standard Watson-Crick base pairs the @dotBracket array is extended
if ($notation eq "LW"){
my @colSize;
my %fivePedge_bpLW = ();
my @bracket_stack = (0);
foreach (@{$basePairs}){
# read in all base pairs and store them in %bp
my @bpLW = &LWnotation($_);
my $id = &makeId(\@bpLW);
$bp{$id} = \@bpLW;
}
my @bpStart5p = sort keyCmp keys(%bp);
my @old_bpLW = undef;
foreach my $key (@bpStart5p){
my @bpLW = @{ $bp{$key} };
@old_bpLW = @bpLW unless ( @old_bpLW );
## insert opening and closing bracket for every standard
## Watson-Crick base pair
@dotBracket = &insertBrackets(\@bpLW, \@old_bpLW, \@dotBracket, \@bracket_stack, $filename, $sequence );
@colSize = &colSizeUpdate($bpLW[0], $bpLW[1], $bpLW[2], \@colSize);
@old_bpLW = @bpLW;
}
## $dbString - is the output string of the Dot-Bracket notation
my $dbLine = join("", @dotBracket);
print $off_fh "$dbLine\n";
$logger->debug($dbLine);
## $colSizeLine - assemble the line that holds the column sizes
if ( scalar(@colSize) > 0 ) {
my $colSizeLine = "# ". length($notation) .";".join(";", @colSize);
print $off_fh $colSizeLine."\n" ;
$logger->debug($colSizeLine);
}
# sort the keys with a specific subroutine
foreach my $key (@bpStart5p){
my $line = "# ".$notation;
$line .= &makeColString($bp{$key}, \@colSize);
print $off_fh "$line\n";
$logger->debug("$line");
}
}
if ($notation eq "BP"){
}
if ($notation eq "LW+"){
}
if ($notation eq "LG"){
}
# clear %bp or the next
%bp = ();
close($off_fh);
close($fasta_out_fh);
}
}
###############################################################
##
## Subroutine section
##
###############################################################
###############################################################
##
## &checkFiles(@filesToBeChecked)
## - Performs file checks and returns an array with all succesfully checked files
## - @filesToBeChecked = array of files to be checked
##
###############################################################
sub checkFiles {
my @testfiles = @_;
my @checkedfiles = ();
my $logger = get_logger("RNAprobing");
foreach (@testfiles) {
$logger->info("Perform file checks for file $_");
if ( -f $_){
$logger->info("$_ is a plain file");
if ( -r $_) {
my $abs_path = File::Spec->rel2abs( $_ );
$logger->info("$_ can be accessed");
push(@checkedfiles, $abs_path );
$logger->debug("Absolute path: $abs_path");
} else {
$logger->error("Can not access $_");
}
} else {
$logger->error("$_ is not a plain file");
$logger->error("$_ is a directory") if ( -d $_);
}
}
return \@checkedfiles;
}
###############################################################################
##
## Invocation - \&wanted
## - Subroutine used by File::Find
## - Superugly this usage
##
###############################################################################
sub wanted {
my $logger = get_logger("RNAprobing");
if ( $_ =~ /rna1\.xml$/ ) {
$logger->info("$_ is a rna1.xml file");
# @files is a global variable
push(@files, $File::Find::name);
} else {
$logger->info("$_ is not a rna1.xml file");
}
}
###############################################################
#
# Invocation - &configureLogger($verbosityLevel)
# - Configures and initialzes the Logger
# - $verbosityLevel = scalar value that sets log level
# -- 0 => $WARN
# -- 1 => $INFO
# -- 2 => $DEBUG
#
###############################################################
sub configureLogger{
## Configure the logger ##
my $verbose = shift;
my $logger = get_logger("RNAprobing");
$logger->info("Verbosity level: $verbose");
SELECT:{
if ($verbose == 0){ $logger->level($WARN) ; $logger->debug("Log level is WARN") ; last SELECT; }
if ($verbose == 1){ $logger->level($INFO) ; $logger->debug("Log level is INFO") ; last SELECT; }
if ($verbose >= 2){ $logger->level($DEBUG); $logger->debug("Log level is DEBUG") ; last SELECT; }
else {$logger->level($ERROR); $logger->debug("Log level is ERROR") ; last SELECT; }
}
return $logger;
}
###############################################################
#
# Invocation - &trimString($string)
# remove all white spaces and newlines from $string
#
###############################################################
sub trimString{
my $string = $_[0];
$string =~s/[\n,\s]//g;
return $string;
};
###############################################################
#
# Invocation - &clearHash()
# guess what? Never used but nice-to-have
#
###############################################################
sub clearHash{
my %hash = %{$_[0]};
for (keys %hash){
delete $hash{$_};
}
return %hash;
};
###############################################################
#
# Invocation - &insertBrackets($pos5p, $pos3p, $edge5p, $edge3p, \@dotBracket, \@bracket_stack)
# inserts opening and closing Bracket in Dot-Bracket string
#
###############################################################
sub insertBrackets{
my ($pos5P, $pos3P, $lw_not, $edge5P, $edge3P) = @{$_[0]};
# my $pos3P = $_[1];
# my $edge5P = $_[2];
# my $edge3P = $_[3];
my ($old_pos5P, $old_pos3P, $old_lw_not, $old_edge5P, $old_edge3P) = @{$_[1]};
my @dotBracket = @{$_[2]};
my $dbn = join("", @dotBracket);
my $bracket_stack = $_[3];
my $filename = $_[4];
my @sequence = split("", $_[5]);
my $bracket_type = 0;
my %opening_brackets = ( 0 => '(', 1 => '[', 2 => '{', 3 => '<' );
my %closing_brackets = ( 0 => ')', 1 => ']', 2 => '}', 3 => '>' );
$logger->debug("$pos5P $pos3P $edge5P $edge3P");
$logger->debug("$dotBracket[$pos5P-1] $dotBracket[$pos3P-1]");
if ($edge5P eq 'W' && $edge3P eq 'W' ||
$edge5P eq 'E' && $edge3P eq 'E' && $sequence[$pos5P-1] eq 'U' && $sequence[$pos3P-1] eq 'G' ||
$edge5P eq 'E' && $edge3P eq 'E' && $sequence[$pos5P-1] eq 'G' && $sequence[$pos3P-1] eq 'U') {
for (my $i = 0; $i < scalar(@{$bracket_stack}); $i++ ) {
$logger->debug("Bracket type is $i");
$logger->debug("Bracket stack contains ".scalar(@{$bracket_stack})." element(s).");
if ( $pos5P < ${$bracket_stack}[$i] && $pos3P < ${$bracket_stack}[$i] || $pos5P > ${$bracket_stack}[$i] && $pos3P > ${$bracket_stack}[$i] ) {
${$bracket_stack}[$i] = $pos3P;
$bracket_type = $i;
$logger->debug(${$bracket_stack}[$i]);
last;
} elsif ( $pos5P < ${$bracket_stack}[$i] && $pos3P > ${$bracket_stack}[$i] && $i + 1 == scalar(@{$bracket_stack}) ) {
push(@{$bracket_stack}, $pos3P);
$logger->info("Detected pseudoknot.");
$bracket_type = $i + 1;
last;
} elsif ($pos5P < ${$bracket_stack}[$i] && $pos3P > ${$bracket_stack}[$i]) {
$logger->info("Detected pseudoknot.");
$bracket_type = $i + 1;
} else {
$logger->error($filename.": Encountered a problem with the ".
"pseudoknot detection!");
$logger->error("$filename : edge $sequence[$pos5P-1]".
"$pos5P-$sequence[$pos3P-1]$pos3P probably ".
"collides with edge $sequence[$old_pos5P-1]".
"$old_pos5P-$sequence[$old_pos3P-1]$old_pos3P");
}
}
if ( $bracket_type > 3 ){
$logger->error($filename.": More than four entangled pseudoknots! HELP!");
} else {
$dotBracket[$pos5P-1] = $opening_brackets{$bracket_type};
$dotBracket[$pos3P-1] = $closing_brackets{$bracket_type};
}
}
$dbn = join("", @dotBracket);
$logger->debug($dbn);
return @dotBracket;
};
###############################################################
#
# Invocation - internally done via sort()
# sort routine for the hash keys used within %bp
#
###############################################################
sub keyCmp{
my @a = split(" ", $a);
my @b = split(" ", $b);
if ($a[0] < $b[0]){
return -1;
}
if ($a[0] > $b[0]){
return 1;
}
else {
if ($a[1] < $b[1]){
return -1;
}
if ($a[1] > $b[1]){
return 1;
}
else {
return $a[3] cmp $b[3];
}
}
};
###############################################################
#
# Invocation - &LWnotation($_)
#
# Returns an array with five elements describing a base pair:
# 0: sequence position of the 5' base
# 1: sequence position of the 3' base
# 2: three letter string describing bond orientation,
# 5' edge and 3' edge
# 3: edge type of the 5' base
# 4: edge type of the 3' base
#
###############################################################
sub LWnotation{
my $pos3p = $_->{'base-id-3p'}[0]->{'base-id'}[0]->{'position'}[0];
my $pos5p = $_->{'base-id-5p'}[0]->{'base-id'}[0]->{'position'}[0];
my $edge3p = uc(&ndb2lw($_->{'edge-3p'}[0]));
my $edge5p = uc(&ndb2lw($_->{'edge-5p'}[0]));
my @bpLW = ();
$bpLW[0] = $pos5p;
$bpLW[1] = $pos3p;
$bpLW[2] = $_->{'bond-orientation'}[0].$edge5p.$edge3p;
$bpLW[3] = $edge5p;
$bpLW[4] = $edge3p;
return @bpLW;
}
###############################################################
#
# Invocation - &makeId($_)
# - ID is something like "1 10 cWW"
###############################################################
sub makeId{
my @bpLW = @{$_[0]};
my $id;
foreach my $i (@bpLW){
$id .= $i." ";
}
chop($id);
$logger->debug("ID: $id");
return $id;
}
###############################################################
#
# Invocation - &colSizeUpdate(\$bpLW[0], $bpLW[1], $bpLW[2], \@colSize)
#
###############################################################
sub colSizeUpdate{
my $pos5p = shift;
my $pos3p = shift;
my $edge_notation = shift;
my @bpLW = ($pos5p, $pos3p, $edge_notation);
my $colSize = shift ;
for (my $i = 0; $i < scalar(@bpLW); $i++){
if ( ! defined $colSize->[$i] ){
$colSize->[$i] = 0;
}
if ( $colSize->[$i] < length($bpLW[$i]) + 2 ){
# the addition of 2 results in a more readable output
$colSize->[$i] = length($bpLW[$i]) + 2;
}
}
return @{$colSize};
}
###############################################################
#
# Invocation - &makeColString(\@{bp{$key}}, \@colSize, $i)
#
###############################################################
sub makeColString{
my $bpLW = shift;
my $colSize = shift;
my $logger = get_logger("RNAprobing");
my $string;
for (my $i = 0; $i < scalar(@{$colSize}); $i++) {
my $diff = $colSize->[$i] - length($bpLW->[$i]);
$string .= " " x $diff;
$string .= $bpLW->[$i];
}
$logger->debug("This is edge: $string");
return $string;
}
###############################################################
#
# Transformation from RNAML to $notation
# --------------------------------------
#
# Invocation - &ndb2lw($char)
# $char should contain exactly one character
# transform RNAML non-standard into standard Leontis-Westhof Notation
#
###############################################################
sub ndb2lw{
SELECT:{
# W = Watson-Crick Edge
if ($_[0] eq '+' || $_[0] eq '-' ){ return "W"; last SELECT; }
# E = Watson-Crick Edge involved in non-standard base pair
if ($_[0] eq 'W') { return "E"; last SELECT; }
# N = Non-identified Edge
if ($_[0] eq '.' || $_[0] eq '?' || $_[0] eq 'X' ){ return "N"; last SELECT; }
# M = Triplets and higher multiplets
# if ($_[0] eq '+' ){ return "M"; last SELECT; }
# T = Tertiary interactions
if ($_[0] eq '!' ){ return "T"; last SELECT; }
else { return $_[0] }
}
};
__END__
=head1 SYNOPSIS
rnaml2off.pl converts a bunch of rna1.xml files into .off and .fa files.
=head1 DESCRIPTION
Converts rna1.xml files into .off files. The .off files look like normal dot-bracket notation files, but also contain information about Leontis-Westhof style interactions and tertiary interactions between nucleotides.
Switches that don't define a value can be done in long or short form.
eg:
rnaml2off.pl --man
rnaml2off.pl -m
This perl script relies on the Log::Log4perl, Path::Class and XML::Simple modules. Please make sure the according packages are around on your system.
=head1 OPTIONS
=over 8
=item B<-h, --help>
Display help message
=item B< -m, --man>
Display man page
=item B<--dumper>
Switch for debugging purpose. If set all found base pair informations are logged.
=item B<-f, --file>
RNAML file that is transformed into OFF file. This flag can be given multiple times.
=item B<-d, --directory>
Directory recursively searched for RNAML files ending on rna1.xml.
=item B<-n, --notation>
=item B<-v, --verbose>
Increases the verbosity level. Can be used multiple times (highest level if used 3 or more times)
=cut