-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfillSequences.pl
executable file
·54 lines (46 loc) · 1.37 KB
/
fillSequences.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env perl
# Fills insertions back into the FASTA.
#
# Samuel S. Shepard (vfn4@cdc.gov)
# 2016, Centers for Disease Control & Prevention
use Getopt::Long;
GetOptions( 'fill-triplets|F' => \$fillTriplets );
if ( scalar(@ARGV) != 2 ) {
$message = "Usage:\n\tperl $0 <fasta> <ins.txt> [-F|--fill-triplets]\n";
die( $message . "\n" );
}
%inserts = ();
$/ = "\n";
$insertionTable = $ARGV[1];
open( INS, '<', $insertionTable ) or die("Cannot open $insertionTable for reading.\n");
@lines = <INS>;
chomp(@lines);
foreach $line (@lines) {
( $id, $pos, $insert ) = split( "\t", $line );
$inserts{$id}{$pos} = $insert;
}
close(INS);
# Process records.
$/ = ">";
open( FASTA, '<', $ARGV[0] ) or die("Cannot open $ARGV[0] for reading.\n");
while ( $record = <FASTA> ) {
chomp($record);
@lines = split( /\r\n|\n|\r/, $record );
$id = shift(@lines);
$sequence = lc( join( '', @lines ) );
$length = length($sequence);
if ( $length == 0 ) {
next;
}
$offset = 0;
foreach $pos ( sort { $a <=> $b } keys( %{ $inserts{$id} } ) ) {
$insert = $inserts{$id}{$pos};
if ( $fillTriplets && length($insert) % 3 != 0 ) {
next;
}
substr( $sequence, $pos + $offset, 0 ) = $insert;
$offset += length($insert);
}
print '>', $id, "\n", $sequence, "\n";
}
close(FASTA);