-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathfasta_filter.pl
executable file
·109 lines (61 loc) · 1.84 KB
/
fasta_filter.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/perl
#fasta_filter.pl
#script to filter multi fasta for missing data (N)
#M Supple
#last modified 9 July 2015
#usage fasta_filter.pl <in.fasta> <threshold>
#threshold is the % Ns that causes a site to be removed
#ie if only want to remove sites that are all Ns, the threshold is 100
#requires BioPerl
#http://www.bioperl.org/wiki/HOWTO:Multiple_Sequence_Alignment
#produces a fasta with sites with too much missing data removed
use lib $ENV{PERL5LIB};
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use Bio::AlignIO;
use Bio::SimpleAlign;
#read in command line arguments
my ($infasta,$thresh)=@ARGV;
# Use Bio::AlignIO to read in the alignment
my $in = Bio::AlignIO->new(-file => $infasta, -format => 'fasta');
my $aln = $in->next_aln();
#basic pre stats
my $numsamples=$aln->num_sequences;
print "number of samples is originally $numsamples\n";
my $seqlen=$aln->length;
print "sequence length is originally $seqlen\n";
#count Ns in each column and record column numbers with %Ns >= threshold
my $ncount;
#iterate through each column (bp position)
my $i=1;
while($i<$seqlen)
{
$ncount=0;
#iterate through each entry
foreach my $seq ($aln->each_seq)
{
my $res=$seq->subseq($i,$i);
if ($res eq "N"){$ncount++}
}
if (($ncount/$numsamples) ge ($thresh/100))
{
#remove column and decrease counter
my @tmp=($i-1,$i-1);
my $poss=\@tmp;
$aln=$aln->remove_columns($poss);
$seqlen=$aln->length;
}
else {$i++}
}
#basic stats
$numsamples=$aln->num_sequences;
print "number of samples is now $numsamples\n";
$seqlen=$aln->length;
print "sequence length is now $seqlen\n";
#print to file
my $outfile=$infasta . $thresh;
print "filtered fasta writing to $outfile\n";
my $out = Bio::AlignIO->new(-file => ">$outfile", -format => 'fasta');
$out->write_aln($aln);