-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathfasta2popgen.pl
executable file
·257 lines (215 loc) · 8.62 KB
/
fasta2popgen.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#!/usr/bin/perl
#fasta2popgen.pl
#script to calculate various pop gen metrics per position from aligned fasta entries
#Megan Supple
#last modified 14 June 2013
#usage fasta2popgen.pl [options] <in.fasta> <in.pops>
#options are to specifiy the analysis
#fst (based on Weir 1996, Genetic Data Analysis II)
#3 level hierarchical fst (by population, then pheontype) (based on Weir 1996, Genetic Data Analysis II)
#genotype by phenotype association (two tailed Fisher's exact test)
#measures of selection (heterozygosity, pi, segregating sites, Tajima's D, D*, singletons, SNP type (private vs shared)
#absolute genetic divergence (dxy) (Nei 1987, eq 10.20)
#requires BioPerl, FormatGenos.pm, PopGen.pm, PopStatsHierarchy.pm, PopStatsModified.pm
#produces per position values for each analysis (see PopGen modules for details)
use lib $ENV{PERL5LIB};
use strict;
use warnings;
use FormatGenos;
use PopGen;
use Getopt::Long;
use List::MoreUtils qw(firstidx uniq);
use Data::Dumper;
my $usage = "Usage: fasta2popgen.pl [options] <in.fasta> <in.pops>
options:
<in.fasta> an aligned fasta file
<in.pops> a space delimited text file with information on each sample \n\t\t\t (column 1: population, column 2: phenotype, column 3: name in input fasta)
-fst calculate Fst (samples grouped by pheotype, ignoring population information)
-fst3level calculate 3 level fst (level 1: population, level 2: phenotype), returns fst based on pheontype
-assoc calculate genotype by phenotype association (samples grouped by pheotype, ignoring population information)
-sel calculate various measures used to test for selection (samples grouped by phenotype, ignoring population information)
-dxy calculate absolute genetic divergence
";
my $calc_fst;
my $calc_fst3;
my $calc_gxp;
my $calc_sel;
my $calc_dxy;
GetOptions ( "fst" => \$calc_fst,
"fst3level" => \$calc_fst3,
"assoc" => \$calc_gxp,
"sel" => \$calc_sel,
"dxy" => \$calc_dxy,
);
#read in command line argument
my ($infasta,$inpops)=@ARGV;
die "please provide a fasta file and a sample list file\n$usage" unless (@ARGV == 2);
die "please specify an analysis\n$usage" unless ($calc_fst || $calc_fst3 || $calc_gxp || $calc_sel || $calc_dxy);
########################################################################
###parse input files####################################################
########################################################################
#initialize hash for samples and population information
my %samples = ();
#open and read in input file with sample and population information
open(INPOPS, $inpops)||die "can't open input population file. $!\n";
my @input=<INPOPS>;
close INPOPS;
my $n_samples=@input;
#process each sample's population information
for (my $i=0; $i<@input; $i++)
{
#split information
my @info=split(" ",$input[$i]);
#enter information into the sample hash
$samples{$info[2]}={'pop' => $info[0], 'pheno' => $info[1]};
#print "population= $samples{$info[2]}{'pop'}\n";
#print "phenotype= $samples{$info[2]}{'pheno'}\n";
}
#parse input fasta
my ($seqs_p,$contig,$size)=FormatGenos::fasta2hash($infasta);
#########################################################################
###add sequence info to data structure###################################
#########################################################################
#initialize pointers to population and phenotype groupings
my @pheno_names=(); #array with phenotype names
my @phenos;
my @pheno_header=();
my @pop_names=();
my @pops;
my @pop_header=();
my $n_total=0;
my @pheno_temp;
#process each fasta entry if the sample exists in the list
while (my ($seqid, $seq) = each(%$seqs_p))
{
#if the sample exist in the sample list
if (exists $samples{$seqid})
{
#get population and phenotype information
my $pop=$samples{$seqid}{'pop'};
my $pheno=$samples{$seqid}{'pheno'};
if (!defined $pop){die "sample $seqid does not exist in population input file $inpops\n"}
push(@pheno_temp, $pheno);
#process the entry
print "processing $seqid\n";
my $indiv_p=FormatGenos::seq2geno($seqid,$seq);
#if grouping "population" by pheontype (for fst, association, and selection)
if ($calc_fst || $calc_gxp || $calc_sel || $calc_dxy)
{
#create pheotype group (if it is new) and push into @phenos
if (!grep(/^$pheno$/,@pheno_names))
{
push(@pheno_names, $pheno);
push(@phenos, Bio::PopGen::Population->new(-name => $pheno));
}
#add individual to the phenotypic group by finding the index of the phenotype designation
$phenos[firstidx {$_ eq $pheno} @pheno_names]->add_Individual($$indiv_p);
}
#if grouping first by popualation and then by phenotype (fst3level)
if ($calc_fst3)
{
#create population (if it is new) and push into @pops
if (!grep(/^$pop\_$pheno$/,@pop_names))
{
push(@pop_names, "$pop\_$pheno");
push(@pops, Bio::PopGen::Population->new(-name => "$pop\_$pheno", -description => $pheno, -source => $pop));
}
#add individual to the population group by finding the index of the phenotype designation
$pops[firstidx {$_ eq "$pop\_$pheno"} @pop_names]->add_Individual($$indiv_p);
}
}
#if sample is not in list, ignore it
else {print "sample $seqid is not in the sample list\n"};
}
#make headers
#if grouping "population" by pheontype (for fst, association, and selection)
if ($calc_fst || $calc_gxp || $calc_sel || $calc_dxy)
{
@pheno_header=("####################\n");
#iterate over phenotypes
for (my $i=0; $i<@pheno_names; $i++)
{
#determine sample size for the phenotype and print to header
my $num_inds=$phenos[$i]->get_number_individuals;
push(@pheno_header,"#$pheno_names[$i]:n=$num_inds\n#");
#get a list of samples associated with the phenotype
my @inds=$phenos[$i]->get_Individuals();
foreach(@inds){push(@pheno_header, $_->unique_id);push(@pheno_header,","); }
push(@pheno_header, "\n");
$n_total+=$num_inds;
}
push(@pheno_header,"####################\n");
}
##if grouping by population, then by phenotype (for fst3level)
if ($calc_fst3)
{
my @pheno_uniq=uniq @pheno_temp;
@pop_header=("####################\n");
my @pheno_n=(0) x @pheno_uniq;
my @pheno_samples=();
#iterate over phenotypes
for (my $i=0; $i<@pop_names; $i++)
{
#determine sample size for the population and print to header
my $num_inds=$pops[$i]->get_number_individuals;
#get a list of samples associated with the population
my @inds=$pops[$i]->get_Individuals();
#get phenotype for the population
my @this_pop=split("_",$pop_names[$i]);
#record information by phenotype
for (my $j=0; $j<@pheno_uniq; $j++)
{
if ($this_pop[1] eq $pheno_uniq[$j])
{
#update population size
$pheno_n[$j]+=$num_inds;
#update sample list
foreach(@inds){$pheno_samples[$j].=$_->unique_id; $pheno_samples[$j].=",";}
#foreach(@inds){push(@pheno_samples[$j], $_->unique_id);push(@pop_header,","); }
}
}
}
for (my $i=0; $i<@pheno_uniq; $i++)
{
push(@pop_header,"#$pheno_uniq[$i]:n=$pheno_n[$i]\n#");
push(@pop_header, "$pheno_samples[$i]\n");
$n_total+=$pheno_n[$i];
}
push(@pop_header,"####################\n");
}
#check if all individuals in input file have been entered
#if ($n_total != $n_samples){die "Not all samples in $inpops are found in $infasta";}
###############################################################################
###calculate popgen estimates##################################################
###############################################################################
#calculate Fst
if ($calc_fst)
{
print "calculating Fst\n";
PopGen::Fst(\@pheno_names, \@phenos, \$contig, \$size, \@pheno_header);
}
#calculate 3 level Fst
if ($calc_fst3)
{
print "calculating 3 level Fst\n";
PopGen::Fst_3level(\@pop_names, \@pops, \$contig, \$size, \@pop_header);
}
#calculate genotype by phenotype association
if ($calc_gxp)
{
print "calculating association\n";
PopGen::association(\@pheno_names, \@phenos, \$contig, \$size, \@pheno_header);
}
#calculate selection measures
if ($calc_sel)
{
print "calculating measures of selection\n";
PopGen::selection(\@pheno_names, \@phenos, \$contig, \$size, \@pheno_header);
}
#calculate dxy
if ($calc_dxy)
{
print "calculating absolute genetic divergence\n";
PopGen::dxy(\@pheno_names, \@phenos, \$contig, \$size, \@pheno_header);
}
print "done!\n";