-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentropak.pl
executable file
·417 lines (354 loc) · 12.6 KB
/
entropak.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
#!/usr/bin/perl
# Peter Lazorchak
# lazorchakp@gmail.com
# 6/13/2017
=pod
DESCRIPTION
This script generates CLUMP files and runs them.
DEPENDENCIES
Unless specified, clump_gen.pl will look for the following in $PATH.
Required programs:
estpost_split
CLUMPAK.pl
note that this has many of its own dependencies - consult online manual
one particularly frustrating one is the GD module. To install this,
you first must install libgd (gdlib-config must be in your $PATH)
BestKByEvanno.pl
Required if using input files option b (below):
popfile_gen.pl
Required modules:
hdf5
gsl
CLUMPAK.pl and BestKByEvanno.pl rely on modules in the clumpak distribution.
Moving these scripts out of the clumpak directory is not recommended. Also,
all clumpak scripts must be run out of this directory, so this script cd's into
the specified clumpak directory immediately before executing.
FILES
Required input files:
Entropy output - *_k2_1.hdf5, *_k2_2.hdf5, *_k3_1.hdf5, *_k3_2.hdf5 etc
ONE of the following (a OR b, recommended = a):
a) Populations file - single column, each line with the population
corresponding to an individual. These must be in the same order
as the original vcf file's individuals.
{
<pop_for_ind0>
<pop_for_ind1>
...
}
b) Individual names file
{
<ind0>
<ind1>
...
}
AND
Grant population map file - for flies from Grant
{
MID<ind0> Grant<App|Haw>
MID<ind1> Grant<App|Haw>
...
}
note that option b relies on very specific regular expressions to
determine populations based on individuals' names. Option a is thus
recommended, but b is more convenient if you know popfile_gen.pl works.
Optional input files:
Colors file - list of colors to be used by distruct (see distruct manual)
drawparams file - visual specifications for distruct (see distruct manual)
The default is contained within the distruct directory in clumpak.
It may be simpler to modify this file than to specify a new one. Note
when using this file that clumpak makes many modifications and ignores
certain options (such as K). For INDIVWIDTH, use a negative value to
allow clumpak to calculate page-fitting width.
labels file - list of population order for the final distruct graphs.
Excess populations in this file are permitted, and extras will be
ignored. Thus, for a group of datasets (ex. Clines2017), only one
labels file is needed. This file is NOT THE SAME as the labels_file
from clumpak (or INFILE_LABEL_BELOW from distruct). It is used to
generate this file.
{
<pop0>
<pop1>
...
}
=cut
use warnings;
use strict;
use Getopt::Long qw(:config no_ignore_case);
use File::Basename;
use File::Path qw(make_path);
use File::Spec;
&author;
my $indir;
my $outdir;
my $popfile;
my $namefile;
my $grantfile;
my $mink=2;
my $maxk=3;
my $burnin=0;
my $colorsfile;
my $drawparams;
my $labelsfile;
my $estpost = 'estpost_split';
my $clumpak = "$ENV{HOME}/bin/clumpak";
my $popfile_gen = 'popfile_gen.pl';
GetOptions(
'i=s' => \$indir,
'o=s' => \$outdir,
'p=s' => \$popfile,
'n=s' => \$namefile,
'g=s' => \$grantfile,
'k=i' => \$mink,
'K=i' => \$maxk,
'b=i' => \$burnin,
'c=s' => \$colorsfile,
'd=s' => \$drawparams,
'l=s' => \$labelsfile,
'e=s' => \$estpost,
'u=s' => \$clumpak,
'f=s' => \$popfile_gen,
'h' => \&usage,
);
&usage unless (defined $indir && defined $outdir);
&usage unless (defined $popfile || (defined $namefile && defined $grantfile));
my $nks = $maxk - $mink + 1;
# input dir
if (! -e $indir) {
die ("\nCan't access input directory: $indir\n");
}
$indir = File::Spec->rel2abs($indir);
# output dir
if (! -e $outdir) {
eval {make_path($outdir)}
or die ("\nCan't create output directory: $outdir\n");
} else { # check for empty directory
opendir DIR, $outdir or die("\nCan't open output directory: $outdir\n");
my @contents = readdir DIR;
die ("\nOutput directory must be empty\n") if (scalar @contents > 2); # . and ..
closedir DIR;
}
$outdir = File::Spec->rel2abs($outdir);
# clumpak dir
die "Can't find clumpak directory: $clumpak\n" unless (-d $clumpak);
$clumpak = File::Spec->rel2abs($clumpak);
my $prerunDir = $outdir."/prerun";
my $clumpoutDir = $outdir."/clumpak_out";
my $bestKout = $outdir."/bestk_out";
-d $prerunDir or mkdir $prerunDir
or die "Cannot make directory: $prerunDir\n";
-d $clumpoutDir or mkdir $clumpoutDir
or die "Cannot make directory: $clumpoutDir\n";
-d $bestKout or mkdir $bestKout
or die "Cannot make directory: $bestKout\n";
# check for popfile, or namefile and grantfile
if (defined $popfile) {
if (! -e $popfile) {
die "\nCan't find populations file: $popfile\n\n";
}
} elsif (! -e $namefile) {
die "\nCan't find names file: $namefile\n\n";
} elsif (! -e $grantfile) {
die "\nCan't find Grant populations map file: $grantfile\n\n";
}
# set up the popfile if it doesn't exist, and get nind by counting its lines
if (defined $popfile) {
$popfile = File::Spec->rel2abs($popfile);
} else {
$namefile = File::Spec->rel2abs($namefile);
$grantfile = File::Spec->rel2abs($grantfile);
$popfile = "$outdir/populations_file.txt";
0 == system "$popfile_gen -i $namefile -g $grantfile -o $popfile"
or die "Failed to produce populations file\n";
}
my $nind = `wc -l < $popfile`;
chomp $nind;
# get the names of all input files
opendir DIR, $indir or die("\nCan't open input directory: $indir\n");
my @hdf5files = grep(/\.hdf5$/, readdir DIR);
foreach my $h (@hdf5files) {
$h = "$indir/$h";
}
closedir DIR;
decodeHDF5($mink, $maxk, $prerunDir, @hdf5files);
rearrange_output($mink, $maxk, $prerunDir, $nind);
0 == system "zip -r -j $outdir/prerun.zip $prerunDir/*_K*_*.dat" or die "$!\n";
# set up clumpak
my $clumpCommand = "./CLUMPAK.pl --id $$ --inputtype admixture "
."--dir $clumpoutDir --file $outdir/prerun.zip --indtopop $popfile ";
$clumpCommand .= "--colors ".File::Spec->rel2abs($colorsfile)." "
if (defined $colorsfile);
$clumpCommand .= "--drawparams ".File::Spec->rel2abs($drawparams)." "
if (defined $drawparams);
# convert the labels file to the proper format
if (defined $labelsfile) {
my $labelsfile_conv = &convert_labelsfile($outdir, $popfile, $labelsfile);
$clumpCommand .= "--labels ".File::Spec->rel2abs($labelsfile_conv)." ";
}
# cd into clumpak directory before running
# all filenames have absolute paths so this will not cause problems
print "Running CLUMPAK\n";
0 == system "cd $clumpak; $clumpCommand"
or die "Failed to run CLUMPAK ($!)\n";
# execute BestKByEvanno
print "Running BestKByEvanno\n";
0 == system "cd $clumpak; ./BestKByEvanno.pl --id $$ --dir $bestKout "
."--file $outdir/log_prob_file.txt --inputtype lnprobbyk "
or die "Failed to run BestKByEvanno ($!)\n";
# ============================== SUBROUTINES ==================================
# run estpost_split
sub decodeHDF5 {
my $mink=shift;
my $maxk=shift;
my $prerunDir=shift;
my @hdf5files=@_;
# Write a single usable csv for each group of hdf5 files with the same k
foreach my $k ($mink..$maxk) {
my @hdf5k = grep(/\_k$k(\_[0-9]+)*\.hdf5$/, @hdf5files);
my $input = join(' ', sort (@hdf5k));
my $outfile = "$prerunDir/".basename($hdf5k[0]);
$outfile =~ s/(\_[0-9]+)*\.hdf5$/\.q_postest_split.csv/g;
$outfile =~ s/\.q_postest/\.b$burnin\.q_postest/g if ($burnin > 0);
# generate intermediate files. Ideally estpost would output the correct
# format (TODO future project), but for now we output them, then modify
# Additionally, create the log probabilities file.
system("$estpost -b $burnin -o $outfile $input "
."| awk '{print \"$k\t\"\$1}' >> $prerunDir/../log_prob_file.txt");
}
}
=pod
Reshapes the data into multiple output files.
Originally, (after the header), the data is formatted as:
FILENAME: <info>_kP<moreinfo>_split.csv
q_ind_0_pop_0,mean1,mean2...meanM
...
q_ind_N-1_pop_0,mean1,mean2...meanM
q_ind_0_pop_1,mean1,mean2...meanM
...
q_ind_N-1_pop_P-1,mean1,mean2...meanM
for M reps, N individuals, and P populations (k = P)
The new format is (for example):
FILENAME: <info>_K3_2.dat
ind0_pop0_mean2 ind0_pop1_mean2 ind0_pop2_mean2
ind1_pop0_mean2 ind1_pop1_mean2 ind1_pop2_mean2
...
indN-1_pop0_mean2 indN-1_pop1_mean2 indN-1_pop2_mean2
Note that the output files are whitespace delimited
This is the format required by CLUMPAK
=cut
sub rearrange_output {
my $mink = shift;
my $maxk = shift;
my $prerunDir = shift;
my $nind = shift;
opendir(DIR, $prerunDir) or die("Cannot open directory: $prerunDir\n");
my @csvfiles=grep(/_k\d+.*_split.csv$/, readdir(DIR));
foreach my $filename (@csvfiles) {
$filename="$prerunDir/$filename";
}
closedir(DIR);
foreach my $k ($mink..$maxk) {
my ($currfile) = grep(/_k$k/, @csvfiles);
die "$!\n(Did you forget to load the gsl or hdf5 modules?)\n"
if (!defined $currfile);
my $outfile ="$prerunDir/".basename($currfile);
$outfile =~ s/_k$k.+_split.csv$/_K$k/g; #will eventually add _<rep>.dat
# load all means into the memory
open(FILE, "$currfile") or die ("\nCan't open file: $currfile\n");
my $header=<FILE>;
chomp($header);
my ($nreps) = $header =~ /(\d+)$/;
my @means;
my $currind=0;
while (<FILE>) {
chomp;
s/^q_ind_\d+_pop_\d+,//g; # remove individual id
$means[$currind] .= "$_,";
$currind = ++$currind % $nind;
}
close(FILE);
unlink glob ("$currfile");
foreach my $rep (1..$nreps) {
open(OUT, ">$outfile"."_$rep.dat")
or die("\nCan't open file: $outfile"."_$rep.dat\n");
--$rep;
foreach my $ind (0..($nind - 1)) {
foreach my $pop (0..($k - 1)) {
my @indprobs = split(/,/,$means[$ind]);
print OUT "$indprobs[$rep + $nreps * $pop] ";
}
print OUT "\n";
}
close(OUT);
}
}
}
sub convert_labelsfile {
my ($outdir, $popfile, $labelsfile) = @_;
open(POPS, $popfile)
or die "Failed to open pops file during labels file conversion\n";
my %popids;
my $currid = 1;
while (<POPS>) {
chomp;
if (!exists $popids{$_}) {
$popids{$_} = $currid;
++$currid;
}
}
close POPS;
open(LABELS, $labelsfile) or die
"Failed to open original labels file during labels file conversion\n";
my $labelsfile_conv = $outdir."/conv_".basename($labelsfile);
open(CONV, ">$labelsfile_conv") or die
"Failed to open new labels file during labels file conversion\n";
my $linecount = 0;
my $npops = keys %popids;
while (<LABELS>) {
chomp;
if (exists $popids{$_}) {
print CONV "$popids{$_}\t$_\n";
delete $popids{$_};
++$linecount
}
}
if ($linecount < $npops) {
warn "Fatal: Too few valid populations in labels file\n"
."Expected: $npops\tFound $linecount\n";
warn "The following individuals were not found in the labels file:\n";
foreach (keys %popids) {
warn "$_\n";
}
exit 1;
}
close LABELS;
return $labelsfile_conv;
}
sub author {
print "\n"
."#########################################\n"
." ".basename($0)."\n"
." Peter Lazorchak\n"
." lazorchakp\@gmail.com\n"
."#########################################\n\n"
}
sub usage {
print " Usage:\n"
." ".basename($0)."\n"
." -i <input directory (with *_k[0-9]+(\_[0-9]+){0,1}.hdf5 files)>\n"
." -o <output directory>\n"
." -p <populations file>\n"
." -n <names file> required if no populations file\n"
." -g <Grant population map file> required if no populations file\n"
." -k <min. K> [2]\n"
." -K <max. K> [3]\n"
." -b <extra burnin> [0]\n"
." -c <colors file> [none]\n"
." -d <drawparams file> [none]\n"
." -l <labels file> [none]\n"
." -e <path to estpost> [estpost_split]\n"
." -u <path to clumpak directory> [\$HOME/bin/clumpak]\n"
." -f <path to popfile_gen.pl> [popfile_gen.pl]\n"
." -h <show this help>\n"
."\n";
exit;
}