-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_id_maps.pl
75 lines (61 loc) · 1.8 KB
/
make_id_maps.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/perl -w
use strict;
# Refuse to run when either output file is already present, so that the
# results of an earlier run are never overwritten.
for my $output ('genome_id.map', 'protein_genome_lengths.txt') {
    if (-e $output) {
        die "$output exists, not overwriting";
    }
}

# The three input files are taken positionally from the command line:
# the id map, the protein/genome/length table, and the genome name table.
my ($idmap, $real, $gennamef) = @ARGV;
if (!$idmap || !$real || !$gennamef) {
    die "Usage: $0 <id.map> <tuple of real id, genome id, length> <tuple of genomeid, real name, abbreviated name for tree>";
}
# Load the id map into %id. Each line is tab separated: the first column is
# the key that is later looked up by protein id, the second column is the
# value written to the output in its place.
my %id;
# Three-argument open with an explicit read mode and a lexical handle: the
# file name comes straight from @ARGV, and with the two-argument form a name
# starting with ">" or ending with "|" would be treated as a mode or command.
open(my $idmap_fh, '<', $idmap) or die "Can't open $idmap: $!";
while (my $line = <$idmap_fh>) {
    chomp $line;
    my ($from, $to) = split /\t/, $line;
    # Blank or single-column lines carry no mapping. Skipping them avoids
    # "uninitialized value" warnings and a junk empty-string key; a protein
    # that only appears on such a line is still reported as having no id.
    next unless defined $from && defined $to;
    $id{$from} = $to;
}
close $idmap_fh;
# Load the genome display names into %genomename. Each line is tab separated:
# genome id, full name, abbreviated name. The abbreviation is used as the
# name; when it is missing the genome id itself is used and a warning is
# printed to STDERR.
my %genomename;
# Three-argument open with a lexical handle so the file name from @ARGV can
# never be interpreted as a mode or a piped command.
open(my $names_fh, '<', $gennamef) or die "can't open $gennamef: $!";
while (my $line = <$names_fh>) {
    chomp $line;
    # The full name (second column) is not used by this script.
    my ($id, undef, $abbr) = split /\t/, $line;
    # A blank line yields no genome id at all; skip it rather than storing
    # a name under the empty key.
    next unless defined $id;

    my $used;
    if (defined $abbr) {
        $used = $abbr;
    }
    else {
        print STDERR "Warning. No abbreviation for $id, so using $id as the name!\n";
        $used = $id;
    }
    $genomename{$id} = $used;

    # If the id ends in a ".<digits>" version suffix, also register the same
    # name under the unversioned id so either form can be looked up.
    if ($id =~ s/\.\d+$//) {
        print STDERR "ADding $id\n";
        $genomename{$id} = $used;
    }
}
close $names_fh;
# Translate the protein table into protein_genome_lengths.txt.
# Input lines are tab separated: protein id, genome id, length.
# Output lines are tab separated: genome number, mapped protein id, length.
# Genome numbers are assigned 1, 2, 3, ... in order of first appearance and
# are remembered in %genome for the genome_id.map step that follows.
open(my $real_fh, '<', $real) or die "Can't open $real: $!";
open(my $lengths_fh, '>', 'protein_genome_lengths.txt')
    or die "Can't write to protein_genome_lengths.txt: $!";
my %genome;
my $g = 0;
while (my $line = <$real_fh>) {
    chomp $line;
    # An empty line (typically a trailing one) holds no record; skip it
    # instead of aborting with an empty "no id for" message.
    next if $line eq '';
    my ($prot, $gen, $len) = split /\t/, $line;
    unless (defined $genome{$gen}) {
        $genome{$gen} = ++$g;
    }
    # Test for definedness, not truth: a mapped id of "0" is a valid id and
    # must not be reported as missing.
    unless (defined $id{$prot}) {
        die "no id for $prot";
    }
    print {$lengths_fh} join("\t", $genome{$gen}, $id{$prot}, $len), "\n";
}
close $real_fh;
# Buffered write errors (e.g. a full disk) only surface at close time.
close($lengths_fh) or die "Can't close protein_genome_lengths.txt: $!";
# Write genome_id.map: one "genome number <tab> genome name" line per genome
# seen in the protein table, ordered case-insensitively by name.
open(my $genmap_fh, '>', 'genome_id.map') or die "Can't open genome_id.map: $!";

# Genomes without a name sort as the empty string (i.e. first), exactly as
# before, but without triggering "uninitialized value" warnings in the
# comparator. Ties on the name are broken by genome number so the output
# order does not depend on Perl's randomized hash ordering.
my @ordered = sort {
    my $name_a = defined $genomename{$a} ? uc($genomename{$a}) : '';
    my $name_b = defined $genomename{$b} ? uc($genomename{$b}) : '';
    $name_a cmp $name_b or $genome{$a} <=> $genome{$b};
} keys %genome;

foreach my $gen (@ordered) {
    # Every key of %genome has a number, so only the name can be missing.
    unless (defined $genomename{$gen}) {
        print STDERR "No genome name for $gen ($genome{$gen})\n";
        $genomename{$gen} = "XXX";
    }
    print {$genmap_fh} $genome{$gen}, "\t", $genomename{$gen}, "\n";
}
# Buffered write errors only surface at close time, so check it.
close($genmap_fh) or die "Can't close genome_id.map: $!";