forked from StatGenPRD/STOPGAP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFANTOM5dataParser.pl
84 lines (77 loc) · 2.12 KB
/
FANTOM5dataParser.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env perl
use Modern::Perl;
use autodie;
use File::HomeDir;
use Text::Autoformat;
# correlation threshold
my $corr = 0;
# significance threshold
my $pval = 1e-5;
# directories
my $original = File::HomeDir->my_home . "/Resources/FunctionalGenomics/data/FANTOM5/original/";
my $processed = File::HomeDir->my_home . "/Resources/FunctionalGenomics/data/FANTOM5/processed/";
# clean up
unlink glob $processed . "CRM/*";
# enhancer to gene associations
my %e2g;
# read enhancer -> gene association file
open my $fh, "<", $original . "Enhancers/enhancer_tss_associations.bed";
while (<$fh>) {
chomp;
next if /^track name=/;
next if /^#/;
my @flds = split /\t/;
my @flds2 = split /;/, $flds[3];
my $enhancer = $flds2[0];
my $gene = $flds2[2];
my $r;
if (defined $flds2[3]) {
$r = $flds2[3] =~ s/R://r;
}
my $fdr;
if (defined $flds2[4]) {
$fdr = $flds2[4] =~ s/FDR://r;
}
if (defined $gene && defined $r && defined $fdr && $r > $corr && $fdr < $pval) {
$e2g{$enhancer}{$gene}++;
}
}
close $fh;
# open enhancer directory
opendir my $dh, $original . "Enhancers/";
# get cell-specific and tissue-specific filenames
my @files = grep { /^(CL|UBERON):\d{7}_\w+_expressed_enhancers\.bed$/ } readdir($dh);
closedir $dh;
# process cell/tissue-specific enhancers
for my $infile (@files) {
my $name = $infile =~ s/^(CL|UBERON):\d{7}_(\w+)_expressed_enhancers.bed$/$2/r;
$name = autoformat $name, { case => 'title' }; # this adds newlines apparently
$name =~ s/_|\n//g;
$name .= ".bed";
$infile = $original . "Enhancers/" . $infile;
my $outfile = $processed . "CRM/" . $name;
open my $in, "<", $infile;
open my $out, ">", $outfile;
while (<$in>) {
chomp;
next if /^track name=/;
next if /^#/;
my @flds = split /\t/;
my $chr = $flds[0];
my $start = $flds[1];
my $end = $flds[2];
my $enhancer = $flds[3];
if (exists $e2g{$enhancer}) {
my @genes;
for my $gene (keys %{ $e2g{$enhancer} }) {
push @genes, $gene;
}
print $out $chr . "\t" . $start . "\t" . $end . "\t" . join(",", @genes) . "\n";
}
else {
print $out $chr . "\t" . $start . "\t" . $end . "\t" . "." . "\n";
}
}
close $in;
close $out;
}