-
Notifications
You must be signed in to change notification settings - Fork 53
/
parlamintp-tei2vert-xx.pl
executable file
·76 lines (64 loc) · 2.17 KB
/
parlamintp-tei2vert-xx.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env perl
# A spin-off of parlamintp-tei2vert.pl to produce vertical files with en metadata
#
# Usage: parlamintp-tei2vert-xx.pl <input directory or file> <output directory>
#
# Looks for the root and component *.ana.xml files in the input directory (and
# one level below it), runs every component file through Saxon with the
# $TEI2VERT stylesheet and the $POLISH filter, writes one .vert file per
# component into the output directory and finally sorts the results into
# per-year subdirectories with dirify().
use strict;
use warnings;
use utf8;
use open ':utf8';
use FindBin qw($Bin);
use File::Spec;
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory

# Both arguments are required: without them the paths would silently fall back
# to the current directory and existing *.vert files there would be deleted.
die "Usage: $0 <inputDirectory> <outputDirectory>\n"
    unless @ARGV >= 2;

binmode(STDERR, 'utf8');

# Scratch space lives below the script directory; tempdir() fails if the
# parent directory is missing, so create it on demand.
my $tempdirroot = "$Bin/tmp";
unless (-d $tempdirroot) {
    mkdir($tempdirroot)
        or die "FATAL ERROR: Cannot create $tempdirroot: $!\n";
}
my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1);

my $inDir  = File::Spec->rel2abs(shift);
my $outDir = File::Spec->rel2abs(shift);

# External tools used for the conversion
my $Para     = 'parallel --gnu --halt 2 --jobs 10';
my $Saxon    = "java -jar $Bin/bin/saxon.jar";
my $TEI2VERT = "$Bin/parlamint2xmlvert.xsl";
my $POLISH   = "$Bin/parlamint-xml2vert.pl";

print STDERR "INFO: Converting directory $inDir\n";

# Classify the .ana.xml files: the root file has no "_..." part after the
# corpus identifier, component files do.
my $rootAnaFile  = '';
my @compAnaFiles = ();
$inDir =~ s|[^/]+\.xml$||; # If a specific filename is given, get rid of it
my $corpusFiles = "$inDir/*.ana.xml $inDir/*/*.ana.xml";
foreach my $inFile (glob($corpusFiles)) {
    if ($inFile =~ m|ParlaMint-[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?\.ana\.xml|) {
        $rootAnaFile = $inFile;
    }
    elsif ($inFile =~ m|ParlaMint-[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?_.+\.ana\.xml|) {
        push(@compAnaFiles, $inFile);
    }
}
die "FATAL ERROR: Cannot find root file in $inDir!\n"
    unless $rootAnaFile;
die "FATAL ERROR: Cannot find component files in $inDir!\n"
    unless @compAnaFiles;

# Prepare the output directory and drop vertical files of earlier runs
unless (-e $outDir) {
    mkdir($outDir)
        or die "FATAL ERROR: Cannot create $outDir: $!\n";
}
unlink glob("$outDir/*.vert");

#Store all files to be processed in $fileFile
my $fileFile = "$DIR/files.lst";
open(my $listHandle, '>:utf8', $fileFile)
    or die "FATAL ERROR: Cannot write $fileFile: $!\n";
foreach my $inFile (@compAnaFiles) {
    print $listHandle "$inFile\n";
}
close($listHandle)
    or die "FATAL ERROR: Cannot close $fileFile: $!\n";

# {} is replaced by parallel with the input file, {/.} with its basename
# without the last extension, so X.ana.xml is written to X.ana.vert
my $command = "$Saxon meta=$rootAnaFile out-lang=en " .
              "-xsl:$TEI2VERT {} | $POLISH > $outDir/{/.}.vert";
system("cat $fileFile | $Para '$command'");
# Report a failed run, but still post-process whatever was produced
print STDERR "WARN: Conversion pipeline did not finish cleanly (status $?)\n"
    if $? != 0;

# Strip the ".ana" left over from the input name: X.ana.vert -> X.vert.
# Done with Perl's own rename so that neither shell escaping nor the flavour
# of the system's rename utility matters.
foreach my $vertFile (glob("$outDir/*.vert")) {
    (my $plainName = $vertFile) =~ s/\.ana\.vert$/.vert/;
    next if $plainName eq $vertFile;
    rename($vertFile, $plainName)
        or print STDERR "WARN: Cannot rename $vertFile to $plainName: $!\n";
}

dirify($outDir);
#If a directory has more than $MAX files, store them in year directories
#
# Takes one argument, the directory to tidy up. Every visible entry whose own
# name matches ParlaMint-..._YYYY is moved into the subdirectory YYYY of that
# directory, which is created on demand. Entries that do not match stay where
# they are. A directory that cannot be read is silently left alone.
# The return value is not meaningful.
sub dirify {
    my $MAX = 1; #In ParlaMint II we always put them in year directories
    # NOTE(review): with $MAX = 1 a directory holding a single file is left
    # flat, which does not quite agree with "always" - confirm the intent.
    my $inDir = shift;

    # Read the directory directly instead of glob("$inDir/*"), which would
    # split a path containing whitespace. Hidden entries are skipped, just as
    # the "*" pattern skipped them.
    opendir(my $dirHandle, $inDir) or return;
    my @names = sort grep { !/^\./ } readdir($dirHandle);
    closedir($dirHandle);
    return unless scalar @names > $MAX;

    foreach my $name (@names) {
        # Look for the year in the entry name only: matching the full path
        # would pick up a year-like part of a parent directory name.
        my ($year) = $name =~ m|ParlaMint-.+?_(\d\d\d\d)|;
        next unless defined $year;

        my $newDir = "$inDir/$year";
        unless (-d $newDir) {
            unless (mkdir($newDir)) {
                print STDERR "WARN: Cannot create $newDir: $!\n";
                next;
            }
        }
        rename("$inDir/$name", "$newDir/$name")
            or print STDERR "WARN: Cannot move $inDir/$name to $newDir: $!\n";
    }
    return;
}