-
Notifications
You must be signed in to change notification settings - Fork 53
/
parlamintp2conllu.pl
executable file
·149 lines (135 loc) · 5.38 KB
/
parlamintp2conllu.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env perl
# Convert ParlaMint .ana files to CoNLL-U and validate them
# Also produces meta-data .tsv files
# Tomaž Erjavec <tomaz.erjavec@ijs.si>
# License: GNU GPL
use warnings;
use utf8;
use open ':utf8';
binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');
binmode(STDERR, ':utf8');
# Print a short usage summary for this script to STDERR.
# Takes no arguments.
sub usage
{
    # The script name must match the real file name (parlamintp2conllu.pl),
    # which is also what the script's own error messages refer to.
    print STDERR ("Usage: parlamintp2conllu.pl <InputDirectory> <OutputDirectory>\n");
    print STDERR (" Converts ParlaMint .ana files in the <InputDirectory> to\n");
    print STDERR (" .conllu and -meta.tsv files in the <OutputDirectory>\n");
    print STDERR (" Also validates the .conllu files against the UD validation script\n");
}
use FindBin qw($Bin);
use File::Spec;
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory
# Both directories are mandatory. Without this check a missing argument makes
# rel2abs() silently fall back to the current directory, which would then be
# used as the output directory and have its .conllu / -meta.tsv files removed.
if (@ARGV < 2) {
    usage();
    exit 1;
}

# Scratch directory for this run; it is created under <script dir>/tmp and
# removed again when the script exits (CLEANUP => 1).
my $tempdirroot = "$Bin/tmp";
unless (-d $tempdirroot) {
    # tempdir() refuses to work in a parent directory that does not exist
    mkdir($tempdirroot)
        or die "FATAL ERROR: Can't create temporary directory root $tempdirroot: $!\n";
}
my $DIR = tempdir(DIR => $tempdirroot, CLEANUP => 1);

# Input and output directories, made absolute so they stay valid for the
# external commands built from them
$inDir = File::Spec->rel2abs(shift);
$outDir = File::Spec->rel2abs(shift);

# External tools and stylesheets used to build the shell commands
$Para = 'parallel --gnu --halt 0 --jobs 10';
$Saxon = "java -jar $Bin/bin/saxon.jar";
$scriptValid = "$Bin/bin/tools/validate.py";
$scriptConvert = "$Bin/parlamint2conllu.xsl";
$scriptMeta = "$Bin/parlamint2meta.xsl";
# Mapping from the country (or region) code of a corpus to its language
# code(s). A corpus with more than one language lists them separated by
# a comma; the first one is used to validate the common CoNLL-U files.
# TODO: This should be somehow factorised out!!
%country2lang = (
    'AT'    => 'de',
    'BA'    => 'sr',        # Should be 'bs', but UD does not support it!
    'BE'    => 'nl, fr',
    'BG'    => 'bg',
    'CZ'    => 'cs',
    'DE'    => 'de',
    'DK'    => 'da',
    'EE'    => 'et',
    'ES'    => 'es',
    'ES-AN' => 'es',
    'ES-CT' => 'ca, es',
    'ES-GA' => 'gl',
    'ES-PV' => 'eu, es',
    'FI'    => 'fi',
    'FR'    => 'fr',
    'GB'    => 'en',
    'GR'    => 'el',
    'HR'    => 'hr',
    'HU'    => 'hu',
    'IS'    => 'is',
    'IT'    => 'it',
    'LT'    => 'lt',
    'LV'    => 'lv',
    'NL'    => 'nl',
    'NO'    => 'no',
    'PL'    => 'pl',
    'PT'    => 'pt',
    'RO'    => 'ro',
    'RS'    => 'sr',
    'SE'    => 'sv',
    'SI'    => 'sl',
    'SK'    => 'sk',
    'TR'    => 'tr',
    'UA'    => 'uk, ru',
    # Fake country for testing:
    'XX'    => 'hr',
);
print STDERR "INFO: Converting directory $inDir\n";

# The corpus root file and the component files found under the input directory
my $rootAnaFile = '';
my @compAnaFiles = ();

# If a specific filename is given, get rid of it
$inDir =~ s|[^/]+\.xml$||;

# Candidate files sit directly in the input directory or one level below it
$corpusFiles = "$inDir/*.ana.xml $inDir/*/*.ana.xml";
foreach $inFile (glob($corpusFiles)) {
    # The root file has no "_..." part after the corpus name
    if ($inFile =~ m|ParlaMint-[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?\.ana\.xml|) {
        $rootAnaFile = $inFile;
        next;
    }
    # Component files carry a "_..." part after the corpus name
    push(@compAnaFiles, $inFile)
        if $inFile =~ m|ParlaMint-[A-Z]{2}(?:-[A-Z0-9]{1,3})?(?:-[a-z]{2,3})?_.+\.ana\.xml|;
}

# The root file name gives the country code and, for machine translated
# corpora, the language suffix (captured in $MT, otherwise undefined)
my ($country, $MT) = $rootAnaFile =~ /ParlaMint-([A-Z]{2}(?:-[A-Z0-9]{1,3})?)(?:-([a-z]{2,3}))?\.ana\.xml/;
die "FATAL ERROR: Can't find country code in root file $rootAnaFile!\n"
    unless defined $country;

# An MT language suffix overrides the per-country language table
if (not defined $MT and not exists($country2lang{$country})) {
    die "FATAL ERROR: Can't find mapping between country code and language(s): ".
        "pls. add \$country2lang{'$country'} to parlamintp2conllu.pl!\n"
}
$langs = defined $MT ? $MT : $country2lang{$country};
# Store all files to be processed in $fileFile, one path per line; this list
# is later piped into the parallel conversion commands.
$fileFile = "$DIR/files.lst";
# A failure to write the list must be fatal: otherwise the conversion
# commands would silently run on an empty or truncated job list.
open(my $listFh, '>:utf8', $fileFile)
    or die "FATAL ERROR: Can't open file list $fileFile for writing: $!\n";
foreach my $compFile (@compAnaFiles) {
    print $listFh "$compFile\n"
}
close($listFh)
    or die "FATAL ERROR: Can't write file list $fileFile: $!\n";

# Make sure the output directory exists
unless (-e "$outDir") {
    mkdir($outDir)
        or die "FATAL ERROR: Can't create output directory $outDir: $!\n";
}
# Remove the results of any previous run (done in Perl, without spawning a shell)
unlink(glob("$outDir/*-meta.tsv"), glob("$outDir/*.conllu"));
# For MTed corpora output only en metadata, for native, both xx and en:
# - for orig corpora make ParlaMint-XX-meta.tsv in corpus language and
#   ParlaMint-XX-meta-en.tsv in English
# - for MTed corpora we produce ParlaMint-XX-en-meta.tsv in English
if ($MT) {@outLangs = ('en')} else {@outLangs = ('xx', 'en')}
foreach my $outLang (@outLangs) {
    # Only the English metadata of a native corpus needs a distinguishing
    # suffix; in every other case the plain suffix is used.
    my $outSuffix = (!$MT and $outLang eq 'en') ? "-meta-en.tsv" : "-meta.tsv";
    # {} and {/.} are GNU parallel placeholders: the input file and its
    # basename without the last extension
    $command = "$Saxon meta=$rootAnaFile" .
        " out-lang=$outLang" .
        " -xsl:$scriptMeta {} > $outDir/{/.}$outSuffix";
    `cat $fileFile | $Para '$command'`;
}
# Drop the ".ana" that {/.} leaves in the output names. The backslash is
# doubled because backticks interpolate like a double-quoted string: a single
# "\." would reach the shell as ".", i.e. "any character".
`rename 's/\\.ana//' $outDir/*-meta*.tsv`;
# Produce common CoNLL-U, even if we have more languages in a corpus;
# it is validated against the first language listed for the corpus
($checkLang) = split(/,\s*/, $langs);

# Convert every component file ({} and {/.} are GNU parallel placeholders)
$command = "$Saxon meta=$rootAnaFile -xsl:$scriptConvert {} > $outDir/{/.}.conllu";
`cat $fileFile | $Para '$command'`;
# Drop the ".ana" that {/.} leaves in the output names. The backslash is
# doubled because backticks interpolate like a double-quoted string: a single
# "\." would reach the shell as ".", i.e. "any character".
`rename 's/\\.ana//' $outDir/*.conllu`;

$command = "python3 $scriptValid --lang $checkLang --level 1 {}";
`ls $outDir/*.conllu | $Para '$command'`;

# MTed corpora do not have syntactic parses, so level 2 is skipped for them.
# Both the command and its execution are guarded: guarding the assignment
# alone would re-run the level 1 command left over in $command.
unless (defined $MT) {
    $command = "python3 $scriptValid --lang $checkLang --level 2 {}";
    `ls $outDir/*.conllu | $Para '$command'`;
}
# Now produce CoNLL-Us for separate languages, if we have them
if ($langs =~ /,/) {
    foreach $lang (split(/,\s*/, $langs)) {
        # Convert only the segments in $lang; the output gets a -$lang suffix
        # ({} and {/.} are GNU parallel placeholders)
        $command = "$Saxon meta=$rootAnaFile seg-lang=$lang -xsl:$scriptConvert {} > $outDir/{/.}-$lang.conllu";
        `cat $fileFile | $Para '$command'`;
        # Drop the ".ana" that {/.} leaves in the output names. The backslash
        # is doubled because backticks interpolate like a double-quoted
        # string: a single "\." would reach the shell as ".".
        `rename 's/\\.ana//' $outDir/*-$lang.conllu`;
        # Validate only the files of this language: the common files and the
        # files of the other languages must not be checked with --lang $lang
        foreach my $level (1, 2) {
            $command = "python3 $scriptValid --lang $lang --level $level {}";
            `ls $outDir/*-$lang.conllu | $Para '$command'`;
        }
    }
}