-
Notifications
You must be signed in to change notification settings - Fork 0
/
cldfbench_gasttdir.py
196 lines (166 loc) · 6.9 KB
/
cldfbench_gasttdir.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import pathlib
import re
import sys
import unicodedata
from itertools import zip_longest
from cldfbench import CLDFSpec, Dataset as BaseDataset
from pybtex.database import parse_string
def td_to_tab(cell):
    """Return *cell* with HTML cell boundaries turned into tab characters.

    Tab characters already present in *cell* are first rewritten as the
    two-character sequence backslash + ``t`` so that, afterwards, every real
    tab in the result stands for a ``</td><td>`` boundary.
    """
    # Escaping has to happen before the boundaries become tabs; otherwise
    # the two kinds of tab could no longer be told apart.
    escaped = cell.replace('\t', '\\t')
    return '\t'.join(escaped.split('</td><td>'))
# Named HTML entities that are decoded by html_cleanup, in the order they are
# applied.
_HTML_ENTITY_REPLACEMENTS = (
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&Aring;', '\u00c5'),
    ('&auml;', '\u00e4'),
    ('&ccedil;', '\u00e7'),
    ('&ouml;', '\u00f6'),
    ('&uuml;', '\u00fc'),
)


def html_cleanup(cell):
    """Strip HTML markup from *cell* and decode the entities it contains.

    Steps, in order:

    1. surrounding whitespace is stripped;
    2. anything that looks like a tag (``<...>``) is removed;
    3. ``&lt;`` / ``&gt;`` are decoded (after tag removal, so that decoded
       angle brackets are never mistaken for markup);
    4. ``&nbsp;`` becomes a plain space;
    5. ``&#146;`` becomes a right single quotation mark;
    6. a small set of named letter entities is decoded;
    7. remaining decimal character references (``&#NNN;``) are decoded;
    8. the result is normalised to NFC.

    The trailing semicolon is optional for ``&nbsp``, ``&#146`` and decimal
    references.  Text that contains no markup or entities is returned
    unchanged apart from stripping and NFC normalisation.

    :param cell: text of a single table cell (or column name).
    :return: the cleaned-up text.
    """
    def decode_numeric(match):
        # Leave references that are not valid code points untouched rather
        # than aborting the whole conversion.
        try:
            return chr(int(match.group(1)))
        except (ValueError, OverflowError):
            return match.group(0)

    cell = cell.strip()
    cell = re.sub(r'<[^<>]*>', '', cell)
    cell = cell.replace('&lt;', '<')
    cell = cell.replace('&gt;', '>')
    cell = re.sub(r'&nbsp;?', ' ', cell)
    # Code point 146 is the curly apostrophe in Windows-1252, not in Unicode,
    # so it must be handled before the generic numeric decoding below.
    cell = re.sub(r'&#146;?', '\u2019', cell)
    for entity, character in _HTML_ENTITY_REPLACEMENTS[2:]:
        cell = cell.replace(entity, character)
    cell = re.sub(r'&#(\d+);?', decode_numeric, cell)
    # Compose combining sequences so equal-looking strings compare equal.
    return unicodedata.normalize('NFC', cell)
def make_example_row(langid_by_name, example):
    """Build one example-table row from a cleaned-up raw example.

    :param langid_by_name: mapping from lower-cased language name to
        language ID.
    :param example: mapping with the keys ``language``, ``original``,
        ``gloss``, ``comments``, ``Nr``, ``translation``, ``pov`` and
        ``source``; ``original`` and ``gloss`` are tab-separated.
    :return: dict with the row's ID, language, text, word/gloss lists,
        translation, comment, POV and citation.
    :raises KeyError: if the (corrected) language name is not in
        *langid_by_name* or *example* lacks one of the keys above.
    """
    language_name = example['language'].lower()
    # Known misspellings in the language column.
    for misspelt, corrected in (
            ('nahuatlx', 'nahuatl'),
            ('zapo´tec', 'zapotec'),
            ('sewdish', 'swedish')):
        language_name = language_name.replace(misspelt, corrected)

    words = example['original'].split('\t')
    gloss_list = example['gloss'].split('\t')
    raw_comment = example['comments']

    return {
        'ID': example['Nr'],
        'Language_ID': langid_by_name[language_name],
        'Primary_Text': ' '.join(words),
        'Analyzed_Word': words,
        'Gloss': gloss_list,
        'Translated_Text': example['translation'],
        # '--' is the placeholder for "no comment".
        'Comment': '' if raw_comment == '--' else raw_comment,
        'POV': example['pov'],
        # TODO do sources properly
        'Citation': example['source'],
    }
def render_example(example):
    """Render an example as two lines of column-aligned interlinear text.

    The first line is ``(ID)`` followed by the analysed words; the second
    line holds the glosses, indented by the width of the ``(ID) `` label so
    that each gloss starts in the same column as the word it belongs to.

    When the numbers of words and glosses differ, only the columns that have
    both a word and a gloss are padded; surplus items are appended unpadded.

    :param example: mapping with the keys ``ID``, ``Analyzed_Word`` (list of
        str) and ``Gloss`` (list of str).
    :return: the two rendered lines joined by a newline.
    """
    words = example['Analyzed_Word']
    glosses = example['Gloss']
    label = '({}) '.format(example['ID'])
    # One width per column that has both a word and a gloss.
    widths = [
        max(len(word), len(gloss))
        for word, gloss in zip(words, glosses)]

    def pad_line(items):
        # `widths` is never longer than `items`, so the fill value only ever
        # stands in for a missing width (0 = no padding).
        padded = (
            item.ljust(width)
            for item, width in zip_longest(items, widths, fillvalue=0))
        return ' '.join(padded).rstrip()

    return '{}{}\n{}{}'.format(
        label,
        pad_line(words),
        # Indent by the full label width, parentheses and space included.
        ' ' * len(label),
        pad_line(glosses))
def warn_about_glosses(example_table):
    """Report examples whose word and gloss counts differ on stderr.

    Nothing is printed when every example has as many glosses as analysed
    words.  Otherwise a header line is written, followed by a blank line and
    the rendered example for each offending row.

    :param example_table: iterable of mappings with the keys
        ``Analyzed_Word`` and ``Gloss`` (plus whatever ``render_example``
        needs).
    """
    def is_misaligned(example):
        return len(example['Analyzed_Word']) != len(example['Gloss'])

    offenders = list(filter(is_misaligned, example_table))
    if not offenders:
        return

    print("ERROR: Misaligned glosses in examples:", file=sys.stderr)
    for offender in offenders:
        print(file=sys.stderr)
        print(render_example(offender), file=sys.stderr)
class Dataset(BaseDataset):
    """cldfbench dataset ``gasttdir``, rooted at the directory of this file."""

    dir = pathlib.Path(__file__).parent
    id = "gasttdir"

    def cldf_specs(self):
        """Declare the single CLDF dataset this class creates.

        Returns a ``CLDFSpec`` for a ``StructureDataset`` located in
        ``self.cldf_dir`` with the metadata file ``cldf-metadata.json``.
        """
        spec_options = {
            'dir': self.cldf_dir,
            'module': "StructureDataset",
            'metadata_fname': 'cldf-metadata.json',
        }
        return CLDFSpec(**spec_options)

    def cmd_download(self, args):
        """
        Intentionally does nothing: the raw files were converted by hand.

        Helper methods of `self.raw_dir` would be available here, e.g.

        >>> self.raw_dir.download(url, fname)
        """
        # Note to self: neither openpyxl nor xlrd could read the
        # spreadsheets, because the files actually contain just an HTML
        # table (based on some schema defined by MS?) rather than whatever
        # those libraries expect an Excel sheet to contain.
        # So the files were converted manually in LibreOffice instead of:
        # self.raw_dir.xls2csv('tdir.examples.xls')
        # self.raw_dir.xls2csv('tdir.glosses.xls')
        # self.raw_dir.xls2csv('tdir.languages.xls')
        # self.raw_dir.xls2csv('tdir.references.xls')

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.

        Reads the language values, examples and bibliography from ``raw/``
        and the language and parameter tables from ``etc/``, cleans the HTML
        remnants out of the raw cells, and hands the language, parameter,
        value and example tables plus the sources to ``args.writer``.
        """
        # --- Raw per-language values (column names are cleaned as well).
        original_values = []
        for raw_row in self.raw_dir.read_csv('tdir.languages.csv', dicts=True):
            original_values.append({
                html_cleanup(column): html_cleanup(cell)
                for column, cell in raw_row.items()})

        # --- Curated tables from etc/.
        language_table = self.etc_dir.read_csv('languages.csv', dicts=True)
        parameter_table = self.etc_dir.read_csv('parameters.csv', dicts=True)
        for parameter in parameter_table:
            codes = parameter.get('Grammacodes', '')
            parameter['Grammacodes'] = re.split(r'\s*,\s*', codes)

        # --- Raw examples: cell boundaries become tabs before cleaning.
        raw_examples = self.raw_dir.read_csv('tdir.examples.csv', dicts=True)
        cleaned_examples = [
            {column: html_cleanup(td_to_tab(cell)) for column, cell in row.items()}
            for row in raw_examples]

        bibtex = self.raw_dir.read('tdir.references.bib')
        sources = parse_string(bibtex, 'bibtex')

        # --- Attach the ';'-separated sources of each language.
        language_sources = {}
        for row in original_values:
            glottocode = row['Glottocode']
            stripped = (
                source.strip()
                for source in row.get('Source', '').split(';'))
            language_sources[glottocode] = [
                source for source in stripped if source]
        for lg in language_table:
            found = language_sources.get(lg['Glottocode'])
            lg['Source'] = found if found else []

        # --- Examples ('xxx' marks rows that are to be skipped).
        langid_by_name = {}
        for row in language_table:
            langid_by_name[row['Original_Name'].lower()] = row['ID']
        example_table = []
        for example in cleaned_examples:
            if example['language'] == 'xxx':
                continue
            example_table.append(make_example_row(langid_by_name, example))
        warn_about_glosses(example_table)

        # --- Values: one row per language and parameter with a non-blank
        # cell.
        langid_by_glottocode = {}
        for row in language_table:
            langid_by_glottocode[row['Glottocode']] = row['ID']
        value_table = []
        for value in original_values:
            for param in parameter_table:
                param_id = param['ID']
                if not value.get(param_id, '').strip():
                    continue
                language_id = langid_by_glottocode[value['Glottocode']]
                value_table.append({
                    'ID': '{}-{}'.format(language_id, param_id),
                    'Language_ID': language_id,
                    'Parameter_ID': param_id,
                    'Value': value[param_id].strip(),
                    'Comment': value.get(param.get('Comment_Col')) or '',
                })

        # --- Schema, then data, then sources.
        writer = args.writer
        writer.cldf.add_component(
            'LanguageTable',
            'http://cldf.clld.org/v1.0/terms.rdf#source')
        grammacodes_column = {
            'name': 'Grammacodes',
            'datatype': 'string',
            'separator': ';',
            'dc:extent': 'multivalued',
        }
        writer.cldf.add_component('ParameterTable', grammacodes_column)
        writer.cldf.add_component('ExampleTable', 'POV', 'Citation')

        writer.objects['LanguageTable'] = language_table
        writer.objects['ParameterTable'] = parameter_table
        writer.objects['ValueTable'] = value_table
        writer.objects['ExampleTable'] = example_table

        writer.cldf.add_sources(sources)