This repository has been archived by the owner on Sep 24, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocabularies.py
201 lines (190 loc) · 10.6 KB
/
vocabularies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from vocabulary import Vocabulary
from rdflib import Graph, URIRef, Namespace, RDF
import unicodedata
import re
class Vocabularies:
    """Registry of parsed vocabularies with keyword lookup helpers.

    Parsed ``Vocabulary`` objects are kept in ``self.vocabularies``, keyed by
    vocabulary code (for example ``"ysa"``, ``"yso"`` or ``"yso-paikat"``).
    """

    # Label languages used when constructing the Vocabulary for each code.
    _LANGUAGES_BY_VOCABULARY = {
        'ysa': ('fi',),
        'musa': ('fi',),
        'seko': ('fi',),
        'allars': ('sv',),
        'cilla': ('sv',),
        'yso': ('fi', 'sv'),
        'yso-paikat': ('fi', 'sv'),
        'slm': ('fi', 'sv'),
    }

    # Century/era markers that are removed before testing whether a keyword is
    # a chronological (numeric) term. Dotted forms come before undotted ones
    # so that the trailing full stop is consumed together with the marker.
    _NUMERIC_MARKERS = (
        '-luku', '-luvut', '-talet', '-tal',
        'ekr.', 'jkr.', 'fkr.', 'eaa.', 'jaa.',
        'ekr', 'jkr', 'fkr', 'eaa', 'jaa',
    )
    # The markers are escaped so that "." is matched literally; matching is
    # case-insensitive ("eKr." and "ekr." are treated alike).
    _NUMERIC_MARKER_PATTERNS = tuple(
        re.compile(re.escape(marker), re.IGNORECASE)
        for marker in _NUMERIC_MARKERS
    )

    # Punctuation that may appear between or after the digits of a
    # chronological term (various dashes, tilde and full stop).
    _DASH_LIKE_CHARACTERS = {
        "\u002D": "hyphen-minus",
        "\u007E": "tilde",
        "\u00AD": "soft hyphen",
        "\u058A": "armenian hyphen",
        "\u05BE": "hebrew punctuation maqaf",
        "\u1400": "canadian syllabics hyphen",
        "\u1806": "mongolian todo soft hyphen",
        "\u2010": "hyphen",
        "\u2011": "non-breaking hyphen",
        "\u2012": "figure dash",
        "\u2013": "en dash",
        "\u2014": "em dash",
        "\u2015": "horizontal bar",
        "\u2053": "swung dash",
        "\u207B": "superscript minus",
        "\u208B": "subscript minus",
        "\u2212": "minus sign",
        "\u2E17": "double oblique hyphen",
        "\u2E3A": "two-em dash",
        "\u2E3B": "three-em dash",
        "\u301C": "wave dash",
        "\u3030": "wavy dash",
        "\u30A0": "katakana-hiragana double hyphen",
        "\uFE31": "presentation form for vertical em dash",
        "\uFE32": "presentation form for vertical en dash",
        "\uFE58": "small em dash",
        "\uFE63": "small hyphen-minus",
        "\uFF0D": "fullwidth hyphen-minus",
        "\u002E": "full stop",
    }

    # Decomposed (NFD) sequences that are recomposed into single characters.
    _SCANDINAVIAN_COMPOSITIONS = (
        ("A\u030a", "\u00c5"), ("a\u030a", "\u00e5"),
        ("A\u0308", "\u00c4"), ("a\u0308", "\u00e4"),
        ("O\u0308", "\u00d6"), ("o\u0308", "\u00f6"),
    )

    def __init__(self):
        """Create an empty registry; vocabularies are added by parse_vocabulary."""
        self.vocabularies = {}

    def parse_vocabulary(self, vocabulary_code, graphs):
        """Build the Vocabulary for *vocabulary_code* and register it.

        Parameters:
            vocabulary_code: one of the codes in ``_LANGUAGES_BY_VOCABULARY``.
            graphs: mapping from vocabulary code to its graph. "cilla" has no
                graph of its own and is read from ``graphs["musa"]``; "musa"
                and "cilla" additionally need ``graphs["ysa"]``.

        Raises:
            KeyError: if a required graph is missing from *graphs*.
            ValueError: if *vocabulary_code* is not a supported code.
        """
        graph = graphs["musa"] if vocabulary_code == "cilla" else graphs[vocabulary_code]

        if vocabulary_code not in self._LANGUAGES_BY_VOCABULARY:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError("unsupported vocabulary code: %r" % (vocabulary_code,))
        # A fresh list per call, as the original code passed a new list literal.
        language_codes = list(self._LANGUAGES_BY_VOCABULARY[vocabulary_code])

        vocabulary = Vocabulary(vocabulary_code, language_codes)
        if vocabulary_code.startswith("yso"):
            vocabulary.parse_yso_vocabulary(graph)
        elif vocabulary_code in ("ysa", "allars"):
            vocabulary.parse_origin_vocabulary(graph)
        elif vocabulary_code in ("musa", "cilla"):
            vocabulary.parse_musa_vocabulary(graph, graphs['ysa'])
        elif vocabulary_code in ('slm', 'seko'):
            vocabulary.parse_label_vocabulary(graph)
        self.vocabularies[vocabulary_code] = vocabulary

    def search(self, keyword, vocabulary_codes, search_geographical_concepts=False, all_languages=False):
        """Look *keyword* up in the given vocabularies and return the first match.

        Parameters:
            keyword: the search term. It is NFKC-normalized and stripped
                before the lookup.
            vocabulary_codes: iterable of (vocabulary code, language code)
                pairs, tried in order. The pseudo-vocabulary "numeric"
                matches chronological terms accepted by ``is_numeric``.
            search_geographical_concepts: whether a concept that the source
                vocabulary marks as geographical is looked up in YSO-paikat.
                When False such a concept is treated as not found.
            all_languages: whether the result of ``translate_label`` is
                appended to the returned list when it is truthy.

        Returns:
            A list of response dicts. Every dict gets a 'geographical' key;
            concept responses also get their 'label' passed through
            ``normalize_characters``.

        Raises:
            ValueError: with an error code as its message. This method raises
                "1" and "2" itself; the remaining codes are listed for
                reference (presumably raised elsewhere in the conversion).

        Error codes:
            1: no match was found for the term in the vocabularies
            2: the term has several possible matches (several identical
               normalized preferred terms or entry terms)
            3: no match, but the term has exactly one form with a
               parenthetical qualifier
            4: no match, but the term has a form with a parenthetical
               qualifier in two or more concepts
            5: a match was found, but a form with a parenthetical qualifier
               also exists in a different concept
            6: the term of a chain subfield was removed as unnecessary
               (fiction, subjects, music and the chain's $e subfield)
            7: subfield $g "miscellaneous information" of field 650/651 was
               moved to field 653
            8: the field contains subfield codes that do not belong to the
               MARC format, or it has no subject heading subfields
            9: empty subfield
        """
        keyword = unicodedata.normalize('NFKC', keyword).strip()

        for vc in vocabulary_codes:
            vocabulary_code, language = vc[0], vc[1]
            # Reset for every vocabulary so that a failed YSO-paikat lookup
            # cannot mark a later, unrelated match as geographical.
            geographical_concept = False
            response = {}

            if vocabulary_code == "numeric":
                if self.is_numeric(keyword):
                    response = {'numeric': True, 'label': keyword}
                    if language == "fi":
                        response['code'] = 'yso/fin'
                    if language == "sv":
                        response['code'] = 'yso/swe'
            elif vocabulary_code in ('ysa', 'allars', 'musa', 'cilla'):
                response, geographical_concept = self._resolve_origin_concept(
                    keyword, vocabulary_code, language, search_geographical_concepts)
            elif vocabulary_code in ('slm', 'seko'):
                response = self.vocabularies[vocabulary_code].get_concept_with_label(keyword, language)

            if not response:
                continue

            if "uris" in response:
                uris = response['uris']
                if len(uris) > 1:
                    raise ValueError("2")
                if len(uris) == 1:
                    responses = [response]
                    if all_languages:
                        # NOTE: the Vocabulary must return the vocabulary code
                        # in its response, e.g. for YSO-paikat.
                        translation_source = self._translation_vocabulary_code(
                            response, geographical_concept)
                        # Responses from vocabularies without a translation
                        # source (e.g. seko) are returned untranslated.
                        if translation_source is not None:
                            translated_response = self.vocabularies[translation_source].translate_label(uris[0], language)
                            if translated_response:
                                responses.append(translated_response)
                    for r in responses:
                        r['geographical'] = geographical_concept
                        r['label'] = self.normalize_characters(r['label'])
                    return responses

            if "numeric" in response:
                response['geographical'] = geographical_concept
                return [response]

        raise ValueError("1")

    def _resolve_origin_concept(self, keyword, vocabulary_code, language, search_geographical_concepts):
        """Map a keyword of a source vocabulary to a YSO or YSO-paikat concept.

        Returns a ``(response, geographical)`` pair. ``response`` is falsy when
        nothing usable was found; ``geographical`` is True only when the
        lookup was made in YSO-paikat. Raises ValueError("2") when the
        keyword maps to more than one URI.
        """
        origin = self.vocabularies[vocabulary_code]
        response = origin.get_uris_with_concept(keyword)
        if not response or "uris" not in response:
            return response, False

        uris = response["uris"]
        if len(uris) > 1:
            raise ValueError("2")
        if not uris:
            # An empty URI list is treated as "no match" instead of crashing.
            return None, False

        uri = uris[0]
        if uri in origin.geographical_concepts:
            if not search_geographical_concepts:
                return None, False
            return self.vocabularies['yso-paikat'].get_concept_with_uri(uri, language), True
        return self.vocabularies['yso'].get_concept_with_uri(uri, language), False

    @staticmethod
    def _translation_vocabulary_code(response, geographical_concept):
        """Return the code of the vocabulary used to translate *response*, or None."""
        if geographical_concept:
            return "yso-paikat"
        code = response.get('code') or ""
        if code.startswith("yso"):
            return "yso"
        if code.startswith("slm"):
            return "slm"
        return None

    def get_missing_relations(self, source_vocabularies, target_vocabularies):
        """Report source labels that cannot be mapped to the target vocabularies.

        Tests whether every concept that the source vocabularies relate to
        YSO actually has a counterpart in YSO.

        Parameters:
            source_vocabularies: codes of the vocabularies being converted.
            target_vocabularies: codes of the conversion targets, e.g.
                "yso", "yso-paikat".

        Returns:
            A two-item list ``[missing_matches, missing_uris]``:
            missing_matches maps a source vocabulary code to the labels that
            have no URI at all (no close or exact match);
            missing_uris maps a source vocabulary code to a
            ``{label: uri}`` dict of URIs that are found in no target
            vocabulary and have no non-deprecated replacement. Only one URI
            is kept per label.
        """
        missing_matches = {}
        missing_uris = {}

        for source_vocabulary in source_vocabularies:
            source_labels = self.vocabularies[source_vocabulary].labels
            for label in source_labels:
                uris = source_labels[label]
                if len(uris) == 0:
                    missing_matches.setdefault(source_vocabulary, []).append(label)
                for uri in uris:
                    if any(uri in self.vocabularies[vc].labels for vc in target_vocabularies):
                        continue
                    # Register concepts that are missing or deprecated
                    # without a usable replacement.
                    if not self._has_active_replacement(uri, target_vocabularies):
                        missing_uris.setdefault(source_vocabulary, {})[label] = uri

        return [missing_matches, missing_uris]

    def _has_active_replacement(self, uri, target_vocabularies):
        """Return True if *uri* is deprecated in a target and replaced by a non-deprecated concept."""
        for vc in target_vocabularies:
            deprecated = self.vocabularies[vc].deprecated_concepts
            if uri in deprecated:
                replacers = deprecated[uri]
                if replacers and any(r not in deprecated for r in replacers):
                    return True
        return False

    def is_numeric(self, keyword):
        """Return True if *keyword* is a chronological term such as "1900-luku".

        Spaces and the century/era markers in ``_NUMERIC_MARKERS`` are removed
        (case-insensitively, anywhere in the keyword). The remainder must
        contain at least one digit and otherwise only digits and the
        characters in ``_DASH_LIKE_CHARACTERS``.
        """
        if not keyword:
            return False

        remainder = keyword.replace(" ", "")
        for pattern in self._NUMERIC_MARKER_PATTERNS:
            remainder = pattern.sub('', remainder)

        # A bare marker or bare punctuation ("-luku", "-") is not a number.
        if not any(char.isdigit() for char in remainder):
            return False

        # Open question from the original author: TAL/TALET?
        # The Library of Congress has authorized time periods used as
        # subjects in LCSH. The position of a year before or after year 0 is
        # expressed with the suffix "eaa." or "jaa."; the conversion could
        # therefore change [^.*eKr.$|^.*e\.Kr.$|^.*jKr.$|^.*j\.Kr.$] into
        # [^.*eaa.$|^.*jaa.$].
        return all(
            char.isdigit() or char in self._DASH_LIKE_CHARACTERS
            for char in remainder
        )

    def normalize_characters(self, string):
        """Return *string* in NFD form with Scandinavian letters recomposed.

        The letters Å, Ä, Ö (and their lowercase forms) are encoded as single
        code points; every other accented character stays decomposed.
        """
        normalized = unicodedata.normalize('NFD', string)
        for decomposed, composed in self._SCANDINAVIAN_COMPOSITIONS:
            normalized = normalized.replace(decomposed, composed)
        return normalized