Skip to content

Commit 431a660

Browse files
authored
Switch to SSIF 2025 classifications (#68)
1 parent 12a3271 commit 431a660

File tree

17 files changed

+3748
-1686
lines changed

17 files changed

+3748
-1686
lines changed

misc/export_tsv_annif.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@
55

66
from pipeline.storage import get_connection, dict_factory
77
from pipeline.publication import Publication
8-
from pipeline.util import get_title_by_language, get_summary_by_language
8+
from pipeline.util import get_title_by_language, get_summary_by_language, SSIF_SCHEME, SSIF_BASE
9+
from pipeline.ldcache import get_description
910

11+
OLD_SSIF_SCHEME = "https://id.kb.se/term/ssif"
12+
OLD_SSIF_BASE = f"{OLD_SSIF_SCHEME}/"
1013

1114
def dump_tsv(target_language="en", number_of_records=10000, min_level=1, max_level=5):
1215
limit_sql = ""
@@ -27,6 +30,35 @@ def dump_tsv(target_language="en", number_of_records=10000, min_level=1, max_lev
2730
finalized = orjson.loads(row["data"])
2831
publication = Publication(finalized)
2932

33+
# Temporary special handling to make it possible to use old
34+
# DB dump. Rewrites id.kb.se SSIF classifications to the
35+
# new SSIF scheme/base. Also converts SSIF 2011->2025;
36+
# duplicated from legacy_ssif.py. To be removed once we have
37+
# SSIF 2025 terms in prod.
38+
updated_classifications = []
39+
for classification in publication.classifications:
40+
if classification.get("inScheme", {}).get("@id") == OLD_SSIF_SCHEME:
41+
classification["inScheme"]["@id"] = SSIF_SCHEME
42+
classification["@id"] = classification["@id"].replace(OLD_SSIF_BASE, SSIF_BASE)
43+
if not (description := get_description(classification["@id"])):
44+
continue
45+
if is_replaced_bys := description.get("isReplacedBy", []):
46+
if len(is_replaced_bys) == 1:
47+
classification = get_description(is_replaced_bys[0]["@id"])
48+
else:
49+
replaced_by_ids = list(map(lambda x: x["@id"].removeprefix(SSIF_BASE), is_replaced_bys))
50+
level_3 = replaced_by_ids[0][:3]
51+
if all(classification.startswith(level_3) for classification in replaced_by_ids):
52+
classification = get_description(f"{SSIF_BASE}{level_3}")
53+
elif (narrow_match := description.get("narrowMatch", [])) and len(narrow_match) == 1:
54+
classification = get_description(narrow_match[0]["@id"])
55+
elif (close_match := description.get("closeMatch", [])) and len(close_match) == 1:
56+
classification = get_description(close_match[0]["@id"])
57+
else:
58+
continue
59+
updated_classifications.append(classification)
60+
publication.classifications = updated_classifications
61+
3062
# Get SSIF codes, filtered by min/max level, and skip records with
3163
# no classification in the desired levels. For example, with
3264
# min_level 3 max_level 5, records with only 1-level classification
@@ -78,7 +110,7 @@ def dump_tsv(target_language="en", number_of_records=10000, min_level=1, max_lev
78110
expanded_ssif.add(ssif[:1])
79111
expanded_ssif.add(ssif[:3])
80112
ssif_str = " ".join(
81-
[f"<https://id.kb.se/term/ssif/{s}>" for s in expanded_ssif]
113+
[f"<{SSIF_BASE}{s}>" for s in expanded_ssif]
82114
)
83115

84116
# Get non-SSIF keywords in the target language

pipeline/audit.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pipeline.auditors.ssif import SSIFAuditor
77
from pipeline.auditors.contributor import ContributorAuditor
88
from pipeline.auditors.issn import ISSNAuditor
9+
from pipeline.auditors.legacy_ssif import LegacySSIFAuditor
910
from pipeline.auditors.subjects import SubjectsAuditor
1011
from pipeline.auditors.oa import OAAuditor
1112
from pipeline.auditors.autoclassifier import AutoclassifierAuditor
@@ -19,6 +20,7 @@
1920
ContributorAuditor(),
2021
SSIFAuditor(),
2122
ISSNAuditor(),
23+
LegacySSIFAuditor(),
2224
SubjectsAuditor(),
2325
OAAuditor(),
2426
AutoclassifierAuditor(),

pipeline/auditors/autoclassifier.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def _eligible_for_autoclassification(publication):
3131

3232

3333
def _create_classification(code):
34-
classification = get_description(f"https://id.kb.se/term/ssif/{code}").copy()
34+
classification = get_description(f"{SSIF_BASE}{code}").copy()
3535
classification["@annotation"] = {
3636
"assigner": {"@id": SWEPUB_CLASSIFIER_ID}
3737
}

pipeline/auditors/legacy_ssif.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from pipeline.auditors import BaseAuditor
2+
from pipeline.util import SSIF_BASE
3+
from pipeline.ldcache import get_description
4+
5+
class LegacySSIFAuditor(BaseAuditor):
    """Migrate legacy SSIF classifications to SSIF 2025 when possible.

    For every classification on the publication, the description of its
    "@id" is looked up via ``get_description``. If that description points
    to an unambiguous successor, the classification's "@id" is rewritten in
    place. Only "@id" is touched; every other key of the classification is
    left exactly as it was.
    """

    def __init__(self):
        self.name = LegacySSIFAuditor.__name__

    def audit(self, publication, audit_events, _harvest_cache, _session):
        """Rewrite classification "@id"s that have a single clear successor.

        Args:
            publication: object exposing ``classifications``, an iterable of
                dict-like classifications that are mutated in place.
            audit_events: returned unchanged; this auditor records no events.
            _harvest_cache: unused.
            _session: unused.

        Returns:
            The ``(publication, audit_events)`` tuple.
        """
        for classification in publication.classifications:
            # A classification without an "@id" cannot be looked up; skip it
            # rather than aborting the whole audit with a KeyError.
            if not (current_id := classification.get("@id")):
                continue
            if not (description := get_description(current_id)):
                continue
            new_id = self._find_replacement_id(description)
            if new_id is not None:
                classification["@id"] = new_id

        return publication, audit_events

    @staticmethod
    def _find_replacement_id(description):
        """Return the "@id" to migrate to, or None if there is no clear target.

        Precedence: ``isReplacedBy``, then a single ``narrowMatch``, then a
        single ``closeMatch``. A non-empty ``isReplacedBy`` is decisive: the
        match relations are not consulted even if it yields no target.
        """
        replaced_by = description.get("isReplacedBy", [])
        if replaced_by:
            if len(replaced_by) == 1:
                return replaced_by[0]["@id"]
            # Several successors: fall back to their common 3-character
            # prefix (the shared level-3 code), if they all agree on it.
            # NOTE(review): assumes every successor "@id" starts with
            # SSIF_BASE; otherwise the prefix is taken from the full URI.
            codes = [term["@id"].removeprefix(SSIF_BASE) for term in replaced_by]
            level_3 = codes[0][:3]
            if all(code.startswith(level_3) for code in codes):
                return f"{SSIF_BASE}{level_3}"
            return None

        for relation in ("narrowMatch", "closeMatch"):
            matches = description.get(relation, [])
            if len(matches) == 1:
                return matches[0]["@id"]
        return None

pipeline/legacy_publication.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import dateutil.parser
22
from datetime import datetime
3+
from pipeline.util import SSIF_SCHEME
34

45
genre_form_publication_mappings = {
56
"https://id.kb.se/term/swepub/output/publication/editorial-letter": ["art"],
@@ -715,7 +716,7 @@ def _get_provision_activity_statement(body):
715716

716717

717718
def is_ssif_classification(term):
718-
return term.get("inScheme", {}).get("@id", "").startswith(("https://id.kb.se/term/ssif", "https://id.kb.se/term/uka"))
719+
return term.get("inScheme", {}).get("@id", "").startswith(("https://id.kb.se/term/ssif", "https://id.kb.se/term/uka", SSIF_SCHEME))
719720

720721

721722
def _format_date_as_year(date):

pipeline/tests/converter/test_parser.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import re
22
import pytest
33
from lxml.etree import LxmlError, XMLSyntaxError
4+
from pipeline.util import SSIF_SCHEME, SSIF_BASE
45

56
MODS = """
67
<record xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
@@ -300,11 +301,11 @@ def test_parser(parser):
300301
],
301302
"classification": [
302303
{
303-
"@id": "https://id.kb.se/term/ssif/30220",
304+
"@id": f"{SSIF_BASE}30220",
304305
"@type": "Classification",
305306
"code": "30220",
306307
"prefLabelByLang": {"en": "Obstetrics, Gynecology and Reproductive Medicine"},
307-
"inScheme": {"@id": "https://id.kb.se/term/ssif", "@type": "ConceptScheme"},
308+
"inScheme": {"@id": SSIF_SCHEME, "@type": "ConceptScheme"},
308309
#"broader": {
309310
# "prefLabel": "Clinical Medicine",
310311
# "broader": {
@@ -313,11 +314,11 @@ def test_parser(parser):
313314
#}
314315
},
315316
{
316-
"@id": "https://id.kb.se/term/ssif/30305",
317+
"@id": f"{SSIF_BASE}30305",
317318
"@type": "Classification",
318319
"code": "30305",
319320
"prefLabelByLang": {"en": "Nursing"},
320-
"inScheme": {"@id": "https://id.kb.se/term/ssif", "@type": "ConceptScheme"},
321+
"inScheme": {"@id": SSIF_SCHEME, "@type": "ConceptScheme"},
321322
}
322323
],
323324
"hasNote": [
@@ -2274,17 +2275,17 @@ def test_uka_subjects_with_href(parser):
22742275
""")
22752276
expected = [
22762277
{
2277-
"@id": "https://id.kb.se/term/ssif/60203",
2278+
"@id": f"{SSIF_BASE}60203",
22782279
'@type': 'Classification',
22792280
'code': '60203',
2280-
"inScheme": {"@id": "https://id.kb.se/term/ssif", "@type": "ConceptScheme"},
2281+
"inScheme": {"@id": SSIF_SCHEME, "@type": "ConceptScheme"},
22812282
"prefLabelByLang": {"sv": "Litteraturvetenskap"},
22822283
},
22832284
{
2284-
"@id": "https://id.kb.se/term/ssif/60203",
2285+
"@id": f"{SSIF_BASE}60203",
22852286
'@type': 'Classification',
22862287
'code': '60203',
2287-
"inScheme": {"@id": "https://id.kb.se/term/ssif", "@type": "ConceptScheme"},
2288+
"inScheme": {"@id": SSIF_SCHEME, "@type": "ConceptScheme"},
22882289
"prefLabelByLang": {"en": "Literature"},
22892290
},
22902291
]
@@ -2307,24 +2308,24 @@ def test_uka_subject_once_per_level(parser):
23072308
""")
23082309
expected = [
23092310
{
2310-
"@id": "https://id.kb.se/term/ssif/6",
2311+
"@id": f"{SSIF_BASE}6",
23112312
'@type': 'Classification',
23122313
'code': '6',
2313-
"inScheme": {"@id": "https://id.kb.se/term/ssif", "@type": "ConceptScheme"},
2314+
"inScheme": {"@id": SSIF_SCHEME, "@type": "ConceptScheme"},
23142315
"prefLabelByLang": {"sv": "Humaniora"},
23152316
},
23162317
{
2317-
"@id": "https://id.kb.se/term/ssif/602",
2318+
"@id": f"{SSIF_BASE}602",
23182319
'@type': 'Classification',
23192320
'code': '602',
2320-
"inScheme": {"@id": "https://id.kb.se/term/ssif", "@type": "ConceptScheme"},
2321+
"inScheme": {"@id": SSIF_SCHEME, "@type": "ConceptScheme"},
23212322
"prefLabelByLang": {"sv": "Spr\u00e5k och litteratur"},
23222323
},
23232324
{
2324-
"@id": "https://id.kb.se/term/ssif/60203",
2325+
"@id": f"{SSIF_BASE}60203",
23252326
'@type': 'Classification',
23262327
'code': '60203',
2327-
"inScheme": {"@id": "https://id.kb.se/term/ssif", "@type": "ConceptScheme"},
2328+
"inScheme": {"@id": SSIF_SCHEME, "@type": "ConceptScheme"},
23282329
"prefLabelByLang": {"sv": "Litteraturvetenskap"},
23292330
},
23302331
]
@@ -2486,7 +2487,7 @@ def test_ssif_classification(parser):
24862487
'@annotation': {
24872488
'assigner': {'@type': 'SoftwareAgent', 'label': 'GPT-4'}
24882489
},
2489-
'@id': 'https://id.kb.se/term/ssif/60203',
2490+
'@id': f"{SSIF_BASE}60203",
24902491
}
24912492
]
24922493
actual = parser.parse_mods(raw_xml)['instanceOf']['classification']

pipeline/tests/deduplicator/test_merger.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pipeline.publication import Contribution
77
from pipeline.publication import IsPartOf
88

9-
from pipeline.util import SWEPUB_CLASSIFIER_ID
9+
from pipeline.util import SWEPUB_CLASSIFIER_ID, SSIF_BASE
1010

1111
from flexmock import flexmock
1212

@@ -208,12 +208,12 @@ def test_merge_classifications():
208208
"instanceOf": {
209209
"classification": [
210210
{
211-
"@id": "https://id.kb.se/term/ssif/10606",
211+
"@id": f"{SSIF_BASE}10606",
212212
"@type": "Classification",
213213
"prefLabelByLang": {"en": "Microbiology", "sv": "Mikrobiologi"}
214214
},
215215
{
216-
"@id": "https://id.kb.se/term/ssif/10203",
216+
"@id": f"{SSIF_BASE}10203",
217217
"@type": "Classification",
218218
"prefLabelByLang": {"en": "Bioinformatics", "sv": "Bioinformatik"}
219219
}
@@ -225,12 +225,12 @@ def test_merge_classifications():
225225
"instanceOf": {
226226
"classification": [
227227
{
228-
"@id": "https://id.kb.se/term/ssif/10606",
228+
"@id": f"{SSIF_BASE}10606",
229229
"@type": "Classification",
230230
"prefLabelByLang": {"en": "Microbiology", "sv": "Mikrobiologi"}
231231
},
232232
{
233-
"@id": "https://id.kb.se/term/ssif/30102",
233+
"@id": f"{SSIF_BASE}30102",
234234
"@type": "Classification",
235235
"prefLabelByLang": {"en": "Pharmacology and Toxicology", "sv": "Farmakologi och toxikologi"}
236236
}
@@ -244,17 +244,17 @@ def test_merge_classifications():
244244

245245
assert merged_master.classifications == [
246246
{
247-
"@id": "https://id.kb.se/term/ssif/10606",
247+
"@id": f"{SSIF_BASE}10606",
248248
"@type": "Classification",
249249
"prefLabelByLang": {"en": "Microbiology", "sv": "Mikrobiologi"}
250250
},
251251
{
252-
"@id": "https://id.kb.se/term/ssif/10203",
252+
"@id": f"{SSIF_BASE}10203",
253253
"@type": "Classification",
254254
"prefLabelByLang": {"en": "Bioinformatics", "sv": "Bioinformatik"}
255255
},
256256
{
257-
"@id": "https://id.kb.se/term/ssif/30102",
257+
"@id": f"{SSIF_BASE}30102",
258258
"@type": "Classification",
259259
"prefLabelByLang": {"en": "Pharmacology and Toxicology", "sv": "Farmakologi och toxikologi"}
260260
}
@@ -265,12 +265,12 @@ def test_merge_classifications_with_autoclassified_subject():
265265
"instanceOf": {
266266
"classification": [
267267
{
268-
"@id": "https://id.kb.se/term/ssif/10606",
268+
"@id": f"{SSIF_BASE}10606",
269269
"@type": "Classification",
270270
"prefLabelByLang": {"en": "Microbiology", "sv": "Mikrobiologi"}
271271
},
272272
{
273-
"@id": "https://id.kb.se/term/ssif/10203",
273+
"@id": f"{SSIF_BASE}10203",
274274
"@type": "Classification",
275275
"prefLabelByLang": {"en": "Bioinformatics", "sv": "Bioinformatik"}
276276
}
@@ -284,13 +284,13 @@ def test_merge_classifications_with_autoclassified_subject():
284284
"instanceOf": {
285285
"classification": [
286286
{
287-
"@id": "https://id.kb.se/term/ssif/30105",
287+
"@id": f"{SSIF_BASE}30105",
288288
"@type": "Classification",
289289
"prefLabelByLang": {"en": "Neurosciences", "sv": "Neurovetenskaper"},
290290
"@annotation": {"assigner": {"@id": SWEPUB_CLASSIFIER_ID}}
291291
},
292292
{
293-
"@id": "https://id.kb.se/term/ssif/30102",
293+
"@id": f"{SSIF_BASE}30102",
294294
"@type": "Classification",
295295
"prefLabelByLang": {"en": "Pharmacology and Toxicology", "sv": "Farmakologi och toxikologi"}
296296
}
@@ -304,17 +304,17 @@ def test_merge_classifications_with_autoclassified_subject():
304304

305305
assert merged_master.classifications == [
306306
{
307-
"@id": "https://id.kb.se/term/ssif/10606",
307+
"@id": f"{SSIF_BASE}10606",
308308
"@type": "Classification",
309309
"prefLabelByLang": {"en": "Microbiology", "sv": "Mikrobiologi"}
310310
},
311311
{
312-
"@id": "https://id.kb.se/term/ssif/10203",
312+
"@id": f"{SSIF_BASE}10203",
313313
"@type": "Classification",
314314
"prefLabelByLang": {"en": "Bioinformatics", "sv": "Bioinformatik"}
315315
},
316316
{
317-
"@id": "https://id.kb.se/term/ssif/30102",
317+
"@id": f"{SSIF_BASE}30102",
318318
"@type": "Classification",
319319
"prefLabelByLang": {"en": "Pharmacology and Toxicology", "sv": "Farmakologi och toxikologi"}
320320
}

pipeline/util.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from aenum import Enum
2+
from pathlib import Path
23

34
from difflib import SequenceMatcher
45
from jsonpath_rw import parse
@@ -10,6 +11,7 @@
1011

1112
from requests.adapters import Retry
1213
from random import random
14+
from lxml import etree
1315

1416
from pipeline.swepublog import logger as log
1517

@@ -39,7 +41,7 @@
3941

4042
SWEPUB_CLASSIFIER_ID = "https://id.kb.se/generator/swepub-classifier"
4143

42-
SSIF_SCHEME = 'https://id.kb.se/term/ssif'
44+
SSIF_SCHEME = etree.parse(Path(__file__).parent / "../resources/ssif_scheme.xml").getroot().text
4345
SSIF_BASE = f'{SSIF_SCHEME}/'
4446

4547

pipeline/validate.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from pipeline.normalize import *
88

9-
from pipeline.util import get_at_path, remove_at_path, FieldMeta, Enrichment, Validation, Normalization
9+
from pipeline.util import get_at_path, remove_at_path, FieldMeta, Enrichment, Validation, Normalization, SSIF_SCHEME
1010

1111
from pipeline.validators.datetime import validate_date_time
1212
from pipeline.validators.doi import validate_doi
@@ -59,7 +59,7 @@
5959
'instanceOf.subject[?(@.@type=="Topic")].prefLabel',
6060
'instanceOf.hasNote[?(@.@type=="Note")].label',
6161
),
62-
"SSIF": ('instanceOf.classification[?(@.inScheme.@id=="https://id.kb.se/term/ssif")].code',),
62+
"SSIF": (f"instanceOf.classification[*].@id",),
6363
}
6464

6565
PRECOMPILED_PATHS = {k: [parse(p) for p in v] for k, v in PATHS.items()}
@@ -131,8 +131,10 @@ def validate_stuff(field_events, session, harvest_cache, body, source, cached_pa
131131
validate_date_time(field)
132132
if field.id_type == "creator_count":
133133
validate_creator_count(field)
134-
if field.id_type == "SSIF":
135-
validate_ssif(field)
134+
# SSIF is not enriched, so don't validate it twice.
135+
if field.validation_status == Validation.PENDING:
136+
if field.id_type == "SSIF":
137+
validate_ssif(field)
136138
if field.id_type == "free_text":
137139
field.validation_status = Validation.VALID # formerly "AcceptingValidator"
138140

0 commit comments

Comments
 (0)