Skip to content

Commit 369b124

Browse files
authored
Merge pull request #34 from GSA/load-test-fixes
ckanification utilities
2 parents 0c37581 + 3b91a49 commit 369b124

File tree

6 files changed

+202
-10
lines changed

6 files changed

+202
-10
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ clean-dist: ## Cleans dist dir
1111
rm -rf dist/*
1212

1313
test: up ## Runs poetry tests, ignores ckan load
14-
poetry run pytest --ignore=./tests/integration
14+
poetry run pytest --ignore=./tests/integration --ignore=./scripts/load_test.py
1515

1616
up: ## Sets up local docker environment
1717
docker compose up -d

harvester/ckan_utils.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import re
2+
3+
# all of these are copy/pasted from ckan core
4+
# https://github.com/ckan/ckan/blob/master/ckan/lib/munge.py
5+
6+
# Length bounds applied when munging package (dataset) names.
# Values are copied from CKAN core's munge module.
PACKAGE_NAME_MAX_LENGTH = 100
PACKAGE_NAME_MIN_LENGTH = 2

# Length bounds applied when munging tag names.
MAX_TAG_LENGTH = 100
MIN_TAG_LENGTH = 2
11+
12+
13+
def _munge_to_length(string: str, min_length: int, max_length: int) -> str:
14+
"""Pad/truncates a string"""
15+
if len(string) < min_length:
16+
string += "_" * (min_length - len(string))
17+
if len(string) > max_length:
18+
string = string[:max_length]
19+
return string
20+
21+
22+
# Latin-1 code point -> 7-bit ASCII replacement.
# Method taken from: http://code.activestate.com/recipes/251871/
# Built once at import time instead of on every call.
_LATIN1_TO_ASCII = {
    0xC0: "A",
    0xC1: "A",
    0xC2: "A",
    0xC3: "A",
    0xC4: "A",
    0xC5: "A",
    0xC6: "Ae",
    0xC7: "C",
    0xC8: "E",
    0xC9: "E",
    0xCA: "E",
    0xCB: "E",
    0xCC: "I",
    0xCD: "I",
    0xCE: "I",
    0xCF: "I",
    0xD0: "Th",
    0xD1: "N",
    0xD2: "O",
    0xD3: "O",
    0xD4: "O",
    0xD5: "O",
    0xD6: "O",
    0xD8: "O",
    0xD9: "U",
    0xDA: "U",
    0xDB: "U",
    0xDC: "U",
    0xDD: "Y",
    0xDE: "th",
    0xDF: "ss",
    0xE0: "a",
    0xE1: "a",
    0xE2: "a",
    0xE3: "a",
    0xE4: "a",
    0xE5: "a",
    0xE6: "ae",
    0xE7: "c",
    0xE8: "e",
    0xE9: "e",
    0xEA: "e",
    0xEB: "e",
    0xEC: "i",
    0xED: "i",
    0xEE: "i",
    0xEF: "i",
    0xF0: "th",
    0xF1: "n",
    0xF2: "o",
    0xF3: "o",
    0xF4: "o",
    0xF5: "o",
    0xF6: "o",
    0xF8: "o",
    0xF9: "u",
    0xFA: "u",
    0xFB: "u",
    0xFC: "u",
    0xFD: "y",
    0xFE: "th",
    0xFF: "y",
    # Symbol mappings from the original recipe are intentionally disabled,
    # so these characters are dropped rather than converted:
    # 0xa1: '!', 0xa2: '{cent}', 0xa3: '{pound}', 0xa4: '{currency}',
    # 0xa5: '{yen}', 0xa6: '|', 0xa7: '{section}', 0xa8: '{umlaut}',
    # 0xa9: '{C}', 0xaa: '{^a}', 0xab: '<<', 0xac: '{not}',
    # 0xad: '-', 0xae: '{R}', 0xaf: '_', 0xb0: '{degrees}',
    # 0xb1: '{+/-}', 0xb2: '{^2}', 0xb3: '{^3}', 0xb4:"'",
    # 0xb5: '{micro}', 0xb6: '{paragraph}', 0xb7: '*', 0xb8: '{cedilla}',
    # 0xb9: '{^1}', 0xba: '{^o}', 0xbb: '>>',
    # 0xbc: '{1/4}', 0xbd: '{1/2}', 0xbe: '{3/4}', 0xbf: '?',
    # 0xd7: '*', 0xf7: '/'
}


def substitute_ascii_equivalents(text_unicode: str) -> str:
    """Replace Latin-1 letters with 7-bit ASCII equivalents.

    All characters in the standard 7-bit ASCII range are preserved.
    Latin-1 accented letters (0xC0-0xFF, except the multiplication and
    division signs) are converted to unaccented equivalents, some of which
    are two characters long (e.g. ``0xC6`` -> ``"Ae"``, ``0xDF`` -> ``"ss"``).
    Every other non-ASCII character, including the Latin-1 symbols and
    anything beyond Latin-1, is deleted.

    Args:
        text_unicode: The text to convert.

    Returns:
        A string containing only 7-bit ASCII characters.
    """
    pieces = []
    for char in text_unicode:
        code_point = ord(char)
        replacement = _LATIN1_TO_ASCII.get(code_point)
        if replacement is not None:
            pieces.append(replacement)
        elif code_point < 0x80:
            # Plain ASCII passes through untouched.
            pieces.append(char)
        # Unmapped non-ASCII characters are dropped on purpose.
    return "".join(pieces)
117+
118+
119+
def munge_title_to_name(name: str) -> str:
    """Munge a package title into a package name.

    The title is reduced to ASCII, spaces and the separators ``.``, ``:``
    and ``/`` become hyphens, any character other than ASCII letters,
    digits, ``-`` and ``_`` is removed, the result is lowercased, runs of
    hyphens are collapsed and leading/trailing hyphens are stripped.

    Names longer than ``PACKAGE_NAME_MAX_LENGTH - 5`` are truncated; when
    the name ends in a year-like suffix (e.g. ``-2012`` or ``-2011-12``)
    that suffix is kept and the text before it is shortened instead.
    Finally the name is padded/truncated to the package name length bounds.

    Args:
        name: The human-readable package title.

    Returns:
        The munged package name.
    """
    name = substitute_ascii_equivalents(name)
    # Convert spaces and separators to hyphens.
    name = re.sub(r"[ .:/]", "-", name)
    # Take out not-allowed characters. The hyphen is placed last in the
    # class so it cannot be misread as a range.
    name = re.sub(r"[^a-zA-Z0-9_-]", "", name).lower()
    # Collapse repeated hyphens.
    name = re.sub(r"-+", "-", name)
    # Remove leading or trailing hyphens.
    name = name.strip("-")
    # Keep the length a little below the maximum, in case a few '_' chars
    # are needed later to de-clash names.
    max_length = PACKAGE_NAME_MAX_LENGTH - 5
    if len(name) > max_length:
        # If the name is too long, keep the last word when it is a year.
        year_match = re.match(r".*?[_-]((?:\d{2,4}[-/])?\d{2,4})$", name)
        if year_match:
            year = year_match.group(1)
            # Reserve room for the year and the joining hyphen.
            name = f"{name[: max_length - len(year) - 1]}-{year}"
        else:
            name = name[:max_length]
    return _munge_to_length(name, PACKAGE_NAME_MIN_LENGTH, PACKAGE_NAME_MAX_LENGTH)
143+
144+
145+
def munge_tag(tag: str) -> str:
    """Munge a keyword into a tag name.

    The tag is reduced to ASCII, lowercased and stripped of surrounding
    whitespace. Characters other than ASCII letters, digits, hyphens and
    spaces are removed, and each remaining space becomes a hyphen
    (consecutive spaces are not collapsed). The result is then
    padded/truncated to the tag length bounds.

    Args:
        tag: The raw keyword text.

    Returns:
        The munged tag name.
    """
    tag = substitute_ascii_equivalents(tag)
    tag = tag.lower().strip()
    tag = re.sub(r"[^a-zA-Z0-9\- ]", "", tag).replace(" ", "-")
    tag = _munge_to_length(tag, MIN_TAG_LENGTH, MAX_TAG_LENGTH)
    return tag

harvester/harvest.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,14 @@
1616
from dotenv import load_dotenv
1717
from jsonschema import Draft202012Validator
1818

19-
# TODO: add relative import "." for utils
2019
from .utils import (
2120
S3Handler,
2221
convert_set_to_list,
2322
dataset_to_hash,
2423
open_json,
2524
sort_dataset,
2625
)
26+
from .ckan_utils import munge_tag, munge_title_to_name
2727

2828
load_dotenv()
2929

@@ -529,8 +529,7 @@ def create_ckan_tags(self, keywords: list[str]) -> list:
529529
output = []
530530

531531
for keyword in keywords:
532-
keyword = "-".join(keyword.split())
533-
output.append({"name": keyword})
532+
output.append({"name": munge_tag(keyword)})
534533

535534
return output
536535

@@ -569,10 +568,7 @@ def create_ckan_resources(self, metadata: dict) -> list[dict]:
569568

570569
def simple_transform(self, metadata: dict) -> dict:
571570
output = {
572-
# "name": "-".join(str(metadata["title"]).lower().replace(".", "").split()),
573-
"name": "".join([s for s in str(metadata["title"]).lower() if s.isalnum()])[
574-
:99
575-
], # TODO: need to add the random character suffix thing
571+
"name": munge_title_to_name(metadata["title"]),
576572
"owner_org": self.harvest_source.owner_org,
577573
"identifier": metadata["identifier"],
578574
"author": None, # TODO: CHANGE THIS!

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "datagov-harvesting-logic"
3-
version = "0.3.1"
3+
version = "0.3.2"
44
description = ""
55
# authors = [
66
# {name = "Jin Sun", email = "jin.sun@gsa.gov"},

tests/unit/load/test_ckan_load.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def test_ckanify_dcatus(self, dcatus_config):
4343
harvest_source.get_harvest_records_as_id_hash()
4444

4545
expected_result = {
46-
"name": "commitmentoftraders",
46+
"name": "commitment-of-traders",
4747
"owner_org": "example_organization",
4848
"identifier": "cftc-dc1",
4949
"author": None,

tests/unit/utils/test_utils.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from harvester.ckan_utils import munge_tag, munge_title_to_name
2+
import pytest
3+
4+
# these tests are copied from
5+
# https://github.com/ckan/ckan/blob/master/ckan/tests/lib/test_munge.py
6+
7+
8+
class TestCKANUtils:
    """Unit tests for the munging helpers in ``harvester.ckan_utils``."""

    @pytest.mark.parametrize(
        "original,expected",
        [
            ("unchanged", "unchanged"),
            ("s", "s_"),  # too short
            # Two consecutive spaces: munge_tag turns each space into a
            # hyphen and does not collapse them.
            ("some spaces  here", "some-spaces--here"),
            ("random:other%characters&_.here", "randomothercharactershere"),
            ("river-water-dashes", "river-water-dashes"),
        ],
    )
    def test_munge_tag_multiple_pass(self, original, expected):
        """Munging a tag multiple times gives the same expected result."""

        first_munge = munge_tag(original)
        assert first_munge == expected
        second_munge = munge_tag(first_munge)
        assert second_munge == expected

    @pytest.mark.parametrize(
        "original,expected",
        [
            ("unchanged", "unchanged"),
            ("some spaces here &here", "some-spaces-here-here"),
            ("s", "s_"),  # too short
            ("random:other%character&", "random-othercharacter"),
            ("u with umlaut \xfc", "u-with-umlaut-u"),
            ("reallylong" * 12, "reallylong" * 9 + "reall"),
            ("reallylong" * 12 + " - 2012", "reallylong" * 9 + "-2012"),
            (
                "10cm - 50cm Near InfraRed (NI) Digital Aerial Photography (AfA142)",
                "10cm-50cm-near-infrared-ni-digital-aerial-photography-afa142",
            ),
        ],
    )
    def test_munge_title_to_name(self, original, expected):
        """Munging a list of titles gives the expected names."""
        munge = munge_title_to_name(original)
        assert munge == expected

0 commit comments

Comments
 (0)