Merge pull request #34 from GSA/load-test-fixes

rshewitt · web-flow · commit 369b124539b9 · 2024-02-06T16:16:23.000-07:00
ckanification utilities
diff --git a/Makefile b/Makefile
@@ -11,7 +11,7 @@ clean-dist:  ## Cleans dist dir
 	rm -rf dist/*
 
 test: up ## Runs poetry tests, ignores ckan load
-	poetry run pytest --ignore=./tests/integration
+	poetry run pytest --ignore=./tests/integration  --ignore=./scripts/load_test.py
 
 up: ## Sets up local docker environment
 	docker compose up -d
diff --git a/harvester/ckan_utils.py b/harvester/ckan_utils.py
@@ -0,0 +1,150 @@
+import re
+
+# all of these are copy/pasted from ckan core
+# https://github.com/ckan/ckan/blob/master/ckan/lib/munge.py
+
+PACKAGE_NAME_MAX_LENGTH = 100
+PACKAGE_NAME_MIN_LENGTH = 2
+
+MAX_TAG_LENGTH = 100
+MIN_TAG_LENGTH = 2
+
+
+def _munge_to_length(string: str, min_length: int, max_length: int) -> str:
+    """Return ``string`` padded with "_" to ``min_length``, then cut to ``max_length``.
+
+    Padding is applied first, so when ``min_length`` exceeds ``max_length``
+    the truncation wins.
+    """
+    # ljust is a no-op when the string is already long enough.
+    padded = string.ljust(min_length, "_")
+    # Slicing is a no-op when the string is already short enough.
+    return padded[:max_length]
+
+
+def substitute_ascii_equivalents(text_unicode: str) -> str:
+    """Return ``text_unicode`` reduced to plain 7-bit ASCII.
+
+    Method taken from: http://code.activestate.com/recipes/251871/
+
+    * Characters below 0x80 are kept unchanged.
+    * Accented Latin-1 letters (0xC0-0xFF) are replaced by the unaccented
+      equivalent listed in ``char_mapping``; some expand to two characters
+      (e.g. 0xC6 -> "Ae", 0xDF -> "ss").
+    * Every other character at or above 0x80 is deleted. This includes the
+      Latin-1 symbols 0xA1-0xBF, 0xD7 and 0xF7: the recipe this is based on
+      mapped them to text such as "{cent}" or "<<", but they have no entry
+      in the table used here.
+    """
+    # fmt: off
+    char_mapping = {
+        0xC0: "A", 0xC1: "A", 0xC2: "A", 0xC3: "A", 0xC4: "A", 0xC5: "A",
+        0xC6: "Ae",
+        0xC7: "C",
+        0xC8: "E", 0xC9: "E", 0xCA: "E", 0xCB: "E",
+        0xCC: "I", 0xCD: "I", 0xCE: "I", 0xCF: "I",
+        0xD0: "Th",
+        0xD1: "N",
+        0xD2: "O", 0xD3: "O", 0xD4: "O", 0xD5: "O", 0xD6: "O", 0xD8: "O",
+        0xD9: "U", 0xDA: "U", 0xDB: "U", 0xDC: "U",
+        0xDD: "Y",
+        0xDE: "th",
+        0xDF: "ss",
+        0xE0: "a", 0xE1: "a", 0xE2: "a", 0xE3: "a", 0xE4: "a", 0xE5: "a",
+        0xE6: "ae",
+        0xE7: "c",
+        0xE8: "e", 0xE9: "e", 0xEA: "e", 0xEB: "e",
+        0xEC: "i", 0xED: "i", 0xEE: "i", 0xEF: "i",
+        0xF0: "th",
+        0xF1: "n",
+        0xF2: "o", 0xF3: "o", 0xF4: "o", 0xF5: "o", 0xF6: "o", 0xF8: "o",
+        0xF9: "u", 0xFA: "u", 0xFB: "u", 0xFC: "u",
+        0xFD: "y",
+        0xFE: "th",
+        0xFF: "y",
+    }
+    # fmt: on
+
+    # Collect the pieces and join once instead of growing a string with +=.
+    pieces = []
+    for char in text_unicode:
+        code = ord(char)
+        if code in char_mapping:
+            pieces.append(char_mapping[code])
+        elif code < 0x80:
+            pieces.append(char)
+        # Anything else (>= 0x80 with no mapping) is intentionally dropped.
+    return "".join(pieces)
+
+
+def munge_title_to_name(name: str) -> str:
+    """Munge a package title into a package name.
+
+    The title is reduced to ASCII, lower-cased and restricted to letters,
+    digits, "-" and "_"; spaces and the separators ".", ":" and "/" become
+    hyphens. Names longer than ``PACKAGE_NAME_MAX_LENGTH - 5`` are truncated
+    (keeping a trailing year-like number if there is one), and the result is
+    finally padded/truncated to lie between ``PACKAGE_NAME_MIN_LENGTH`` and
+    ``PACKAGE_NAME_MAX_LENGTH`` characters.
+    """
+    name = substitute_ascii_equivalents(name)
+    # convert spaces and separators
+    name = re.sub("[ .:/]", "-", name)
+    # take out not-allowed characters
+    name = re.sub("[^a-zA-Z0-9-_]", "", name).lower()
+    # collapse runs of hyphens into a single hyphen
+    name = re.sub("-+", "-", name)
+    # remove leading or trailing hyphens
+    name = name.strip("-")
+    # Truncate to a little less than the maximum, in case we need a few
+    # '_' chars to de-clash names.
+    max_length = PACKAGE_NAME_MAX_LENGTH - 5
+    if len(name) > max_length:
+        # If the name ends in 2-4 digits after a "-" or "_" (optionally
+        # preceded by another 2-4 digit group, e.g. "2011-12"), treat that
+        # as a year and keep it while truncating the text before it.
+        year_match = re.match(r".*?[_-]((?:\d{2,4}[-/])?\d{2,4})$", name)
+        if year_match:
+            year = year_match.groups()[0]
+            # reserve room for the year plus the joining hyphen
+            name = "%s-%s" % (name[: (max_length - len(year) - 1)], year)
+        else:
+            name = name[:max_length]
+    # pads very short names with "_" up to the minimum length
+    name = _munge_to_length(name, PACKAGE_NAME_MIN_LENGTH, PACKAGE_NAME_MAX_LENGTH)
+    return name
+
+
+def munge_tag(tag: str) -> str:
+    """Munge a keyword into a tag name.
+
+    The tag is reduced to ASCII, lower-cased and stripped of surrounding
+    whitespace; characters other than letters, digits, "-" and space are
+    removed, each remaining space becomes a hyphen, and the result is
+    padded/truncated to between MIN_TAG_LENGTH and MAX_TAG_LENGTH characters.
+    """
+    ascii_only = substitute_ascii_equivalents(tag)
+    normalized = ascii_only.lower().strip()
+    allowed_only = re.sub(r"[^a-zA-Z0-9\- ]", "", normalized)
+    # Spaces are replaced one-for-one, so runs of spaces yield runs of hyphens.
+    hyphenated = allowed_only.replace(" ", "-")
+    return _munge_to_length(hyphenated, MIN_TAG_LENGTH, MAX_TAG_LENGTH)
diff --git a/harvester/harvest.py b/harvester/harvest.py
@@ -16,14 +16,14 @@
 from dotenv import load_dotenv
 from jsonschema import Draft202012Validator
 
-# TODO: add relative import "." for utils
 from .utils import (
     S3Handler,
     convert_set_to_list,
     dataset_to_hash,
     open_json,
     sort_dataset,
 )
+from .ckan_utils import munge_tag, munge_title_to_name
 
 load_dotenv()
 
@@ -529,8 +529,7 @@ def create_ckan_tags(self, keywords: list[str]) -> list:
         output = []
 
         for keyword in keywords:
-            keyword = "-".join(keyword.split())
-            output.append({"name": keyword})
+            output.append({"name": munge_tag(keyword)})
 
         return output
 
@@ -569,10 +568,7 @@ def create_ckan_resources(self, metadata: dict) -> list[dict]:
 
     def simple_transform(self, metadata: dict) -> dict:
         output = {
-            # "name": "-".join(str(metadata["title"]).lower().replace(".", "").split()),
-            "name": "".join([s for s in str(metadata["title"]).lower() if s.isalnum()])[
-                :99
-            ],  # TODO: need to add the random character suffix thing
+            "name": munge_title_to_name(metadata["title"]),
             "owner_org": self.harvest_source.owner_org,
             "identifier": metadata["identifier"],
             "author": None,  # TODO: CHANGE THIS!
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datagov-harvesting-logic"
-version = "0.3.1"
+version = "0.3.2"
 description = ""
 # authors = [
 #     {name = "Jin Sun", email = "jin.sun@gsa.gov"},
diff --git a/tests/unit/load/test_ckan_load.py b/tests/unit/load/test_ckan_load.py
@@ -43,7 +43,7 @@ def test_ckanify_dcatus(self, dcatus_config):
         harvest_source.get_harvest_records_as_id_hash()
 
         expected_result = {
-            "name": "commitmentoftraders",
+            "name": "commitment-of-traders",
             "owner_org": "example_organization",
             "identifier": "cftc-dc1",
             "author": None,
diff --git a/tests/unit/utils/test_utils.py b/tests/unit/utils/test_utils.py
@@ -0,0 +1,46 @@
+from harvester.ckan_utils import munge_tag, munge_title_to_name
+import pytest
+
+# these tests are copied from
+# https://github.com/ckan/ckan/blob/master/ckan/tests/lib/test_munge.py
+
+
+class TestCKANUtils:
+    """Tests for the munge helpers imported from harvester.ckan_utils."""
+
+    @pytest.mark.parametrize(
+        "original,expected",
+        [
+            ("unchanged", "unchanged"),
+            ("s", "s_"),  # too short, padded with "_"
+            # each space becomes its own hyphen (runs are not collapsed)
+            ("some spaces  here", "some-spaces--here"),
+            # punctuation, including "_" and ".", is removed from tags
+            ("random:other%characters&_.here", "randomothercharactershere"),
+            ("river-water-dashes", "river-water-dashes"),
+        ],
+    )
+    def test_munge_tag_multiple_pass(self, original, expected):
+        """Munging a tag multiple times gives the same expected result."""
+
+        first_munge = munge_tag(original)
+        assert first_munge == expected
+        # munging an already-munged tag must not change it again
+        second_munge = munge_tag(first_munge)
+        assert second_munge == expected
+
+    @pytest.mark.parametrize(
+        "original,expected",
+        [
+            ("unchanged", "unchanged"),
+            # runs of hyphens are collapsed for names
+            ("some spaces  here    &here", "some-spaces-here-here"),
+            ("s", "s_"),  # too short, padded with "_"
+            # ":" becomes a hyphen; "%" and "&" are removed
+            ("random:other%character&", "random-othercharacter"),
+            # \xfc (u with umlaut) is replaced by plain "u"
+            ("u with umlaut \xfc", "u-with-umlaut-u"),
+            # over-long names are cut to 95 characters
+            ("reallylong" * 12, "reallylong" * 9 + "reall"),
+            # a trailing year is kept when an over-long name is cut
+            ("reallylong" * 12 + " - 2012", "reallylong" * 9 + "-2012"),
+            (
+                "10cm - 50cm Near InfraRed (NI) Digital Aerial Photography (AfA142)",
+                "10cm-50cm-near-infrared-ni-digital-aerial-photography-afa142",
+            ),
+        ],
+    )
+    def test_munge_title_to_name(self, original, expected):
+        """Munging a title gives the expected package name."""
+        munge = munge_title_to_name(original)
+        assert munge == expected