GSA · rshewitt · Jan 4, 2024 · Dec 27, 2023 · Dec 27, 2023 · Dec 27, 2023
diff --git a/Makefile b/Makefile
@@ -11,7 +11,7 @@ clean-dist:  ## Cleans dist dir
 	rm -rf dist/*
 
 test: up ## Runs poetry tests, ignores ckan load
-	poetry run pytest --ignore=./tests/load/ckan
+	poetry run pytest --ignore=./tests/integration
 
 up: ## Sets up local docker environment
 	docker compose up -d

diff --git a/README.md b/README.md
@@ -5,35 +5,86 @@ transformation, and loading into the data.gov catalog.
 
 ## Features
 
-The datagov-harvesting-logic offers the following features:
-
 - Extract
-  - general purpose fetching and downloading of web resources.
-  - catered extraction to the following data formats:
+  - General purpose fetching and downloading of web resources.
+  - Catered extraction to the following data formats:
     - DCAT-US
 - Validation
   - DCAT-US
-    - jsonschema validation using draft 2020-12.
+    - `jsonschema` validation using draft 2020-12.
 - Load
   - DCAT-US
-    - conversion of dcatu-us catalog into ckan dataset schema
-    - create, delete, update, and patch of ckan package/dataset
+    - Conversion of dcat-us catalog into ckan dataset schema
+    - Create, delete, update, and patch of ckan package/dataset
 
 ## Requirements
 
-This project is using poetry to manage this project. Install [here](https://python-poetry.org/docs/#installation).
+This project is managed with `poetry`. Install it [here](https://python-poetry.org/docs/#installation).
 
 Once installed, `poetry install` installs dependencies into a local virtual environment.
 
 ## Testing
+
 ### CKAN load testing
+
 - CKAN load testing doesn't require the services provided in the `docker-compose.yml`.
 - [catalog-dev](https://catalog-dev.data.gov/) is used for ckan load testing.
-- Create an api-key by signing into catalog-dev. 
+- Create an api-key by signing into catalog-dev.
 - Create a `credentials.py` file at the root of the project containing the variable `ckan_catalog_dev_api_key` assigned to the api-key.
-- run tests with the command `poetry run pytest ./tests/load/ckan`
+- Run tests with the command `poetry run pytest ./tests/load/ckan`
+
 ### Harvester testing
-- These tests are found in `extract`, and `validate`. Some of them rely on services in the `docker-compose.yml`. run using docker `docker compose up -d` and with the command `poetry run pytest --ignore=./tests/load/ckan`. 
+
+- These tests are found in `extract` and `validate`. Some of them rely on services in the `docker-compose.yml`. Start the services with `docker compose up -d`, then run the tests with the command `poetry run pytest --ignore=./tests/load/ckan`.
 
 If you followed the instructions for `CKAN load testing` and `Harvester testing` you can simply run `poetry run pytest` to run all tests.
 
+## Comparison
+
+- `./tests/harvest_sources/ckan_datasets_resp.json`
+  - Represents what ckan would respond with after querying for the harvest source name
+- `./tests/harvest_sources/dcatus_compare.json`
+  - Represents a changed harvest source
+  - Created:
+    - datasets[0]
+
+        ```diff
+        + "identifier": "cftc-dc10"
+        ```
+
+  - Deleted:
+    - datasets[0]
+
+        ```diff
+        - "identifier": "cftc-dc1"
+        ```
+
+  - Updated:
+    - datasets[1]
+
+        ```diff
+        - "modified": "R/P1M"
+        + "modified": "R/P1M Update"
+        ```
+
+    - datasets[2]
+
+        ```diff
+        - "keyword": ["cotton on call", "cotton on-call"]
+        + "keyword": ["cotton on call", "cotton on-call", "update keyword"]
+        ```
+
+    - datasets[3]
+
+        ```diff
+        "publisher": {
+          "name": "U.S. Commodity Futures Trading Commission",
+          "subOrganizationOf": {
+        -   "name": "U.S. Government"
+        +   "name": "Changed Value"
+          }
+        }
+        ```
+
+- `./tests/harvest_sources/dcatus.json`
+  - Represents an original harvest source prior to change occurring.
diff --git a/harvester/__init__.py b/harvester/__init__.py
@@ -22,14 +22,8 @@
 # TODO these imports will need to be updated to ensure a consistent api
 from .compare import compare
 from .extract import download_waf, extract, traverse_waf
-from .load import (
-    create_ckan_package,
-    dcatus_to_ckan,
-    load,
-    patch_ckan_package,
-    purge_ckan_package,
-    update_ckan_package,
-)
+from .load import (create_ckan_package, dcatus_to_ckan, load,
+                   patch_ckan_package, purge_ckan_package, update_ckan_package)
 from .transform import transform
 from .utils import *
 from .validate import *

diff --git a/harvester/compare.py b/harvester/compare.py
@@ -3,9 +3,22 @@
 logger = logging.getLogger("harvester")
 
 
-# stub, TODO complete
-def compare(compare_obj):
+def compare(harvest_source, ckan_source):
     """Compares records"""
     logger.info("Hello from harvester.compare()")
 
-    return compare_obj
+    output = {
+        "create": [],
+        "update": [],
+        "delete": [],
+    }
+
+    harvest_ids = set(harvest_source.keys())
+    ckan_ids = set(ckan_source.keys())
+    same_ids = harvest_ids & ckan_ids
+
+    output["create"] += list(harvest_ids - ckan_ids)
+    output["delete"] += list(ckan_ids - harvest_ids)
+    output["update"] += [i for i in same_ids if harvest_source[i] != ckan_source[i]]
+
+    return output
diff --git a/harvester/load.py b/harvester/load.py
@@ -3,6 +3,8 @@
 
 import ckanapi
 
+from harvester.utils.util import sort_dataset
+
 logger = logging.getLogger("harvester")
 
 
@@ -21,7 +23,7 @@ def create_ckan_extra_base(*args):
     return [{"key": d[0], "value": d[1]} for d in data]
 
 
-def create_ckan_extras_additions(dcatus_catalog, additions):
+def create_ckan_extras_additions(dcatus_dataset, additions):
     extras = [
         "accessLevel",
         "bureauCode",
@@ -35,10 +37,13 @@ def create_ckan_extras_additions(dcatus_catalog, additions):
 
     for extra in extras:
         data = {"key": extra, "value": None}
+        val = dcatus_dataset[extra]
         if extra == "publisher":
-            data["value"] = dcatus_catalog[extra]["name"]
+            data["value"] = val["name"]
         else:
-            data["value"] = dcatus_catalog[extra]
+            if isinstance(val, list):  # TODO: confirm this is what we want.
+                val = val[0]
+            data["value"] = val
         output.append(data)
 
     return output + additions
@@ -70,21 +75,28 @@ def get_email_from_str(in_str):
         return res.group(0)
 
 
-def create_ckan_resources(dists):
+def create_ckan_resources(dcatus_dataset):  # DCAT-US dataset dict -> list of CKAN resource dicts
     output = []
 
-    for dist in dists:
+    if "distribution" not in dcatus_dataset:  # no distributions -> no resources
+        return output
+
+    for dist in dcatus_dataset["distribution"]:
         url_key = "downloadURL" if "downloadURL" in dist else "accessURL"
-        resource = {"url": dist[url_key], "mimetype": dist["mediaType"]}
+        resource = {"url": dist[url_key]}
+        if "mediaType" in dist:  # guard on the key that is actually read below
+            resource["mimetype"] = dist["mediaType"]
+
         output.append(resource)
 
     return output
 
 
-def simple_transform(dcatus_catalog):
+def simple_transform(dcatus_dataset):
     output = {
-        "name": "-".join(dcatus_catalog["title"].lower().split()),
-        "owner_org": "test",
+        "name": "-".join(dcatus_dataset["title"].lower().split()),
+        "owner_org": "test",  # TODO: CHANGE THIS!
+        "identifier": dcatus_dataset["identifier"],
     }
 
     mapping = {
@@ -93,14 +105,17 @@ def simple_transform(dcatus_catalog):
         "title": "title",
     }
 
-    for k, v in dcatus_catalog.items():
+    for k, v in dcatus_dataset.items():
         if k not in mapping:
             continue
         if isinstance(mapping[k], dict):
             temp = {}
+            to_skip = ["@type"]
             for k2, v2 in v.items():
                 if k2 == "hasEmail":
                     v2 = get_email_from_str(v2)
+                if k2 in to_skip:
+                    continue
                 temp[mapping[k][k2]] = v2
             output = {**output, **temp}
         else:
@@ -116,7 +131,7 @@ def create_defaults():
     }
 
 
-def dcatus_to_ckan(dcatus_catalog):
+def dcatus_to_ckan(dcatus_dataset, harvest_source_name):
     """
     example:
     - from this:
@@ -126,23 +141,34 @@ def dcatus_to_ckan(dcatus_catalog):
 
     """
 
-    output = simple_transform(dcatus_catalog)
+    output = simple_transform(dcatus_dataset)
 
-    resources = create_ckan_resources(dcatus_catalog["distribution"])
-    tags = create_ckan_tags(dcatus_catalog["keyword"])
-    pubisher_hierarchy = create_ckan_publisher_hierarchy(dcatus_catalog["publisher"])
+    resources = create_ckan_resources(dcatus_dataset)
+    tags = create_ckan_tags(dcatus_dataset["keyword"])
+    pubisher_hierarchy = create_ckan_publisher_hierarchy(
+        dcatus_dataset["publisher"], []
+    )
 
     extras_base = create_ckan_extra_base(
-        pubisher_hierarchy, "Dataset", dcatus_catalog["publisher"]["name"]
+        pubisher_hierarchy, "Dataset", dcatus_dataset["publisher"]["name"]
     )
-    extras = create_ckan_extras_additions(dcatus_catalog, extras_base)
+    extras = create_ckan_extras_additions(dcatus_dataset, extras_base)
 
     defaults = create_defaults()
 
     output["resources"] = resources
     output["tags"] = tags
+
     output["extras"] = extras_base
     output["extras"] += extras
+    output["extras"] += [
+        {
+            "key": "dcat_metadata",
+            "value": str(sort_dataset(dcatus_dataset)),
+        }
+    ]
+
+    output["extras"] += [{"key": "harvest_source_name", "value": harvest_source_name}]
 
     return {**output, **defaults}
 
@@ -167,3 +193,7 @@ def update_ckan_package(ckan, update_data):
 
 def purge_ckan_package(ckan, package_data):
     return ckan.action.dataset_purge(**package_data)
+
+
+def search_ckan(ckan, query):
+    return ckan.action.package_search(**query)
diff --git a/harvester/utils/__init__.py b/harvester/utils/__init__.py
@@ -1,3 +1,3 @@
-from . import json
+from . import json, util
 
-__all__ = ["json"]
+__all__ = ["json", "util"]
diff --git a/harvester/utils/util.py b/harvester/utils/util.py
@@ -0,0 +1,12 @@
+import hashlib
+import json
+
+import sansjson
+
+
+def sort_dataset(d):
+    return sansjson.sort_pyobject(d)
+
+
+def dataset_to_hash(d):
+    return hashlib.sha256(json.dumps(d, sort_keys=True).encode("utf-8")).hexdigest()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datagov-harvesting-logic"
-version = "0.0.4"
+version = "0.1.0"
 description = ""
 # authors = [
 #     {name = "Jin Sun", email = "jin.sun@gsa.gov"},
@@ -25,6 +25,7 @@ deepdiff = ">=6"
 pytest = ">=7.3.2"
 ckanapi = ">=4.7"
 beautifulsoup4 = "^4.12.2"
+sansjson = "^0.3.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.3.0"