Skip to content

Commit 9a278fa

Browse files
authored
Merge pull request #28 from GSA/feature/compare
Feature/compare
2 parents a9aede2 + c31a32e commit 9a278fa

File tree

20 files changed

+1586
-358
lines changed

20 files changed

+1586
-358
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ clean-dist: ## Cleans dist dir
1111
rm -rf dist/*
1212

1313
test: up ## Runs poetry tests, ignores ckan load
14-
poetry run pytest --ignore=./tests/load/ckan
14+
poetry run pytest --ignore=./tests/integration
1515

1616
up: ## Sets up local docker environment
1717
docker compose up -d

README.md

Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,35 +5,86 @@ transformation, and loading into the data.gov catalog.
55

66
## Features
77

8-
The datagov-harvesting-logic offers the following features:
9-
108
- Extract
11-
- general purpose fetching and downloading of web resources.
12-
- catered extraction to the following data formats:
9+
- General purpose fetching and downloading of web resources.
10+
- Catered extraction to the following data formats:
1311
- DCAT-US
1412
- Validation
1513
- DCAT-US
16-
- jsonschema validation using draft 2020-12.
14+
- `jsonschema` validation using draft 2020-12.
1715
- Load
1816
- DCAT-US
19-
- conversion of dcatu-us catalog into ckan dataset schema
20-
- create, delete, update, and patch of ckan package/dataset
17+
- Conversion of dcat-us catalog into ckan dataset schema
18+
- Create, delete, update, and patch of ckan package/dataset
2119

2220
## Requirements
2321

24-
This project is using poetry to manage this project. Install [here](https://python-poetry.org/docs/#installation).
22+
This project uses `poetry` for dependency and packaging management. Install it [here](https://python-poetry.org/docs/#installation).
2523

2624
Once installed, `poetry install` installs dependencies into a local virtual environment.
2725

2826
## Testing
27+
2928
### CKAN load testing
29+
3030
- CKAN load testing doesn't require the services provided in the `docker-compose.yml`.
3131
- [catalog-dev](https://catalog-dev.data.gov/) is used for ckan load testing.
32-
- Create an api-key by signing into catalog-dev.
32+
- Create an api-key by signing into catalog-dev.
3333
- Create a `credentials.py` file at the root of the project containing the variable `ckan_catalog_dev_api_key` assigned to the api-key.
34-
- run tests with the command `poetry run pytest ./tests/load/ckan`
34+
- Run tests with the command `poetry run pytest ./tests/load/ckan`
35+
3536
### Harvester testing
36-
- These tests are found in `extract`, and `validate`. Some of them rely on services in the `docker-compose.yml`. run using docker `docker compose up -d` and with the command `poetry run pytest --ignore=./tests/load/ckan`.
37+
38+
- These tests are found in `extract` and `validate`. Some of them rely on services in the `docker-compose.yml`. Start the services with `docker compose up -d`, then run the tests with the command `poetry run pytest --ignore=./tests/load/ckan`.
3739

3840
If you followed the instructions for `CKAN load testing` and `Harvester testing` you can simply run `poetry run pytest` to run all tests.
3941

42+
## Comparison
43+
44+
- `./tests/harvest_sources/ckan_datasets_resp.json`
45+
- Represents what ckan would respond with after querying for the harvest source name
46+
- `./tests/harvest_sources/dcatus_compare.json`
47+
- Represents a changed harvest source
48+
- Created:
49+
- datasets[0]
50+
51+
```diff
52+
+ "identifier": "cftc-dc10"
53+
```
54+
55+
- Deleted:
56+
- datasets[0]
57+
58+
```diff
59+
- "identifier": "cftc-dc1"
60+
```
61+
62+
- Updated:
63+
- datasets[1]
64+
65+
```diff
66+
- "modified": "R/P1M"
67+
+ "modified": "R/P1M Update"
68+
```
69+
70+
- datasets[2]
71+
72+
```diff
73+
- "keyword": ["cotton on call", "cotton on-call"]
74+
+ "keyword": ["cotton on call", "cotton on-call", "update keyword"]
75+
```
76+
77+
- datasets[3]
78+
79+
```diff
80+
"publisher": {
81+
"name": "U.S. Commodity Futures Trading Commission",
82+
"subOrganizationOf": {
83+
- "name": "U.S. Government"
84+
+ "name": "Changed Value"
85+
}
86+
}
87+
```
88+
89+
- `./tests/harvest_sources/dcatus.json`
90+
- Represents the original harvest source before any changes occurred.

harvester/__init__.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,8 @@
2222
# TODO these imports will need to be updated to ensure a consistent api
2323
from .compare import compare
2424
from .extract import download_waf, extract, traverse_waf
25-
from .load import (
26-
create_ckan_package,
27-
dcatus_to_ckan,
28-
load,
29-
patch_ckan_package,
30-
purge_ckan_package,
31-
update_ckan_package,
32-
)
25+
from .load import (create_ckan_package, dcatus_to_ckan, load,
26+
patch_ckan_package, purge_ckan_package, update_ckan_package)
3327
from .transform import transform
3428
from .utils import *
3529
from .validate import *

harvester/compare.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,22 @@
33
logger = logging.getLogger("harvester")
44

55

6-
# stub, TODO complete
7-
def compare(compare_obj):
6+
def compare(harvest_source, ckan_source):
    """Work out which records must be created, updated or deleted in CKAN.

    Both arguments are mappings keyed by record id. The values are only
    compared for equality, so they may be anything comparable with ``!=``
    (presumably a hash or the dataset metadata -- confirm against the caller).

    Args:
        harvest_source: mapping of id -> value for the current harvest source.
        ckan_source: mapping of id -> value for what CKAN currently holds.

    Returns:
        dict with three lists of ids:
            "create": ids present in ``harvest_source`` only,
            "update": ids present in both whose values differ,
            "delete": ids present in ``ckan_source`` only.
        Each list follows the insertion order of the mapping it was drawn
        from, so the result is reproducible from run to run (iterating a set
        of strings is not, because of hash randomization).
    """
    logger.info("Hello from harvester.compare()")

    return {
        "create": [i for i in harvest_source if i not in ckan_source],
        "update": [
            i
            for i in harvest_source
            if i in ckan_source and harvest_source[i] != ckan_source[i]
        ],
        "delete": [i for i in ckan_source if i not in harvest_source],
    }

harvester/load.py

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import ckanapi
55

6+
from harvester.utils.util import sort_dataset
7+
68
logger = logging.getLogger("harvester")
79

810

@@ -21,7 +23,7 @@ def create_ckan_extra_base(*args):
2123
return [{"key": d[0], "value": d[1]} for d in data]
2224

2325

24-
def create_ckan_extras_additions(dcatus_catalog, additions):
26+
def create_ckan_extras_additions(dcatus_dataset, additions):
2527
extras = [
2628
"accessLevel",
2729
"bureauCode",
@@ -35,10 +37,13 @@ def create_ckan_extras_additions(dcatus_catalog, additions):
3537

3638
for extra in extras:
3739
data = {"key": extra, "value": None}
40+
val = dcatus_dataset[extra]
3841
if extra == "publisher":
39-
data["value"] = dcatus_catalog[extra]["name"]
42+
data["value"] = val["name"]
4043
else:
41-
data["value"] = dcatus_catalog[extra]
44+
if isinstance(val, list): # TODO: confirm this is what we want.
45+
val = val[0]
46+
data["value"] = val
4247
output.append(data)
4348

4449
return output + additions
@@ -70,21 +75,28 @@ def get_email_from_str(in_str):
7075
return res.group(0)
7176

7277

73-
def create_ckan_resources(dists):
78+
def create_ckan_resources(dcatus_dataset):
    """Build the CKAN ``resources`` list from a dataset's distributions.

    Args:
        dcatus_dataset: mapping for a single DCAT-US dataset. Its optional
            "distribution" entry is an iterable of distribution mappings.

    Returns:
        A list with one dict per distribution. Each dict has a "url" taken
        from "downloadURL" when present, otherwise from "accessURL", plus a
        "mimetype" when the distribution declares a "mediaType". The list is
        empty when the dataset has no "distribution" key.

    Raises:
        KeyError: if a distribution has neither "downloadURL" nor "accessURL".
    """
    output = []

    if "distribution" not in dcatus_dataset:
        return output

    for dist in dcatus_dataset["distribution"]:
        url_key = "downloadURL" if "downloadURL" in dist else "accessURL"
        resource = {"url": dist[url_key]}

        # The source field is "mediaType"; "mimetype" is only the name of the
        # key written to the CKAN resource. Testing for "mimetype" on the
        # source side meant the media type was never copied across.
        if "mediaType" in dist:
            resource["mimetype"] = dist["mediaType"]

        output.append(resource)

    return output
8293

8394

84-
def simple_transform(dcatus_catalog):
95+
def simple_transform(dcatus_dataset):
8596
output = {
86-
"name": "-".join(dcatus_catalog["title"].lower().split()),
87-
"owner_org": "test",
97+
"name": "-".join(dcatus_dataset["title"].lower().split()),
98+
"owner_org": "test", # TODO: CHANGE THIS!
99+
"identifier": dcatus_dataset["identifier"],
88100
}
89101

90102
mapping = {
@@ -93,14 +105,17 @@ def simple_transform(dcatus_catalog):
93105
"title": "title",
94106
}
95107

96-
for k, v in dcatus_catalog.items():
108+
for k, v in dcatus_dataset.items():
97109
if k not in mapping:
98110
continue
99111
if isinstance(mapping[k], dict):
100112
temp = {}
113+
to_skip = ["@type"]
101114
for k2, v2 in v.items():
102115
if k2 == "hasEmail":
103116
v2 = get_email_from_str(v2)
117+
if k2 in to_skip:
118+
continue
104119
temp[mapping[k][k2]] = v2
105120
output = {**output, **temp}
106121
else:
@@ -116,7 +131,7 @@ def create_defaults():
116131
}
117132

118133

119-
def dcatus_to_ckan(dcatus_catalog):
134+
def dcatus_to_ckan(dcatus_dataset, harvest_source_name):
120135
"""
121136
example:
122137
- from this:
@@ -126,23 +141,34 @@ def dcatus_to_ckan(dcatus_catalog):
126141
127142
"""
128143

129-
output = simple_transform(dcatus_catalog)
144+
output = simple_transform(dcatus_dataset)
130145

131-
resources = create_ckan_resources(dcatus_catalog["distribution"])
132-
tags = create_ckan_tags(dcatus_catalog["keyword"])
133-
pubisher_hierarchy = create_ckan_publisher_hierarchy(dcatus_catalog["publisher"])
146+
resources = create_ckan_resources(dcatus_dataset)
147+
tags = create_ckan_tags(dcatus_dataset["keyword"])
148+
pubisher_hierarchy = create_ckan_publisher_hierarchy(
149+
dcatus_dataset["publisher"], []
150+
)
134151

135152
extras_base = create_ckan_extra_base(
136-
pubisher_hierarchy, "Dataset", dcatus_catalog["publisher"]["name"]
153+
pubisher_hierarchy, "Dataset", dcatus_dataset["publisher"]["name"]
137154
)
138-
extras = create_ckan_extras_additions(dcatus_catalog, extras_base)
155+
extras = create_ckan_extras_additions(dcatus_dataset, extras_base)
139156

140157
defaults = create_defaults()
141158

142159
output["resources"] = resources
143160
output["tags"] = tags
161+
144162
output["extras"] = extras_base
145163
output["extras"] += extras
164+
output["extras"] += [
165+
{
166+
"key": "dcat_metadata",
167+
"value": str(sort_dataset(dcatus_dataset)),
168+
}
169+
]
170+
171+
output["extras"] += [{"key": "harvest_source_name", "value": harvest_source_name}]
146172

147173
return {**output, **defaults}
148174

@@ -167,3 +193,7 @@ def update_ckan_package(ckan, update_data):
167193

168194
def purge_ckan_package(ckan, package_data):
    """Call the ``dataset_purge`` action on ``ckan`` and return its result.

    ``package_data`` is expanded into keyword arguments for the action.
    ``ckan`` is presumably a ``ckanapi`` client -- confirm against the caller.
    """
    dataset_purge = ckan.action.dataset_purge
    return dataset_purge(**package_data)
196+
197+
198+
def search_ckan(ckan, query):
    """Call the ``package_search`` action on ``ckan`` and return its result.

    ``query`` is expanded into keyword arguments for the action.
    ``ckan`` is presumably a ``ckanapi`` client -- confirm against the caller.
    """
    package_search = ckan.action.package_search
    return package_search(**query)

harvester/utils/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from . import json
1+
from . import json, util
22

3-
__all__ = ["json"]
3+
__all__ = ["json", "util"]

harvester/utils/util.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import hashlib
2+
import json
3+
4+
import sansjson
5+
6+
7+
def sort_dataset(d):
    """Return the result of running ``d`` through ``sansjson.sort_pyobject``."""
    sorted_dataset = sansjson.sort_pyobject(d)
    return sorted_dataset
9+
10+
11+
def dataset_to_hash(d):
    """Return the SHA-256 hex digest of ``d`` serialized as JSON.

    The object is dumped with ``sort_keys=True`` so that dicts differing only
    in key order produce the same digest, then encoded as UTF-8 for hashing.
    """
    canonical = json.dumps(d, sort_keys=True)
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()

poetry.lock

Lines changed: 13 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "datagov-harvesting-logic"
3-
version = "0.0.4"
3+
version = "0.1.0"
44
description = ""
55
# authors = [
66
# {name = "Jin Sun", email = "jin.sun@gsa.gov"},
@@ -25,6 +25,7 @@ deepdiff = ">=6"
2525
pytest = ">=7.3.2"
2626
ckanapi = ">=4.7"
2727
beautifulsoup4 = "^4.12.2"
28+
sansjson = "^0.3.0"
2829

2930
[tool.poetry.group.dev.dependencies]
3031
pytest = "^7.3.0"

0 commit comments

Comments
 (0)