diff --git a/Makefile b/Makefile index 264bef9c..cc7ce6ed 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ clean-dist: ## Cleans dist dir rm -rf dist/* test: up ## Runs poetry tests, ignores ckan load - poetry run pytest --ignore=./tests/load/ckan + poetry run pytest --ignore=./tests/integration up: ## Sets up local docker environment docker compose up -d diff --git a/README.md b/README.md index f977fd4a..dba131f8 100644 --- a/README.md +++ b/README.md @@ -5,35 +5,86 @@ transformation, and loading into the data.gov catalog. ## Features -The datagov-harvesting-logic offers the following features: - - Extract - - general purpose fetching and downloading of web resources. - - catered extraction to the following data formats: + - General purpose fetching and downloading of web resources. + - Catered extraction to the following data formats: - DCAT-US - Validation - DCAT-US - - jsonschema validation using draft 2020-12. + - `jsonschema` validation using draft 2020-12. - Load - DCAT-US - - conversion of dcatu-us catalog into ckan dataset schema - - create, delete, update, and patch of ckan package/dataset + - Conversion of dcat-us catalog into ckan dataset schema + - Create, delete, update, and patch of ckan package/dataset ## Requirements -This project is using poetry to manage this project. Install [here](https://python-poetry.org/docs/#installation). +This project is using `poetry` to manage this project. Install [here](https://python-poetry.org/docs/#installation). Once installed, `poetry install` installs dependencies into a local virtual environment. ## Testing + ### CKAN load testing + - CKAN load testing doesn't require the services provided in the `docker-compose.yml`. - [catalog-dev](https://catalog-dev.data.gov/) is used for ckan load testing. -- Create an api-key by signing into catalog-dev. +- Create an api-key by signing into catalog-dev. 
- Create a `credentials.py` file at the root of the project containing the variable `ckan_catalog_dev_api_key` assigned to the api-key. -- run tests with the command `poetry run pytest ./tests/load/ckan` +- Run tests with the command `poetry run pytest ./tests/load/ckan` + ### Harvester testing -- These tests are found in `extract`, and `validate`. Some of them rely on services in the `docker-compose.yml`. run using docker `docker compose up -d` and with the command `poetry run pytest --ignore=./tests/load/ckan`. + +- These tests are found in `extract` and `validate`. Some of them rely on services in the `docker-compose.yml`. Start the services with `docker compose up -d`, then run the tests with the command `poetry run pytest --ignore=./tests/load/ckan`. If you followed the instructions for `CKAN load testing` and `Harvester testing` you can simply run `poetry run pytest` to run all tests. +## Comparison + +- `./tests/harvest-sources/dcatus/ckan_datasets_resp.json` + - Represents what ckan would respond with after querying for the harvest source name +- `./tests/harvest_sources/dcatus_compare.json` + - Represents a changed harvest source + - Created: + - datasets[0] + + ```diff + + "identifier": "cftc-dc10" + ``` + + - Deleted: + - datasets[0] + + ```diff + - "identifier": "cftc-dc1" + ``` + + - Updated: + - datasets[1] + + ```diff + - "modified": "R/P1M" + + "modified": "R/P1M Update" + ``` + + - datasets[2] + + ```diff + - "keyword": ["cotton on call", "cotton on-call"] + + "keyword": ["cotton on call", "cotton on-call", "update keyword"] + ``` + + - datasets[3] + + ```diff + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + - "name": "U.S. Government" + + "name": "Changed Value" + } + } + ``` + +- `./tests/harvest_sources/dcatus.json` + - Represents an original harvest source prior to the change occurring. 
diff --git a/harvester/__init__.py b/harvester/__init__.py index 8f14b9fe..7723990a 100644 --- a/harvester/__init__.py +++ b/harvester/__init__.py @@ -22,14 +22,8 @@ # TODO these imports will need to be updated to ensure a consistent api from .compare import compare from .extract import download_waf, extract, traverse_waf -from .load import ( - create_ckan_package, - dcatus_to_ckan, - load, - patch_ckan_package, - purge_ckan_package, - update_ckan_package, -) +from .load import (create_ckan_package, dcatus_to_ckan, load, + patch_ckan_package, purge_ckan_package, update_ckan_package) from .transform import transform from .utils import * from .validate import * diff --git a/harvester/compare.py b/harvester/compare.py index 9d1bff7a..feebcd6b 100644 --- a/harvester/compare.py +++ b/harvester/compare.py @@ -3,9 +3,22 @@ logger = logging.getLogger("harvester") -# stub, TODO complete -def compare(compare_obj): +def compare(harvest_source, ckan_source): """Compares records""" logger.info("Hello from harvester.compare()") - return compare_obj + output = { + "create": [], + "update": [], + "delete": [], + } + + harvest_ids = set(harvest_source.keys()) + ckan_ids = set(ckan_source.keys()) + same_ids = harvest_ids & ckan_ids + + output["create"] += list(harvest_ids - ckan_ids) + output["delete"] += list(ckan_ids - harvest_ids) + output["update"] += [i for i in same_ids if harvest_source[i] != ckan_source[i]] + + return output diff --git a/harvester/load.py b/harvester/load.py index 794f51f4..53a2c846 100644 --- a/harvester/load.py +++ b/harvester/load.py @@ -3,6 +3,8 @@ import ckanapi +from harvester.utils.util import sort_dataset + logger = logging.getLogger("harvester") @@ -21,7 +23,7 @@ def create_ckan_extra_base(*args): return [{"key": d[0], "value": d[1]} for d in data] -def create_ckan_extras_additions(dcatus_catalog, additions): +def create_ckan_extras_additions(dcatus_dataset, additions): extras = [ "accessLevel", "bureauCode", @@ -35,10 +37,13 @@ def 
create_ckan_extras_additions(dcatus_catalog, additions): for extra in extras: data = {"key": extra, "value": None} + val = dcatus_dataset[extra] if extra == "publisher": - data["value"] = dcatus_catalog[extra]["name"] + data["value"] = val["name"] else: - data["value"] = dcatus_catalog[extra] + if isinstance(val, list): # TODO: confirm this is what we want. + val = val[0] + data["value"] = val output.append(data) return output + additions @@ -70,21 +75,28 @@ def get_email_from_str(in_str): return res.group(0) -def create_ckan_resources(dists): +def create_ckan_resources(dcatus_dataset): output = [] - for dist in dists: + if "distribution" not in dcatus_dataset: + return output + + for dist in dcatus_dataset["distribution"]: url_key = "downloadURL" if "downloadURL" in dist else "accessURL" - resource = {"url": dist[url_key], "mimetype": dist["mediaType"]} + resource = {"url": dist[url_key]} + if "mimetype" in dist: + resource["mimetype"] = dist["mediaType"] + output.append(resource) return output -def simple_transform(dcatus_catalog): +def simple_transform(dcatus_dataset): output = { - "name": "-".join(dcatus_catalog["title"].lower().split()), - "owner_org": "test", + "name": "-".join(dcatus_dataset["title"].lower().split()), + "owner_org": "test", # TODO: CHANGE THIS! 
+ "identifier": dcatus_dataset["identifier"], } mapping = { @@ -93,14 +105,17 @@ def simple_transform(dcatus_catalog): "title": "title", } - for k, v in dcatus_catalog.items(): + for k, v in dcatus_dataset.items(): if k not in mapping: continue if isinstance(mapping[k], dict): temp = {} + to_skip = ["@type"] for k2, v2 in v.items(): if k2 == "hasEmail": v2 = get_email_from_str(v2) + if k2 in to_skip: + continue temp[mapping[k][k2]] = v2 output = {**output, **temp} else: @@ -116,7 +131,7 @@ def create_defaults(): } -def dcatus_to_ckan(dcatus_catalog): +def dcatus_to_ckan(dcatus_dataset, harvest_source_name): """ example: - from this: @@ -126,23 +141,34 @@ def dcatus_to_ckan(dcatus_catalog): """ - output = simple_transform(dcatus_catalog) + output = simple_transform(dcatus_dataset) - resources = create_ckan_resources(dcatus_catalog["distribution"]) - tags = create_ckan_tags(dcatus_catalog["keyword"]) - pubisher_hierarchy = create_ckan_publisher_hierarchy(dcatus_catalog["publisher"]) + resources = create_ckan_resources(dcatus_dataset) + tags = create_ckan_tags(dcatus_dataset["keyword"]) + pubisher_hierarchy = create_ckan_publisher_hierarchy( + dcatus_dataset["publisher"], [] + ) extras_base = create_ckan_extra_base( - pubisher_hierarchy, "Dataset", dcatus_catalog["publisher"]["name"] + pubisher_hierarchy, "Dataset", dcatus_dataset["publisher"]["name"] ) - extras = create_ckan_extras_additions(dcatus_catalog, extras_base) + extras = create_ckan_extras_additions(dcatus_dataset, extras_base) defaults = create_defaults() output["resources"] = resources output["tags"] = tags + output["extras"] = extras_base output["extras"] += extras + output["extras"] += [ + { + "key": "dcat_metadata", + "value": str(sort_dataset(dcatus_dataset)), + } + ] + + output["extras"] += [{"key": "harvest_source_name", "value": harvest_source_name}] return {**output, **defaults} @@ -167,3 +193,7 @@ def update_ckan_package(ckan, update_data): def purge_ckan_package(ckan, package_data): return 
ckan.action.dataset_purge(**package_data) + + +def search_ckan(ckan, query): + return ckan.action.package_search(**query) diff --git a/harvester/utils/__init__.py b/harvester/utils/__init__.py index 5e7c7ea5..7044a5b5 100644 --- a/harvester/utils/__init__.py +++ b/harvester/utils/__init__.py @@ -1,3 +1,3 @@ -from . import json +from . import json, util -__all__ = ["json"] +__all__ = ["json", "util"] diff --git a/harvester/utils/util.py b/harvester/utils/util.py new file mode 100644 index 00000000..81174a6c --- /dev/null +++ b/harvester/utils/util.py @@ -0,0 +1,12 @@ +import hashlib +import json + +import sansjson + + +def sort_dataset(d): + return sansjson.sort_pyobject(d) + + +def dataset_to_hash(d): + return hashlib.sha256(json.dumps(d, sort_keys=True).encode("utf-8")).hexdigest() diff --git a/poetry.lock b/poetry.lock index 70e127a8..437dbbcd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. 
[[package]] name = "attrs" @@ -622,6 +622,17 @@ files = [ {file = "ruff-0.0.261.tar.gz", hash = "sha256:c1c715b0d1e18f9c509d7c411ca61da3543a4aa459325b1b1e52b8301d65c6d2"}, ] +[[package]] +name = "sansjson" +version = "0.3.0" +description = "Your friendly neighborhood JSON sorter helper" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sansjson-0.3.0-py3-none-any.whl", hash = "sha256:d7acfc6fdbe1a5cb9ccff21ae114ba8c8d3f081e6884a282d014a5ed5af28958"}, + {file = "sansjson-0.3.0.tar.gz", hash = "sha256:d0dbaf53a2b412e474c58e9097819020aec2c572fb973539f10590e322d2dfd7"}, +] + [[package]] name = "setuptools" version = "69.0.2" @@ -808,4 +819,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = ">=3.10" -content-hash = "05bad374bbff6faf2eb4b3899d561c2e84516799c192ceb398d3ccde90edc41c" +content-hash = "a39784d88a5c1d6d7cf4f9e4b7f0ee98289ccedacc7b9f4e0d4daf72d54a0348" diff --git a/pyproject.toml b/pyproject.toml index 6ca5445b..05eb58f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "datagov-harvesting-logic" -version = "0.0.4" +version = "0.1.0" description = "" # authors = [ # {name = "Jin Sun", email = "jin.sun@gsa.gov"}, @@ -25,6 +25,7 @@ deepdiff = ">=6" pytest = ">=7.3.2" ckanapi = ">=4.7" beautifulsoup4 = "^4.12.2" +sansjson = "^0.3.0" [tool.poetry.group.dev.dependencies] pytest = "^7.3.0" diff --git a/tests/harvest-sources/dcatus/ckan_datasets_resp.json b/tests/harvest-sources/dcatus/ckan_datasets_resp.json new file mode 100644 index 00000000..36138998 --- /dev/null +++ b/tests/harvest-sources/dcatus/ckan_datasets_resp.json @@ -0,0 +1,775 @@ +{ + "help": "https://catalog-dev.data.gov/api/3/action/help_show?name=package_search", + "success": true, + "result": { + "count": 7, + "facets": {}, + "results": [ + { + "author": null, + "author_email": null, + "creator_user_id": "fe35904a-4956-478b-8d2b-3f25e6217600", + "id": "d82bf353-5ed0-4a95-a643-ce15aafefd56", + "isopen": false, + 
"license_id": null, + "license_title": null, + "maintainer": "Harold W. Hild", + "maintainer_email": "hhild@CFTC.GOV", + "metadata_created": "2024-01-03T17:45:28.179200", + "metadata_modified": "2024-01-03T17:45:28.179206", + "name": "commitment-of-traders", + "notes": "COT reports provide a breakdown of each Tuesday's open interest for futures and options on futures market in which 20 or more traders hold positions equal to or above the reporting levels established by CFTC", + "num_resources": 1, + "num_tags": 3, + "organization": { + "id": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "name": "test", + "title": "test", + "type": "organization", + "description": ".", + "image_url": "", + "created": "2023-04-14T13:44:38.462827", + "is_organization": true, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "private": false, + "state": "active", + "title": "Commitment of Traders", + "type": "dataset", + "url": null, + "version": null, + "extras": [ + { + "key": "publisher_hierarchy", + "value": "U.S. Government > U.S. Commodity Futures Trading Commission" + }, + { "key": "resource-type", "value": "Dataset" }, + { + "key": "publisher", + "value": "U.S. Commodity Futures Trading Commission" + }, + { "key": "accessLevel", "value": "public" }, + { "key": "bureauCode", "value": "339:00" }, + { "key": "identifier", "value": "cftc-dc1" }, + { "key": "modified", "value": "R/P1W" }, + { "key": "programCode", "value": "000:000" }, + { + "key": "dcat_metadata", + "value": "{'accessLevel': 'public', 'bureauCode': ['339:00'], 'contactPoint': {'fn': 'Harold W. 
Hild', 'hasEmail': 'mailto:hhild@CFTC.GOV'}, 'describedBy': 'https://www.cftc.gov/MarketReports/CommitmentsofTraders/ExplanatoryNotes/index.htm', 'description': \"COT reports provide a breakdown of each Tuesday's open interest for futures and options on futures market in which 20 or more traders hold positions equal to or above the reporting levels established by CFTC\", 'distribution': [{'accessURL': 'https://www.cftc.gov/MarketReports/CommitmentsofTraders/index.htm'}], 'identifier': 'cftc-dc1', 'keyword': ['commitment of traders', 'cot', 'open interest'], 'modified': 'R/P1W', 'programCode': ['000:000'], 'publisher': {'name': 'U.S. Commodity Futures Trading Commission', 'subOrganizationOf': {'name': 'U.S. Government'}}, 'title': 'Commitment of Traders'}" + }, + { + "key": "harvest_source_name", + "value": "test_harvest_source_name" + } + ], + "resources": [ + { + "cache_last_updated": null, + "cache_url": null, + "created": "2024-01-03T17:45:28.183169", + "description": "index.htm", + "format": "HTML", + "hash": "", + "id": "1aaa75c4-08ad-44a8-857c-fda511654832", + "last_modified": null, + "metadata_modified": "2024-01-03T17:45:28.168338", + "mimetype": null, + "mimetype_inner": null, + "name": "Web Page", + "no_real_name": true, + "package_id": "d82bf353-5ed0-4a95-a643-ce15aafefd56", + "position": 0, + "resource_type": null, + "size": null, + "state": "active", + "url": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/index.htm", + "url_type": null + } + ], + "tags": [ + { + "display_name": "commitment-of-traders", + "id": "a2879a8a-4a5f-4ecb-adf9-ffa79c19fa03", + "name": "commitment-of-traders", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "cot", + "id": "a728a985-ddb7-48be-b32f-7cda4a693b29", + "name": "cot", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "open-interest", + "id": "2ecd13e3-8c4d-4bd8-9df6-8ac3db2ca230", + "name": "open-interest", + "state": "active", + "vocabulary_id": null + } + ], + 
"groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + }, + { + "author": null, + "author_email": null, + "creator_user_id": "fe35904a-4956-478b-8d2b-3f25e6217600", + "id": "37452477-7c82-4fac-8363-0062435ffdbd", + "isopen": false, + "license_id": null, + "license_title": null, + "maintainer": "Harold W. Hild", + "maintainer_email": "hhild@CFTC.GOV", + "metadata_created": "2024-01-03T17:45:32.878623", + "metadata_modified": "2024-01-03T17:45:32.878629", + "name": "bank-participation-reports", + "notes": "The Bank Participation Report (BPR), developed by the Division of Market Oversight to provide the U.S. banking authorities and the Bank for International Settlements (BIS, located in Basel, Switzerland) aggregate large-trader positions of banks participating in various financial and non-financial commodity futures.", + "num_resources": 1, + "num_tags": 3, + "organization": { + "id": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "name": "test", + "title": "test", + "type": "organization", + "description": ".", + "image_url": "", + "created": "2023-04-14T13:44:38.462827", + "is_organization": true, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "private": false, + "state": "active", + "title": "Bank Participation Reports", + "type": "dataset", + "url": null, + "version": null, + "extras": [ + { + "key": "publisher_hierarchy", + "value": "U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission" + }, + { "key": "resource-type", "value": "Dataset" }, + { + "key": "publisher", + "value": "U.S. 
Commodity Futures Trading Commission" + }, + { "key": "accessLevel", "value": "public" }, + { "key": "bureauCode", "value": "339:00" }, + { "key": "identifier", "value": "cftc-dc2" }, + { "key": "modified", "value": "R/P1M" }, + { "key": "programCode", "value": "000:000" }, + { + "key": "dcat_metadata", + "value": "{'accessLevel': 'public', 'bureauCode': ['339:00'], 'contactPoint': {'fn': 'Harold W. Hild', 'hasEmail': 'mailto:hhild@CFTC.GOV'}, 'describedBy': 'https://www.cftc.gov/MarketReports/BankParticipationReports/ExplanatoryNotes/index.htm', 'description': 'The Bank Participation Report (BPR), developed by the Division of Market Oversight to provide the U.S. banking authorities and the Bank for International Settlements (BIS, located in Basel, Switzerland) aggregate large-trader positions of banks participating in various financial and non-financial commodity futures.', 'distribution': [{'accessURL': 'https://www.cftc.gov/MarketReports/BankParticipationReports/index.htm'}], 'identifier': 'cftc-dc2', 'keyword': ['bank participation report', 'banking', 'bpr'], 'modified': 'R/P1M', 'programCode': ['000:000'], 'publisher': {'name': 'U.S. Commodity Futures Trading Commission', 'subOrganizationOf': {'name': 'U.S. 
Government'}}, 'title': 'Bank Participation Reports'}" + }, + { + "key": "harvest_source_name", + "value": "test_harvest_source_name" + } + ], + "resources": [ + { + "cache_last_updated": null, + "cache_url": null, + "created": "2024-01-03T17:45:32.880277", + "description": "index.htm", + "format": "HTML", + "hash": "", + "id": "d6d227f5-973c-47a8-8e2a-00a9c1846d1a", + "last_modified": null, + "metadata_modified": "2024-01-03T17:45:32.870910", + "mimetype": null, + "mimetype_inner": null, + "name": "Web Page", + "no_real_name": true, + "package_id": "37452477-7c82-4fac-8363-0062435ffdbd", + "position": 0, + "resource_type": null, + "size": null, + "state": "active", + "url": "https://www.cftc.gov/MarketReports/BankParticipationReports/index.htm", + "url_type": null + } + ], + "tags": [ + { + "display_name": "bank-participation-report", + "id": "8f13c9c2-58b6-458f-bdbf-ead4f48db9d4", + "name": "bank-participation-report", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "banking", + "id": "e80c3c36-f71f-4a5d-a141-4460d6739656", + "name": "banking", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "bpr", + "id": "1ce56b61-dcb2-4d4f-a4f3-2f9383a53470", + "name": "bpr", + "state": "active", + "vocabulary_id": null + } + ], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + }, + { + "author": null, + "author_email": null, + "creator_user_id": "fe35904a-4956-478b-8d2b-3f25e6217600", + "id": "116078aa-94f0-4965-b0d7-95520d74aa38", + "isopen": false, + "license_id": null, + "license_title": null, + "maintainer": "Harold W. 
Hild", + "maintainer_email": "hhild@CFTC.gov", + "metadata_created": "2024-01-03T17:45:35.968805", + "metadata_modified": "2024-01-03T17:45:35.968810", + "name": "cotton-on-call", + "notes": "Cotton On-Call Report shows the quantity of call cotton bought or sold on which the price has not been fixed, together with the respective futures on which the purchase or sale is based on.", + "num_resources": 1, + "num_tags": 1, + "organization": { + "id": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "name": "test", + "title": "test", + "type": "organization", + "description": ".", + "image_url": "", + "created": "2023-04-14T13:44:38.462827", + "is_organization": true, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "private": false, + "state": "active", + "title": "Cotton On Call", + "type": "dataset", + "url": null, + "version": null, + "extras": [ + { + "key": "publisher_hierarchy", + "value": "U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission" + }, + { "key": "resource-type", "value": "Dataset" }, + { + "key": "publisher", + "value": "U.S. Commodity Futures Trading Commission" + }, + { "key": "accessLevel", "value": "public" }, + { "key": "bureauCode", "value": "339:00" }, + { "key": "identifier", "value": "cftc-dc3" }, + { "key": "modified", "value": "R/P1W" }, + { "key": "programCode", "value": "000:000" }, + { + "key": "dcat_metadata", + "value": "{'accessLevel': 'public', 'bureauCode': ['339:00'], 'contactPoint': {'fn': 'Harold W. 
Hild', 'hasEmail': 'mailto:hhild@CFTC.gov'}, 'describedBy': 'https://www.cftc.gov/MarketReports/CottonOnCall/index.htm', 'description': 'Cotton On-Call Report shows the quantity of call cotton bought or sold on which the price has not been fixed, together with the respective futures on which the purchase or sale is based on.', 'distribution': [{'accessURL': 'https://www.cftc.gov/MarketReports/CottonOnCall/index.htm'}], 'identifier': 'cftc-dc3', 'keyword': ['cotton on call', 'cotton on-call'], 'modified': 'R/P1W', 'programCode': ['000:000'], 'publisher': {'name': 'U.S. Commodity Futures Trading Commission', 'subOrganizationOf': {'name': 'U.S. Government'}}, 'title': 'Cotton On Call'}" + }, + { + "key": "harvest_source_name", + "value": "test_harvest_source_name" + } + ], + "resources": [ + { + "cache_last_updated": null, + "cache_url": null, + "created": "2024-01-03T17:45:35.970377", + "description": "index.htm", + "format": "HTML", + "hash": "", + "id": "0fe824a4-43e9-4459-a471-9993f5dc7443", + "last_modified": null, + "metadata_modified": "2024-01-03T17:45:35.964304", + "mimetype": null, + "mimetype_inner": null, + "name": "Web Page", + "no_real_name": true, + "package_id": "116078aa-94f0-4965-b0d7-95520d74aa38", + "position": 0, + "resource_type": null, + "size": null, + "state": "active", + "url": "https://www.cftc.gov/MarketReports/CottonOnCall/index.htm", + "url_type": null + } + ], + "tags": [ + { + "display_name": "cotton-on-call", + "id": "12298da4-748e-4ab2-af27-221145fb517e", + "name": "cotton-on-call", + "state": "active", + "vocabulary_id": null + } + ], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + }, + { + "author": null, + "author_email": null, + "creator_user_id": "fe35904a-4956-478b-8d2b-3f25e6217600", + "id": "fa1c3904-373f-4573-aaad-41b8e1de1143", + "isopen": false, + "license_id": null, + "license_title": null, + "maintainer": "Carrie L Coffin", + "maintainer_email": "ccoffin@CFTC.gov", + "metadata_created": 
"2024-01-03T17:45:39.315356", + "metadata_modified": "2024-01-03T17:45:39.315362", + "name": "financial-data-for-fcms", + "notes": "Futures commission merchants (FCMs) and retail foreign exchange dealers (RFEDs) must file monthly financial reports with the CFTC's Market Participants Division (MPD) within 17 business days after the end of the month. Selected financial information from these reports is published.", + "num_resources": 1, + "num_tags": 6, + "organization": { + "id": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "name": "test", + "title": "test", + "type": "organization", + "description": ".", + "image_url": "", + "created": "2023-04-14T13:44:38.462827", + "is_organization": true, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "private": false, + "state": "active", + "title": "Financial Data for FCMS", + "type": "dataset", + "url": null, + "version": null, + "extras": [ + { + "key": "publisher_hierarchy", + "value": "U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission" + }, + { "key": "resource-type", "value": "Dataset" }, + { + "key": "publisher", + "value": "U.S. 
Commodity Futures Trading Commission" + }, + { "key": "accessLevel", "value": "public" }, + { "key": "bureauCode", "value": "339:00" }, + { "key": "identifier", "value": "cftc-dc4" }, + { "key": "modified", "value": "R/P1M" }, + { "key": "programCode", "value": "000:000" }, + { + "key": "dcat_metadata", + "value": "{'accessLevel': 'public', 'bureauCode': ['339:00'], 'contactPoint': {'fn': 'Carrie L Coffin', 'hasEmail': 'mailto:ccoffin@CFTC.gov'}, 'describedBy': 'https://www.cftc.gov/MarketReports/financialfcmdata/DescriptionofReportDataFields/indesx.htm', 'description': \"Futures commission merchants (FCMs) and retail foreign exchange dealers (RFEDs) must file monthly financial reports with the CFTC's Market Participants Division (MPD) within 17 business days after the end of the month. Selected financial information from these reports is published.\", 'distribution': [{'accessURL': 'https://www.cftc.gov/MarketReports/financialfcmdata/index.htm'}], 'identifier': 'cftc-dc4', 'keyword': ['dealer', 'fcm', 'futures commission merchant', 'market participants', 'mpd', 'retail foreign exchange'], 'modified': 'R/P1M', 'programCode': ['000:000'], 'publisher': {'name': 'U.S. Commodity Futures Trading Commission', 'subOrganizationOf': {'name': 'U.S. 
Government'}}, 'title': 'Financial Data for FCMS'}" + }, + { + "key": "harvest_source_name", + "value": "test_harvest_source_name" + } + ], + "resources": [ + { + "cache_last_updated": null, + "cache_url": null, + "created": "2024-01-03T17:45:39.317107", + "description": "index.htm", + "format": "HTML", + "hash": "", + "id": "64f8fbab-8ad1-48f0-adb6-9d8d09ade7e8", + "last_modified": null, + "metadata_modified": "2024-01-03T17:45:39.302853", + "mimetype": null, + "mimetype_inner": null, + "name": "Web Page", + "no_real_name": true, + "package_id": "fa1c3904-373f-4573-aaad-41b8e1de1143", + "position": 0, + "resource_type": null, + "size": null, + "state": "active", + "url": "https://www.cftc.gov/MarketReports/financialfcmdata/index.htm", + "url_type": null + } + ], + "tags": [ + { + "display_name": "dealer", + "id": "d50fff3b-f83e-40f2-a373-ea2d356de9cb", + "name": "dealer", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "fcm", + "id": "fbce5bc9-3400-46ae-a67c-5f74bf54b55a", + "name": "fcm", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "futures-commission-merchant", + "id": "4d24f60d-5eb9-40e9-adcb-0355aa981521", + "name": "futures-commission-merchant", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "market-participants", + "id": "25411659-e740-4ecf-befa-7360bda6d158", + "name": "market-participants", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "mpd", + "id": "56cb8509-dacf-474e-bfc5-02e160edc2c6", + "name": "mpd", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "retail-foreign-exchange", + "id": "3a1c2a0f-63f6-4871-9309-aad1fb088aa5", + "name": "retail-foreign-exchange", + "state": "active", + "vocabulary_id": null + } + ], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + }, + { + "author": null, + "author_email": null, + "creator_user_id": "fe35904a-4956-478b-8d2b-3f25e6217600", + "id": 
"2cbbdabf-1a45-476a-9cf8-168a39d8225f", + "isopen": false, + "license_id": null, + "license_title": null, + "maintainer": "Byung-IL Seo", + "maintainer_email": "BSeo@CFTC.gov", + "metadata_created": "2024-01-03T17:45:42.689760", + "metadata_modified": "2024-01-03T17:45:42.689767", + "name": "net-positions-changes-data", + "notes": "The Large Trader Net Position Changes and the Trading Account Net Position Changes data provides the public with a view of the amount of trading that results in net changes to positions at the trader level and at the account level. The data reflects trading that changes or creates an end-of-day position, as contrasted with trading that does not change a trader\u2019s end-of-day net position, such as spread or day trading.", + "num_resources": 1, + "num_tags": 3, + "organization": { + "id": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "name": "test", + "title": "test", + "type": "organization", + "description": ".", + "image_url": "", + "created": "2023-04-14T13:44:38.462827", + "is_organization": true, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "private": false, + "state": "active", + "title": "Net Positions Changes Data", + "type": "dataset", + "url": null, + "version": null, + "extras": [ + { + "key": "publisher_hierarchy", + "value": "U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission" + }, + { "key": "resource-type", "value": "Dataset" }, + { + "key": "publisher", + "value": "U.S. 
Commodity Futures Trading Commission" + }, + { "key": "accessLevel", "value": "public" }, + { "key": "bureauCode", "value": "339:00" }, + { "key": "identifier", "value": "cftc-dc5" }, + { "key": "modified", "value": "2011-06-30" }, + { "key": "programCode", "value": "000:000" }, + { + "key": "dcat_metadata", + "value": "{'accessLevel': 'public', 'bureauCode': ['339:00'], 'contactPoint': {'fn': 'Byung-IL Seo', 'hasEmail': 'mailto:BSeo@CFTC.gov'}, 'describedBy': 'https://www.cftc.gov/MarketReports/LgTraderExplanatory.html', 'description': 'The Large Trader Net Position Changes and the Trading Account Net Position Changes data provides the public with a view of the amount of trading that results in net changes to positions at the trader level and at the account level. The data reflects trading that changes or creates an end-of-day position, as contrasted with trading that does not change a trader\u2019s end-of-day net position, such as spread or day trading.', 'distribution': [{'accessURL': 'https://www.cftc.gov/MarketReports/NetPositionChangesData/index.htm'}], 'identifier': 'cftc-dc5', 'keyword': ['larger trader net position', 'net positions', 'trading account net positions'], 'modified': '2011-06-30', 'programCode': ['000:000'], 'publisher': {'name': 'U.S. Commodity Futures Trading Commission', 'subOrganizationOf': {'name': 'U.S. 
Government'}}, 'title': 'Net Positions Changes Data'}" + }, + { + "key": "harvest_source_name", + "value": "test_harvest_source_name" + } + ], + "resources": [ + { + "cache_last_updated": null, + "cache_url": null, + "created": "2024-01-03T17:45:42.691834", + "description": "index.htm", + "format": "HTML", + "hash": "", + "id": "ad9d9510-37a1-4a48-9f17-9b00ce488323", + "last_modified": null, + "metadata_modified": "2024-01-03T17:45:42.682258", + "mimetype": null, + "mimetype_inner": null, + "name": "Web Page", + "no_real_name": true, + "package_id": "2cbbdabf-1a45-476a-9cf8-168a39d8225f", + "position": 0, + "resource_type": null, + "size": null, + "state": "active", + "url": "https://www.cftc.gov/MarketReports/NetPositionChangesData/index.htm", + "url_type": null + } + ], + "tags": [ + { + "display_name": "larger-trader-net-position", + "id": "0c18826b-244e-4093-9513-d1febab5372d", + "name": "larger-trader-net-position", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "net-positions", + "id": "87b349d9-b4aa-4601-8239-1fba47f50191", + "name": "net-positions", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "trading-account-net-positions", + "id": "c42822f4-78ec-4791-aaad-08de83dfd9ac", + "name": "trading-account-net-positions", + "state": "active", + "vocabulary_id": null + } + ], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + }, + { + "author": null, + "author_email": null, + "creator_user_id": "fe35904a-4956-478b-8d2b-3f25e6217600", + "id": "44954eb7-076e-45e6-b8cc-cf9119e04f6d", + "isopen": false, + "license_id": null, + "license_title": null, + "maintainer": "Richard Haynes", + "maintainer_email": "RHaynes@CFTC.gov", + "metadata_created": "2024-01-03T17:45:46.219897", + "metadata_modified": "2024-01-03T17:45:46.219902", + "name": "weekly-swaps-report", + "notes": "The CFTC Swaps Report aggregates a comprehensive body of swap market data that was not previously reported to 
regulators or regulated entities, and makes that information freely available in a form that is readily usable by both market participants and the general public. The swaps market data included in publications produced by entities such as the BIS, ISDA, and the Office of the Comptroller of the Currency vary in scope and granularity, but none corresponds directly to the data stored in the CFTC's SDRs.", + "num_resources": 1, + "num_tags": 2, + "organization": { + "id": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "name": "test", + "title": "test", + "type": "organization", + "description": ".", + "image_url": "", + "created": "2023-04-14T13:44:38.462827", + "is_organization": true, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "private": false, + "state": "active", + "title": "Weekly Swaps Report", + "type": "dataset", + "url": null, + "version": null, + "extras": [ + { + "key": "publisher_hierarchy", + "value": "U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission" + }, + { "key": "resource-type", "value": "Dataset" }, + { + "key": "publisher", + "value": "U.S. 
Commodity Futures Trading Commission" + }, + { "key": "accessLevel", "value": "public" }, + { "key": "bureauCode", "value": "339:00" }, + { "key": "identifier", "value": "cftc-dc6" }, + { "key": "modified", "value": "R/P1W" }, + { "key": "programCode", "value": "000:000" }, + { + "key": "dcat_metadata", + "value": "{'accessLevel': 'public', 'bureauCode': ['339:00'], 'contactPoint': {'fn': 'Richard Haynes', 'hasEmail': 'mailto:RHaynes@CFTC.gov'}, 'describedBy': 'https://www.cftc.gov/MarketReports/SwapsReports/DataDictionary/index.htm', 'description': \"The CFTC Swaps Report aggregates a comprehensive body of swap market data that was not previously reported to regulators or regulated entities, and makes that information freely available in a form that is readily usable by both market participants and the general public. The swaps market data included in publications produced by entities such as the BIS, ISDA, and the Office of the Comptroller of the Currency vary in scope and granularity, but none corresponds directly to the data stored in the CFTC's SDRs.\", 'distribution': [{'accessURL': 'https://www.cftc.gov/MarketReports/SwapsReports/index.htm'}], 'identifier': 'cftc-dc6', 'keyword': ['swaps market', 'swaps report'], 'modified': 'R/P1W', 'programCode': ['000:000'], 'publisher': {'name': 'U.S. Commodity Futures Trading Commission', 'subOrganizationOf': {'name': 'U.S. 
Government'}}, 'title': 'Weekly Swaps Report'}" + }, + { + "key": "harvest_source_name", + "value": "test_harvest_source_name" + } + ], + "resources": [ + { + "cache_last_updated": null, + "cache_url": null, + "created": "2024-01-03T17:45:46.221358", + "description": "index.htm", + "format": "HTML", + "hash": "", + "id": "8d2a2b14-157c-4e5a-b274-365610d4176d", + "last_modified": null, + "metadata_modified": "2024-01-03T17:45:46.214194", + "mimetype": null, + "mimetype_inner": null, + "name": "Web Page", + "no_real_name": true, + "package_id": "44954eb7-076e-45e6-b8cc-cf9119e04f6d", + "position": 0, + "resource_type": null, + "size": null, + "state": "active", + "url": "https://www.cftc.gov/MarketReports/SwapsReports/index.htm", + "url_type": null + } + ], + "tags": [ + { + "display_name": "swaps-market", + "id": "23d2277b-c098-4b34-a5db-f3670863795e", + "name": "swaps-market", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "swaps-report", + "id": "f1f77e07-57f0-4e77-a931-e6822a46ce72", + "name": "swaps-report", + "state": "active", + "vocabulary_id": null + } + ], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + }, + { + "author": null, + "author_email": null, + "creator_user_id": "fe35904a-4956-478b-8d2b-3f25e6217600", + "id": "73912fa4-062c-4d0e-899c-3f0924c17552", + "isopen": false, + "license_id": null, + "license_title": null, + "maintainer": "Byung-IL Seo", + "maintainer_email": "BSeo@CFTC.gov", + "metadata_created": "2024-01-03T17:45:50.286869", + "metadata_modified": "2024-01-03T17:45:50.286874", + "name": "cleared-margin-reports", + "notes": "Derivatives clearing organizations (DCOs) are required to file daily reports on initial margin with the CFTC's Division of Clearing and Risk (DCR). Aggregate initial margin summary information for Chicago Mercantile Exchange (CME), ICE Clear Credit (ICC), ICE Clear US (ICUS), ICE Clear Europe (ICEU), LCH Ltd., and LCH SA is published below. 
The information will generally be updated within ten business days of the end of each month.", + "num_resources": 1, + "num_tags": 2, + "organization": { + "id": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "name": "test", + "title": "test", + "type": "organization", + "description": ".", + "image_url": "", + "created": "2023-04-14T13:44:38.462827", + "is_organization": true, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "1c7ba64d-27d4-458c-bd6a-5a6dcbed5d81", + "private": false, + "state": "active", + "title": "Cleared Margin Reports", + "type": "dataset", + "url": null, + "version": null, + "extras": [ + { + "key": "publisher_hierarchy", + "value": "U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission > U.S. Government > U.S. Commodity Futures Trading Commission" + }, + { "key": "resource-type", "value": "Dataset" }, + { + "key": "publisher", + "value": "U.S. Commodity Futures Trading Commission" + }, + { "key": "accessLevel", "value": "public" }, + { "key": "bureauCode", "value": "339:00" }, + { "key": "identifier", "value": "cftc-dc7" }, + { "key": "modified", "value": "R/P1M" }, + { "key": "programCode", "value": "000:000" }, + { + "key": "dcat_metadata", + "value": "{'accessLevel': 'public', 'bureauCode': ['339:00'], 'contactPoint': {'fn': 'Byung-IL Seo', 'hasEmail': 'mailto:BSeo@CFTC.gov'}, 'describedBy': 'https://www.cftc.gov/MarketReports/ClearedMarginReports/index.htm', 'description': \"Derivatives clearing organizations (DCOs) are required to file daily reports on initial margin with the CFTC's Division of Clearing and Risk (DCR). 
Aggregate initial margin summary information for Chicago Mercantile Exchange (CME), ICE Clear Credit (ICC), ICE Clear US (ICUS), ICE Clear Europe (ICEU), LCH Ltd., and LCH SA is published below. The information will generally be updated within ten business days of the end of each month.\", 'distribution': [{'accessURL': 'https://www.cftc.gov/MarketReports/ClearedMarginReports/index.htm'}], 'identifier': 'cftc-dc7', 'keyword': ['exchange', 'margin'], 'modified': 'R/P1M', 'programCode': ['000:000'], 'publisher': {'name': 'U.S. Commodity Futures Trading Commission', 'subOrganizationOf': {'name': 'U.S. Government'}}, 'title': 'Cleared Margin Reports'}" + }, + { + "key": "harvest_source_name", + "value": "test_harvest_source_name" + } + ], + "resources": [ + { + "cache_last_updated": null, + "cache_url": null, + "created": "2024-01-03T17:45:50.288362", + "description": "index.htm", + "format": "HTML", + "hash": "", + "id": "30dfcaf9-233e-412b-9f48-3c2552cce409", + "last_modified": null, + "metadata_modified": "2024-01-03T17:45:50.280896", + "mimetype": null, + "mimetype_inner": null, + "name": "Web Page", + "no_real_name": true, + "package_id": "73912fa4-062c-4d0e-899c-3f0924c17552", + "position": 0, + "resource_type": null, + "size": null, + "state": "active", + "url": "https://www.cftc.gov/MarketReports/ClearedMarginReports/index.htm", + "url_type": null + } + ], + "tags": [ + { + "display_name": "exchange", + "id": "bc369889-4ee2-4b6f-ac3e-dad1edd43e7b", + "name": "exchange", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "margin", + "id": "2b36390c-df68-48bc-9219-f3d215363e10", + "name": "margin", + "state": "active", + "vocabulary_id": null + } + ], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } + ], + "sort": "views_recent desc", + "search_facets": {} + } +} diff --git a/tests/harvest-sources/dcatus/dcatus.json b/tests/harvest-sources/dcatus/dcatus.json index 1a52086b..01f89791 100644 --- 
a/tests/harvest-sources/dcatus/dcatus.json +++ b/tests/harvest-sources/dcatus/dcatus.json @@ -1,301 +1,202 @@ { - "conformsTo": "https://project-open-data.cio.gov/v1.1/schema", - "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json", "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld", + "@id": "http://www.cftc.gov/data.json", "@type": "dcat:Catalog", + "conformsTo": "https://project-open-data.cio.gov/v1.1/schema", + "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json", "dataset": [ { - "@type": "dcat:Dataset", - "title": "2015 GSA Common Baseline Implementation Plan and CIO Assignment Plan", - "description": "This is GSA's 2015 Common Baseline Implementation Plan and its CIO Assignment Plan per the requirements set forth in FITARA legislation.", - "modified": "2017-05-15", - "accessLevel": "public", - "identifier": "GSA-2016-01-22-01", - "dataQuality": true, - "license": "https://creativecommons.org/publicdomain/zero/1.0/", - "publisher": { - "@type": "org:Organization", - "name": "General Services Administration" - }, - "accrualPeriodicity": "R/P1Y", "contactPoint": { - "@type": "vcard:Contact", - "fn": "Mick Harris", - "hasEmail": "mailto:michael.harris@gsa.gov" + "fn": "Harold W. Hild", + "hasEmail": "mailto:hhild@CFTC.GOV" }, + "describedBy": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/ExplanatoryNotes/index.htm", + "description": "COT reports provide a breakdown of each Tuesday's open interest for futures and options on futures market in which 20 or more traders hold positions equal to or above the reporting levels established by CFTC", "distribution": [ { - "@type": "dcat:Distribution", - "mediaType": "application/pdf", - "format": "pdf", - "title": "2015 GSA Common Baseline Implementation Plan and CIO Assignment Plan", - "description": "This is GSA's 2015 Common Baseline Implementation Plan and its CIO Assignment Plan per the requirements set forth in FITARA legislation. 
Updated April 2017. Last Major Change to version updated on March 4, 2019. Last Major change to version update don 8/5/2020.", - "downloadURL": "https://inventory.data.gov/dataset/64c56cec-4b8f-44c7-ba69-090517f9f32e/resource/87e53999-aff1-4560-8bf0-42d9dc8e4a69/download/2015gsafitaraimplementationandcioassignmentplan.pdf" + "accessURL": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/index.htm" } ], - "keyword": ["Assignment Plan", "CIO", "Common Baseline", "FITARA", "GSA IT", "Implementation Plan"], - "bureauCode": ["023:00"], - "programCode": ["023:000"], - "theme": ["IT Initiatives"] - }, - { - "@type": "dcat:Dataset", - "title": "Concur - Reporting Voucher Model", - "description": "The data dictionary for the reporting voucher model within Concur.", - "modified": "2016-02-23", - "accessLevel": "non-public", - "identifier": "GSA - 139048", - "dataQuality": true, - "license": "http://www.usa.gov/publicdomain/label/1.0/", - "rights": "Trade secrets & commercial/financial info obtained from a person and privileged or confidential.", + "modified": "R/P1W", + "programCode": ["000:000"], "publisher": { - "@type": "org:Organization", - "name": "General Services Administration" - }, - "accrualPeriodicity": "R/P1Y", - "isPartOf": "GSA-2015-09-11-01", - "contactPoint": { - "@type": "vcard:Contact", - "fn": "Norma H Tolson", - "hasEmail": "mailto:norma.tolson@gsa.gov" + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. 
Government" + } }, - "keyword": ["Credit Card", "Travel Card"], - "bureauCode": ["023:00"], - "programCode": ["023:010"], - "language": ["en-us"], - "theme": ["Travel and Transportation"] + "title": "Commitment of Traders", + "accessLevel": "public", + "bureauCode": ["339:00"], + "identifier": "cftc-dc1", + "keyword": ["commitment of traders", "cot", "open interest"] }, { - "@type": "dcat:Dataset", - "title": "Concur - Reporting Travel Model", - "description": "The data dictionary for the reporting travel model within Concur.", - "modified": "2016-01-20", - "accessLevel": "non-public", - "identifier": "GSA - 139046", - "dataQuality": true, - "license": "http://www.usa.gov/publicdomain/label/1.0/", - "rights": "Trade secrets & commercial/financial info obtained from a person and privileged or confidential.", - "publisher": { - "@type": "org:Organization", - "name": "General Services Administration" - }, - "accrualPeriodicity": "R/P1Y", - "isPartOf": "GSA-2015-09-11-01", + "accessLevel": "public", + "bureauCode": ["339:00"], "contactPoint": { - "@type": "vcard:Contact", - "fn": "Norma H Tolson", - "hasEmail": "mailto:norma.tolson@gsa.gov" + "fn": "Harold W. 
Hild", + "hasEmail": "mailto:hhild@CFTC.GOV" }, - "keyword": ["Credit Card", "travel card"], - "bureauCode": ["023:00"], - "programCode": ["023:010"], - "language": ["en-us"], - "theme": ["Travel and Transportation"] - }, - { - "@type": "dcat:Dataset", - "title": "Concur Travel Parent", - "description": "This is the Parent folder for Concur datasets reporting on; Closed-Paid Vouchers in Concur Government Edition (CGE), Authorization Model, Travel Model, User Profile, and Voucher Model.", - "modified": "2016-02-23", - "accessLevel": "non-public", - "identifier": "GSA-2015-09-11-01", - "dataQuality": true, - "license": "http://www.usa.gov/publicdomain/label/1.0/", - "rights": "Trade secrets & commercial/financial info obtained from a person and privileged or confidential.", + "describedBy": "https://www.cftc.gov/MarketReports/BankParticipationReports/ExplanatoryNotes/index.htm", + "description": "The Bank Participation Report (BPR), developed by the Division of Market Oversight to provide the U.S. banking authorities and the Bank for International Settlements (BIS, located in Basel, Switzerland) aggregate large-trader positions of banks participating in various financial and non-financial commodity futures.", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/BankParticipationReports/index.htm" + } + ], + "identifier": "cftc-dc2", + "keyword": ["bank participation report", "bpr", "banking"], + "modified": "R/P1M", + "programCode": ["000:000"], "publisher": { - "@type": "org:Organization", - "name": "General Services Administration" - }, - "accrualPeriodicity": "R/P1Y", - "contactPoint": { - "@type": "vcard:Contact", - "fn": "Norma H Tolson", - "hasEmail": "mailto:norma.tolson@gsa.gov" + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. 
Government" + } }, - "keyword": ["Authorization", "Closed", "Concur", "Paid", "Travel", "Voucher"], - "bureauCode": ["023:00"], - "programCode": ["023:010"], - "language": ["en-us"], - "theme": ["Travel and Transportation"] + "title": "Bank Participation Reports" }, { - "@type": "dcat:Dataset", - "title": "Data.gov Daily Sessions", - "description": "Data.gov Daily Sessions 20120101-20151231", - "modified": "2016-08-01", "accessLevel": "public", - "identifier": "GSA - DATA.GOVMETRICS1", - "license": "https://creativecommons.org/publicdomain/zero/1.0/", - "publisher": { - "@type": "org:Organization", - "name": "General Services Administration" - }, - "isPartOf": "GSA-2015-09-14-01", + "bureauCode": ["339:00"], "contactPoint": { - "@type": "vcard:Contact", - "fn": "Hyon Joo Kim", - "hasEmail": "mailto:hyon.kim@gsa.gov" + "fn": "Harold W. Hild", + "hasEmail": "mailto:hhild@CFTC.gov" }, + "describedBy": "https://www.cftc.gov/MarketReports/CottonOnCall/index.htm", + "description": "Cotton On-Call Report shows the quantity of call cotton bought or sold on which the price has not been fixed, together with the respective futures on which the purchase or sale is based on.", "distribution": [ { - "@type": "dcat:Distribution", - "mediaType": "text/csv", - "title": "Data.gov Daily Sessions", - "description": "Data.gov Daily Sessions 20120101-20151231", - "downloadURL": "https://inventory.data.gov/dataset/e0de4198-eaaa-423a-9154-7af76ab8d822/resource/a78ae43e-0ceb-4f2d-83ac-99c61b249afa/download/analytics-www.data.gov-data.gov-daily-sessions-20120101-20151231-analytics-www.data.gov-data.g.csv" - }, - { - "@type": "dcat:Distribution", - "mediaType": "text/csv", - "title": "Data.gov Daily Sessions Day Index", - "description": "Data.gov Daily Sessions 20120101-20151231 Day Index", - "downloadURL": 
"https://inventory.data.gov/dataset/e0de4198-eaaa-423a-9154-7af76ab8d822/resource/925f6204-5240-4dd1-8604-c279843c8974/download/analytics-www.data.gov-data.gov-daily-sessions-20120101-20151231b-sheet1.csv" + "accessURL": "https://www.cftc.gov/MarketReports/CottonOnCall/index.htm" } ], - "keyword": ["Data.gov Analytics", "metrics"], - "bureauCode": ["023:00"], - "programCode": ["023:019"] - }, - { - "@type": "dcat:Dataset", - "title": "Data.gov Statistics Parent", - "description": "Various reports regarding the Data.gov sites, from Daily Visitors, to Top 10 Countries, and States.", - "modified": "2015-09-14", - "accessLevel": "public", - "identifier": "GSA-2015-09-14-01", - "dataQuality": true, - "issued": "2013-04-11", - "license": "https://creativecommons.org/publicdomain/zero/1.0/", - "spatial": "Worldwide", + "identifier": "cftc-dc3", + "keyword": ["cotton on call", "cotton on-call"], + "modified": "R/P1W", + "programCode": ["000:000"], "publisher": { - "@type": "org:Organization", - "name": "General Services Administration" - }, - "contactPoint": { - "@type": "vcard:Contact", - "fn": "Hyon Joo Kim", - "hasEmail": "mailto:hyon.kim@gsa.gov" + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. Government" + } }, - "keyword": ["Countries", "States", "Statistics", "Visitors", "data.gov"], - "bureauCode": ["023:00"], - "programCode": ["023:019"], - "language": ["en-us"], - "theme": ["Data.gov Site"] + "title": "Cotton On Call" }, { - "@type": "dcat:Dataset", "accessLevel": "public", - "bureauCode": ["010:12"], - "programCode": ["010:012"], + "bureauCode": ["339:00"], "contactPoint": { - "@type": "vcard:Contact", - "fn": "David C. Twichell", - "hasEmail": "mailto:dtwichell@usgs.gov" + "fn": "Carrie L Coffin", + "hasEmail": "mailto:ccoffin@CFTC.gov" }, - "description": "In 2010, the U.S. Geological Survey in Woods Hole, MA and St. Petersburg, FL, in partnership with the U.S. 
Army Corps of Engineers, Mobile District conducted geologic mapping to characterize the seafloor and shallow subsurface stratigraphy offshore of the Gulf Islands of Mississippi. The mapping was carried out during two cruises in March, 2010 on the R/V Tommy Munro of Biloxi, MS. Data were acquired with the following equipment: an SEA Ltd SwathPlus interferometric sonar (both 234 kHz and 468 kHz systems), a Klein 3000 and a Klein 3900 dual frequency sidescan-sonar, and an Edgetech 512i chirp subbottom profiling system. The long-term goal of this mapping effort is to produce high-quality, high-resolution geologic maps and geophysical interpretations that can be utilized to identify sand resources within the region and better understand the Holocene evolution and anticipate future changes in this coastal system. More information on the field work can be accessed from the Woods Hole Coastal and Marine Science Center Field Activity webpage https://cmgds.marine.usgs.gov/fan_info.php?fan=2010-012-FA or the St. Petersburg Coastal and Marine Geology InfoBank https://walrus.wr.usgs.gov/infobank/m/m210gm/html/m-2-10-gm.meta.html.", + "describedBy": "https://www.cftc.gov/MarketReports/financialfcmdata/DescriptionofReportDataFields/indesx.htm", + "description": "Futures commission merchants (FCMs) and retail foreign exchange dealers (RFEDs) must file monthly financial reports with the CFTC's Market Participants Division (MPD) within 17 business days after the end of the month. 
Selected financial information from these reports is published.", "distribution": [ { - "@type": "dcat:Distribution", - "conformsTo": "https://www.fgdc.gov/schemas/metadata/", - "description": "The metadata original format", - "downloadURL": "https://data.usgs.gov/datacatalog/metadata/USGS.0000a76f-c6be-4366-8be3-6f8487442e8a.xml", - "format": "XML", - "mediaType": "text/xml", - "title": "Original Metadata" - }, - { - "@type": "dcat:Distribution", - "accessURL": "https://doi.org/10.5066/P9KM5FT2", - "description": "Landing page for access to the data", - "format": "XML", - "mediaType": "application/http", - "title": "Digital Data" + "accessURL": "https://www.cftc.gov/MarketReports/financialfcmdata/index.htm" } ], - "identifier": "USGS:0000a76f-c6be-4366-8be3-6f8487442e8a", + "identifier": "cftc-dc4", "keyword": [ - "USGS:0000a76f-c6be-4366-8be3-6f8487442e8a", - "U.S. Geological Survey", - "USGS", - "Woods Hole Coastal and Marine Science Center", - "WHCMSC", - "St. Petersburg Coastal and Marine Science Center", - "Coastal and Marine Geology Program", - "CMGP", - "Global Positioning", - "Navigation", - "Hypack Hydrographic Survey Software", - "R/V Tommy Munro", - "St. Petersburg field activity serial number 10cct02", - "oceans", - "location", - "navigational data", - "marine geophysics", - "Mississippi", - "Gulf Islands", - "North Central Gulf Coast", - "United States", - "West Ship Island", - "East Ship Island", - "Horn Island", - "Dog Key Pass", - "Camille Cut", - "Petit Bois Island", - "Gulfport Ship Channel", - "Gulf of Mexico", - "Cat Island" + "fcm", + "retail foreign exchange", + "dealer", + "market participants", + "mpd", + "futures commission merchant" ], - "modified": "20200908", "publisher": { - "@type": "org:Organization", - "name": "U.S. Geological Survey", + "name": "U.S. Commodity Futures Trading Commission", "subOrganizationOf": { - "@type": "org:Organization", - "name": "Department of the Interior" + "name": "U.S. 
Government" } }, - "spatial": "-179.231086,-14.601813,+179.859681,+71.441059", - "theme": ["geospatial"], - "title": "Raw HYPACK navigation logs (text) collected by the U.S. Geological Survey - St. Petersburg Coastal and Marine Science Center offshore of the Gulf Islands, MS, 2010" + "title": "Financial Data for FCMS", + "modified": "R/P1M", + "programCode": ["000:000"] }, { - "@type": "dcat:Dataset", - "title": "ConformsTo ISO Example: TIGER/Line Shapefile, 2013, nation, U.S., Current County and Equivalent National Shapefile", - "description": "The TIGER/Line shapefiles and related database files (.dbf) are an extract of selected geographic and cartographic information from the U.S. Census Bureau's Master Address File / Topologically Integrated Geographic Encoding and Referencing (MAF/TIGER) Database (MTDB). The MTDB represents a seamless national file with no overlaps or gaps between parts, however, each TIGER/Line shapefile is designed to stand alone as an independent data set, or they can be combined to cover the entire nation. The primary legal divisions of most states are termed counties. In Louisiana, these divisions are known as parishes. In Alaska, which has no counties, the equivalent entities are the organized boroughs, city and boroughs, municipalities, and for the unorganized area, census areas. The latter are delineated cooperatively for statistical purposes by the State of Alaska and the Census Bureau. In four states (Maryland, Missouri, Nevada, and Virginia), there are one or more incorporated places that are independent of any county organization and thus constitute primary divisions of their states. These incorporated places are known as independent cities and are treated as equivalent entities for purposes of data presentation. The District of Columbia and Guam have no primary divisions, and each area is considered an equivalent entity for purposes of data presentation. 
The Census Bureau treats the following entities as equivalents of counties for purposes of data presentation: Municipios in Puerto Rico, Districts and Islands in American Samoa, Municipalities in the Commonwealth of the Northern Mariana Islands, and Islands in the U.S. Virgin Islands. The entire area of the United States, Puerto Rico, and the Island Areas is covered by counties or equivalent entities. The boundaries for counties and equivalent entities are as of January 1of the shapefile release year, primarily as reported through the Census Bureau's Boundary and Annexation Survey (BAS).", - "modified": "2013", - "bureauCode": ["006:07"], - "programCode": ["006:012"], - "keyword": ["Nation", "Polygon", "United States", "U.S."], - "theme": ["geospatial"], - "identifier": "tl_2013_us_county.shp.xml", + "identifier": "cftc-dc5", + "keyword": [ + "net positions", + "larger trader net position", + "trading account net positions" + ], "accessLevel": "public", - "spatial": "[[-14.601813, -179.231086], [71.441059, 179.859681]]", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Byung-IL Seo", + "hasEmail": "mailto:BSeo@CFTC.gov" + }, + "describedBy": "https://www.cftc.gov/MarketReports/LgTraderExplanatory.html", + "description": "The Large Trader Net Position Changes and the Trading Account Net Position Changes data provides the public with a view of the amount of trading that results in net changes to positions at the trader level and at the account level. The data reflects trading that changes or creates an end-of-day position, as contrasted with trading that does not change a trader’s end-of-day net position, such as spread or day trading.", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/NetPositionChangesData/index.htm" + } + ], + "modified": "2011-06-30", + "programCode": ["000:000"], "publisher": { - "@type": "org:Organization", - "name": "U.S. Department of Commerce, U.S. Census Bureau, Geography Division" + "name": "U.S. 
Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. Government" + } }, + "title": "Net Positions Changes Data" + }, + { + "accessLevel": "public", + "bureauCode": ["339:00"], "contactPoint": { - "@type": "vcard:Contact", - "fn": "pointOfContact - U.S. Department of Commerce, U.S. Census Bureau, Geography Division", - "hasEmail": "mailto:ask@census.gov" + "fn": "Richard Haynes", + "hasEmail": "mailto:RHaynes@CFTC.gov" }, + "describedBy": "https://www.cftc.gov/MarketReports/SwapsReports/DataDictionary/index.htm", + "description": "The CFTC Swaps Report aggregates a comprehensive body of swap market data that was not previously reported to regulators or regulated entities, and makes that information freely available in a form that is readily usable by both market participants and the general public. The swaps market data included in publications produced by entities such as the BIS, ISDA, and the Office of the Comptroller of the Currency vary in scope and granularity, but none corresponds directly to the data stored in the CFTC's SDRs.", "distribution": [ { - "@type": "dcat:Distribution", - "title": "Census Tiger County Lines for 2013", - "downloadURL": "http://www2.census.gov/geo/tiger/TIGER2013/COUNTY/tl_2013_us_county.zip", - "mediaType": "application/zip" - }, + "accessURL": "https://www.cftc.gov/MarketReports/SwapsReports/index.htm" + } + ], + "title": "Weekly Swaps Report", + "identifier": "cftc-dc6", + "keyword": ["swaps report", "swaps market"], + "modified": "R/P1W", + "programCode": ["000:000"], + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. Government" + } + } + }, + { + "accessLevel": "public", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Byung-IL Seo", + "hasEmail": "mailto:BSeo@CFTC.gov" + }, + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. 
Government" + } + }, + "title": "Cleared Margin Reports", + "describedBy": "https://www.cftc.gov/MarketReports/ClearedMarginReports/index.htm", + "description": "Derivatives clearing organizations (DCOs) are required to file daily reports on initial margin with the CFTC's Division of Clearing and Risk (DCR). Aggregate initial margin summary information for Chicago Mercantile Exchange (CME), ICE Clear Credit (ICC), ICE Clear US (ICUS), ICE Clear Europe (ICEU), LCH Ltd., and LCH SA is published below. The information will generally be updated within ten business days of the end of each month.", + "distribution": [ { - "@type": "dcat:Distribution", - "title": "[Anything valid here] Original Metadata", - "downloadURL": "https://meta.geo.census.gov/data/existing/decennial/GEO/GPMB/TIGERline/TIGER2013/county/tl_2013_us_county.shp.iso.xml", - "conformsTo": "http://www.isotc211.org/2005/gmi", - "description": "[Not required] The metadata original format", - "mediaType": "text/xml", - "format": "XML" + "accessURL": "https://www.cftc.gov/MarketReports/ClearedMarginReports/index.htm" } - ] + ], + "identifier": "cftc-dc7", + "keyword": ["margin", "exchange"], + "modified": "R/P1M", + "programCode": ["000:000"] } ] } diff --git a/tests/harvest-sources/dcatus/dcatus_compare.json b/tests/harvest-sources/dcatus/dcatus_compare.json new file mode 100644 index 00000000..f2978071 --- /dev/null +++ b/tests/harvest-sources/dcatus/dcatus_compare.json @@ -0,0 +1,202 @@ +{ + "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld", + "@id": "http://www.cftc.gov/data.json", + "@type": "dcat:Catalog", + "conformsTo": "https://project-open-data.cio.gov/v1.1/schema", + "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json", + "dataset": [ + { + "accessLevel": "public", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Harold W. 
Hild", + "hasEmail": "mailto:hhild@CFTC.GOV" + }, + "describedBy": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/ExplanatoryNotes/index.htm", + "description": "COT reports provide a breakdown of each Tuesday's open interest for futures and options on futures market in which 20 or more traders hold positions equal to or above the reporting levels established by CFTC", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/CommitmentsofTraders/index.htm" + } + ], + "identifier": "cftc-dc10", + "keyword": ["commitment of traders", "cot", "open interest"], + "modified": "R/P1W", + "programCode": ["000:000"], + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. Government" + } + }, + "title": "Commitment of Traders" + }, + { + "accessLevel": "public", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Harold W. Hild", + "hasEmail": "mailto:hhild@CFTC.GOV" + }, + "describedBy": "https://www.cftc.gov/MarketReports/BankParticipationReports/ExplanatoryNotes/index.htm", + "description": "The Bank Participation Report (BPR), developed by the Division of Market Oversight to provide the U.S. banking authorities and the Bank for International Settlements (BIS, located in Basel, Switzerland) aggregate large-trader positions of banks participating in various financial and non-financial commodity futures.", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/BankParticipationReports/index.htm" + } + ], + "identifier": "cftc-dc2", + "keyword": ["bank participation report", "bpr", "banking"], + "modified": "R/P1M Update", + "programCode": ["000:000"], + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. Government" + } + }, + "title": "Bank Participation Reports" + }, + { + "accessLevel": "public", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Harold W. 
Hild", + "hasEmail": "mailto:hhild@CFTC.gov" + }, + "describedBy": "https://www.cftc.gov/MarketReports/CottonOnCall/index.htm", + "description": "Cotton On-Call Report shows the quantity of call cotton bought or sold on which the price has not been fixed, together with the respective futures on which the purchase or sale is based on.", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/CottonOnCall/index.htm" + } + ], + "identifier": "cftc-dc3", + "keyword": ["cotton on call", "cotton on-call", "update keyword"], + "modified": "R/P1W", + "programCode": ["000:000"], + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. Government" + } + }, + "title": "Cotton On Call" + }, + { + "accessLevel": "public", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Carrie L Coffin", + "hasEmail": "mailto:ccoffin@CFTC.gov" + }, + "describedBy": "https://www.cftc.gov/MarketReports/financialfcmdata/DescriptionofReportDataFields/indesx.htm", + "description": "Futures commission merchants (FCMs) and retail foreign exchange dealers (RFEDs) must file monthly financial reports with the CFTC's Market Participants Division (MPD) within 17 business days after the end of the month. Selected financial information from these reports is published.", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/financialfcmdata/index.htm" + } + ], + "identifier": "cftc-dc4", + "keyword": [ + "fcm", + "retail foreign exchange", + "dealer", + "market participants", + "mpd", + "futures commission merchant" + ], + "modified": "R/P1M", + "programCode": ["000:000"], + "publisher": { + "name": "U.S. 
Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "Changed Value" + } + }, + "title": "Financial Data for FCMS" + }, + { + "accessLevel": "public", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Byung-IL Seo", + "hasEmail": "mailto:BSeo@CFTC.gov" + }, + "describedBy": "https://www.cftc.gov/MarketReports/LgTraderExplanatory.html", + "description": "The Large Trader Net Position Changes and the Trading Account Net Position Changes data provides the public with a view of the amount of trading that results in net changes to positions at the trader level and at the account level. The data reflects trading that changes or creates an end-of-day position, as contrasted with trading that does not change a trader’s end-of-day net position, such as spread or day trading.", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/NetPositionChangesData/index.htm" + } + ], + "identifier": "cftc-dc5", + "keyword": [ + "net positions", + "larger trader net position", + "trading account net positions" + ], + "modified": "2011-06-30", + "programCode": ["000:000"], + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. Government" + } + }, + "title": "Net Positions Changes Data" + }, + { + "accessLevel": "public", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Richard Haynes", + "hasEmail": "mailto:RHaynes@CFTC.gov" + }, + "describedBy": "https://www.cftc.gov/MarketReports/SwapsReports/DataDictionary/index.htm", + "description": "The CFTC Swaps Report aggregates a comprehensive body of swap market data that was not previously reported to regulators or regulated entities, and makes that information freely available in a form that is readily usable by both market participants and the general public. 
The swaps market data included in publications produced by entities such as the BIS, ISDA, and the Office of the Comptroller of the Currency vary in scope and granularity, but none corresponds directly to the data stored in the CFTC's SDRs.", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/SwapsReports/index.htm" + } + ], + "identifier": "cftc-dc6", + "keyword": ["swaps report", "swaps market"], + "modified": "R/P1W", + "programCode": ["000:000"], + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. Government" + } + }, + "title": "Weekly Swaps Report" + }, + { + "accessLevel": "public", + "bureauCode": ["339:00"], + "contactPoint": { + "fn": "Byung-IL Seo", + "hasEmail": "mailto:BSeo@CFTC.gov" + }, + "describedBy": "https://www.cftc.gov/MarketReports/ClearedMarginReports/index.htm", + "description": "Derivatives clearing organizations (DCOs) are required to file daily reports on initial margin with the CFTC's Division of Clearing and Risk (DCR). Aggregate initial margin summary information for Chicago Mercantile Exchange (CME), ICE Clear Credit (ICC), ICE Clear US (ICUS), ICE Clear Europe (ICEU), LCH Ltd., and LCH SA is published below. The information will generally be updated within ten business days of the end of each month.", + "distribution": [ + { + "accessURL": "https://www.cftc.gov/MarketReports/ClearedMarginReports/index.htm" + } + ], + "identifier": "cftc-dc7", + "keyword": ["margin", "exchange"], + "modified": "R/P1M", + "programCode": ["000:000"], + "publisher": { + "name": "U.S. Commodity Futures Trading Commission", + "subOrganizationOf": { + "name": "U.S. 
Government" + } + }, + "title": "Cleared Margin Reports" + } + ] +} diff --git a/tests/integration/compare/conftest.py b/tests/integration/compare/conftest.py new file mode 100644 index 00000000..b9114a5b --- /dev/null +++ b/tests/integration/compare/conftest.py @@ -0,0 +1,56 @@ +import os +from pathlib import Path + +import pytest + +from harvester.load import create_ckan_entrypoint, search_ckan +from harvester.utils.json import open_json +from harvester.utils.util import dataset_to_hash, sort_dataset + +TEST_DIR = Path(__file__).parents[2] +HARVEST_SOURCES = TEST_DIR / "harvest-sources" + + +@pytest.fixture +def ckan_entrypoint(): + catalog_dev_api_key = os.getenv("CKAN_API_TOKEN_DEV") # gha + if catalog_dev_api_key is None: # local + import credentials + + catalog_dev_api_key = credentials.ckan_catalog_dev_api_key + + return create_ckan_entrypoint("https://catalog-dev.data.gov/", catalog_dev_api_key) + + +@pytest.fixture +def data_sources(ckan_entrypoint): + harvest_source_datasets = open_json( + HARVEST_SOURCES / "dcatus" / "dcatus_compare.json" + )["dataset"] + + harvest_source = {} + for d in harvest_source_datasets: + harvest_source[d["identifier"]] = dataset_to_hash( + sort_dataset(d) + ) # the extract needs to be sorted + + ckan_source_datasets = search_ckan( + ckan_entrypoint, + { + "q": 'harvest_source_name:"test_harvest_source_name"', + "fl": [ + "extras_harvest_source_name", + "extras_dcat_metadata", + "extras_identifier", + ], + }, + )["results"] + + ckan_source = {} + + for d in ckan_source_datasets: + ckan_source[d["identifier"]] = dataset_to_hash( + eval(d["dcat_metadata"], {"__builtins__": {}}) + ) # the response is stored sorted + + return harvest_source, ckan_source diff --git a/tests/integration/compare/test_compare_int.py b/tests/integration/compare/test_compare_int.py new file mode 100644 index 00000000..4c65d917 --- /dev/null +++ b/tests/integration/compare/test_compare_int.py @@ -0,0 +1,9 @@ +from harvester.compare import compare + + +def 
test_compare(data_sources): + compare_res = compare(*data_sources) + + assert len(compare_res["create"]) == 1 + assert len(compare_res["update"]) == 3 + assert len(compare_res["delete"]) == 1 diff --git a/tests/integration/load/ckan/test_ckan_cud_int.py b/tests/integration/load/ckan/test_ckan_cud_int.py index 3d0dffac..a7f25bad 100644 --- a/tests/integration/load/ckan/test_ckan_cud_int.py +++ b/tests/integration/load/ckan/test_ckan_cud_int.py @@ -1,9 +1,5 @@ -from harvester.load import ( - create_ckan_package, - patch_ckan_package, - purge_ckan_package, - update_ckan_package, -) +from harvester.load import (create_ckan_package, patch_ckan_package, + purge_ckan_package, update_ckan_package) def test_create_package(ckan_entrypoint, test_ckan_package): diff --git a/tests/unit/compare/conftest.py b/tests/unit/compare/conftest.py new file mode 100644 index 00000000..2ccd15cb --- /dev/null +++ b/tests/unit/compare/conftest.py @@ -0,0 +1,90 @@ +from pathlib import Path + +import pytest + +from harvester.utils.json import open_json +from harvester.utils.util import dataset_to_hash, sort_dataset + +TEST_DIR = Path(__file__).parents[2] +HARVEST_SOURCES = TEST_DIR / "harvest-sources" + + +@pytest.fixture +def artificial_data_sources(): + # key = dataset identifier + # value = hash value of the dataset + harvest_source = { + "1": "de955c1b-fa16-4b84-ad6c-f891ba276056", # update + "2": "6d500ebc-19f8-4541-82b0-f02ad24c82e3", # do nothing + "3": "9aeef506-fbc4-42e4-ad27-c2e7e9f0d1c5", # create + } + + ckan_source = { + "1": "fcd3428b-0ba7-48da-951d-fe44606be556", + "2": "6d500ebc-19f8-4541-82b0-f02ad24c82e3", + "4": "dae9b42c-cfc5-4f71-ae97-a5b75234b14f", # delete + } + + return harvest_source, ckan_source + + +@pytest.fixture +def data_sources(): + harvest_source_datasets = open_json( + HARVEST_SOURCES / "dcatus" / "dcatus_compare.json" + )["dataset"] + + harvest_source = {} + for d in harvest_source_datasets: + harvest_source[d["identifier"]] = dataset_to_hash( + 
sort_dataset(d) + ) # the extract needs to be sorted + + ckan_source_datasets = open_json( + HARVEST_SOURCES / "dcatus" / "ckan_datasets_resp.json" + )["result"]["results"] + + ckan_source = {} + + for d in ckan_source_datasets: + orig_meta = None + orig_id = None + for e in d["extras"]: + if e["key"] == "dcat_metadata": + orig_meta = eval(e["value"], {"__builtins__": {}}) + if e["key"] == "identifier": + orig_id = e["value"] + + ckan_source[orig_id] = dataset_to_hash( + orig_meta + ) # the response is stored sorted + + return harvest_source, ckan_source + + +@pytest.fixture +def data_sources_raw(): + harvest_source_datasets = open_json( + HARVEST_SOURCES / "dcatus" / "dcatus_compare.json" + )["dataset"] + + harvest_source = {d["identifier"]: d for d in harvest_source_datasets} + + ckan_source_datasets = open_json( + HARVEST_SOURCES / "dcatus" / "ckan_datasets_resp.json" + )["result"]["results"] + + ckan_source = {} + + for d in ckan_source_datasets: + orig_meta = None + orig_id = None + for e in d["extras"]: + if e["key"] == "dcat_metadata": + orig_meta = eval(e["value"], {"__builtins__": {}}) + if e["key"] == "identifier": + orig_id = e["value"] + + ckan_source[orig_id] = orig_meta + + return harvest_source, ckan_source diff --git a/tests/unit/compare/test_compare.py b/tests/unit/compare/test_compare.py index b7a0560c..84bca11d 100644 --- a/tests/unit/compare/test_compare.py +++ b/tests/unit/compare/test_compare.py @@ -1,11 +1,49 @@ from harvester.compare import compare +from harvester.utils.util import dataset_to_hash, sort_dataset -def test_compare(): - """tests compare""" +def test_artificial_compare(artificial_data_sources): + """tests artificial datasets compare""" - # stub, TODO complete - test_compare = "some test messsage" - compare_response = compare(test_compare) + compare_res = compare(*artificial_data_sources) - assert test_compare == compare_response + assert len(compare_res["create"]) == 1 + assert len(compare_res["update"]) == 1 + assert 
len(compare_res["delete"]) == 1 + + +def test_compare(data_sources): + compare_res = compare(*data_sources) + + assert len(compare_res["create"]) == 1 + assert len(compare_res["update"]) == 3 + assert len(compare_res["delete"]) == 1 + + +def test_sort(data_sources_raw): + harvest_source, ckan_source = data_sources_raw + + harvest_source_no_sort = harvest_source.copy() + for k, v in harvest_source_no_sort.items(): + harvest_source_no_sort[k] = dataset_to_hash(v) + + for k, v in ckan_source.items(): + ckan_source[k] = dataset_to_hash(v) + + compare_res_no_sort = compare(harvest_source_no_sort, ckan_source) + + # more datasets need to be updated simply because we didn't sort them + assert len(compare_res_no_sort["create"]) == 1 + assert len(compare_res_no_sort["update"]) == 6 + assert len(compare_res_no_sort["delete"]) == 1 + + harvest_source_with_sort = harvest_source.copy() + for k, v in harvest_source_with_sort.items(): + harvest_source_with_sort[k] = dataset_to_hash(sort_dataset(v)) + + compare_res = compare(harvest_source_with_sort, ckan_source) + + # applying the sort lowers us back down to what we expect. 
+ assert len(compare_res["create"]) == 1 + assert len(compare_res["update"]) == 3 + assert len(compare_res["delete"]) == 1 diff --git a/tests/unit/load/ckan/conftest.py b/tests/unit/load/ckan/conftest.py index 57bdcb87..5eab4a22 100644 --- a/tests/unit/load/ckan/conftest.py +++ b/tests/unit/load/ckan/conftest.py @@ -33,7 +33,7 @@ def test_dcatus_catalog(): @pytest.fixture def test_ckan_package(test_ckan_package_id, test_dcatus_catalog): - ckan_dataset = dcatus_to_ckan(test_dcatus_catalog) + ckan_dataset = dcatus_to_ckan(test_dcatus_catalog, "test_harvest_source_name") ckan_dataset["id"] = test_ckan_package_id return ckan_dataset @@ -51,3 +51,21 @@ def test_ckan_patch_package(test_ckan_package_id): @pytest.fixture def test_ckan_purge_package(test_ckan_package_id): return {"id": test_ckan_package_id} + + +@pytest.fixture +def test_ckan_transform_catalog(): + return { + "identifier": "test identifier", + "contactPoint": {"fn": "Bob Smith", "hasEmail": "bob.smith@example.com"}, + "description": "test description", + "title": "test title", + } + + +@pytest.fixture +def test_ckan_publisher(): + return { + "name": "U.S. Test Organization of the Tests", + "subOrganizationOf": {"name": "Test Incorporated"}, + } diff --git a/tests/unit/load/ckan/test_ckan_cud.py b/tests/unit/load/ckan/test_ckan_cud.py index 5f5e4415..fb796467 100644 --- a/tests/unit/load/ckan/test_ckan_cud.py +++ b/tests/unit/load/ckan/test_ckan_cud.py @@ -1,66 +1,8 @@ from unittest.mock import patch -from deepdiff import DeepDiff - import harvester -def test_dcatus_to_ckan_transform(test_dcatus_catalog): - expected_result = { - "name": "fdic-failed-bank-list", - "owner_org": "test", - "maintainer": "FDIC Public Data Feedback", - "maintainer_email": "FDICPublicDataFeedback@fdic.gov", - "notes": "The FDIC is often appointed as receiver for failed banks. 
This list includes banks which have failed since October 1, 2000.", # noqa E501 - "title": "FDIC Failed Bank List", - "resources": [ - { - "url": "https://www.fdic.gov/bank/individual/failed/banklist.csv", - "mimetype": "text/csv", - }, - { - "url": "https://www.fdic.gov/bank/individual/failed/index.html", - "mimetype": "text/html", - }, - ], - "tags": [ - {"name": "financial-institution"}, - {"name": "banks"}, - {"name": "failures"}, - {"name": "assistance-transactions"}, - ], - "extras": [ - { - "key": "publisher_hierarchy", - "value": "U.S. Government > Federal Deposit Insurance Corporation > Division of Insurance and Research", # noqa E501 - }, - {"key": "resource-type", "value": "Dataset"}, - {"key": "publisher", "value": "Division of Insurance and Research"}, - {"key": "accessLevel", "value": "public"}, - {"key": "bureauCode", "value": ["357:20"]}, - { - "key": "identifier", - "value": "https://www.fdic.gov/bank/individual/failed/", - }, - {"key": "modified", "value": "R/P1W"}, - {"key": "programCode", "value": ["000:000"]}, - {"key": "publisher", "value": "Division of Insurance and Research"}, - { - "key": "publisher_hierarchy", - "value": "U.S. 
Government > Federal Deposit Insurance Corporation > Division of Insurance and Research", # noqa E501 - }, - {"key": "resource-type", "value": "Dataset"}, - {"key": "publisher", "value": "Division of Insurance and Research"}, - ], - "author": None, - "author_email": None, - } - - assert ( - DeepDiff(harvester.dcatus_to_ckan(test_dcatus_catalog), expected_result) == {} - ) - - @patch("harvester.create_ckan_package") def test_create_package(mock_create_ckan_package, ckan_entrypoint, test_ckan_package): mock_create_ckan_package.return_value = test_ckan_package.copy() diff --git a/tests/unit/load/ckan/test_ckan_transform.py b/tests/unit/load/ckan/test_ckan_transform.py new file mode 100644 index 00000000..95b2f50c --- /dev/null +++ b/tests/unit/load/ckan/test_ckan_transform.py @@ -0,0 +1,89 @@ +from harvester.load import ( + simple_transform, + create_ckan_publisher_hierarchy, + dcatus_to_ckan, +) +from deepdiff import DeepDiff + + +def test_simple_transform(test_ckan_transform_catalog): + expected_result = { + "name": "test-title", + "owner_org": "test", + "identifier": "test identifier", + "maintainer": "Bob Smith", + "maintainer_email": "bob.smith@example.com", + "notes": "test description", + "title": "test title", + } + + res = simple_transform(test_ckan_transform_catalog) + assert res == expected_result + + +def test_publisher_name(test_ckan_publisher): + res = create_ckan_publisher_hierarchy(test_ckan_publisher, []) + assert res == "Test Incorporated > U.S. Test Organization of the Tests" + + +def test_dcatus_to_ckan_transform(test_dcatus_catalog): + # ruff: noqa: E501 + expected_result = { + "name": "fdic-failed-bank-list", + "owner_org": "test", + "identifier": "https://www.fdic.gov/bank/individual/failed/", + "maintainer": "FDIC Public Data Feedback", + "maintainer_email": "FDICPublicDataFeedback@fdic.gov", + "notes": "The FDIC is often appointed as receiver for failed banks. 
This list includes banks which have failed since October 1, 2000.", + "title": "FDIC Failed Bank List", + "resources": [ + {"url": "https://www.fdic.gov/bank/individual/failed/banklist.csv"}, + {"url": "https://www.fdic.gov/bank/individual/failed/index.html"}, + ], + "tags": [ + {"name": "financial-institution"}, + {"name": "banks"}, + {"name": "failures"}, + {"name": "assistance-transactions"}, + ], + "extras": [ + { + "key": "publisher_hierarchy", + "value": "U.S. Government > Federal Deposit Insurance Corporation > Division of Insurance and Research", + }, + {"key": "resource-type", "value": "Dataset"}, + {"key": "publisher", "value": "Division of Insurance and Research"}, + {"key": "accessLevel", "value": "public"}, + {"key": "bureauCode", "value": "357:20"}, + { + "key": "identifier", + "value": "https://www.fdic.gov/bank/individual/failed/", + }, + {"key": "modified", "value": "R/P1W"}, + {"key": "programCode", "value": "000:000"}, + {"key": "publisher", "value": "Division of Insurance and Research"}, + { + "key": "publisher_hierarchy", + "value": "U.S. Government > Federal Deposit Insurance Corporation > Division of Insurance and Research", + }, + {"key": "resource-type", "value": "Dataset"}, + {"key": "publisher", "value": "Division of Insurance and Research"}, + { + "key": "dcat_metadata", + "value": "{'accessLevel': 'public', 'bureauCode': ['357:20'], 'contactPoint': {'fn': 'FDIC Public Data Feedback', 'hasEmail': 'mailto:FDICPublicDataFeedback@fdic.gov'}, 'description': 'The FDIC is often appointed as receiver for failed banks. 
This list includes banks which have failed since October 1, 2000.', 'distribution': [{'accessURL': 'https://www.fdic.gov/bank/individual/failed/index.html', 'mediaType': 'text/html'}, {'downloadURL': 'https://www.fdic.gov/bank/individual/failed/banklist.csv', 'mediaType': 'text/csv'}], 'identifier': 'https://www.fdic.gov/bank/individual/failed/', 'keyword': ['assistance transactions', 'banks', 'failures', 'financial institution'], 'modified': 'R/P1W', 'programCode': ['000:000'], 'publisher': {'name': 'Division of Insurance and Research', 'subOrganizationOf': {'name': 'Federal Deposit Insurance Corporation', 'subOrganizationOf': {'name': 'U.S. Government'}}}, 'title': 'FDIC Failed Bank List'}", + }, + {"key": "harvest_source_name", "value": "example_harvest_source_name"}, + ], + "author": None, + "author_email": None, + } + + # res = dcatus_to_ckan(test_dcatus_catalog, "example_harvest_source_name") + + assert ( + DeepDiff( + dcatus_to_ckan(test_dcatus_catalog, "example_harvest_source_name"), + expected_result, + ) + == {} + )