From 59c118a9f1ab93285596aa427e42ec99c553d9af Mon Sep 17 00:00:00 2001
From: Simon Norris
Date: Thu, 15 Aug 2024 16:41:55 -0700
Subject: [PATCH] fix #193 and simplify get_table_definition function

---
 bcdata/__init__.py  |  16 ++++++-
 bcdata/bc2pg.py     |  18 +-------
 bcdata/bcdc.py      | 104 +++++++++++++++-----------------------------
 bcdata/cli.py       |   7 +--
 bcdata/wfs.py       |   5 +--
 tests/test_bc2pg.py |   3 +-
 6 files changed, 56 insertions(+), 97 deletions(-)

diff --git a/bcdata/__init__.py b/bcdata/__init__.py
index d9017d8..84aa727 100644
--- a/bcdata/__init__.py
+++ b/bcdata/__init__.py
@@ -1,4 +1,6 @@
-from .bc2pg import bc2pg, get_primary_keys
+import requests
+
+from .bc2pg import bc2pg
 from .bcdc import get_table_definition, get_table_name
 from .wcs import get_dem
 from .wfs import (
@@ -15,4 +17,14 @@
     "https://raw.githubusercontent.com/smnorris/bcdata/main/data/primary_keys.json"
 )
 
-__version__ = "0.11.1dev0"
+# BCDC does not indicate which column in the schema is the primary key.
+# In this absence, bcdata maintains its own dictionary of {table: primary_key},
+# served via github. The dict is downloaded once, when the module is imported.
+
+response = requests.get(PRIMARY_KEY_DB_URL)
+if response.status_code == 200:
+    primary_keys = response.json()
+else:
+    raise Exception(f"Failed to download primary key database at {PRIMARY_KEY_DB_URL}")
+
+__version__ = "0.12.0dev0"
diff --git a/bcdata/bc2pg.py b/bcdata/bc2pg.py
index f41289c..df4f1e1 100644
--- a/bcdata/bc2pg.py
+++ b/bcdata/bc2pg.py
@@ -34,19 +34,6 @@
 ]
 
 
-def get_primary_keys():
-    """download primary key data file"""
-    response = requests.get(bcdata.PRIMARY_KEY_DB_URL)
-    if response.status_code == 200:
-        primary_keys = response.json()
-    else:
-        log.warning(
-            f"Failed to download primary key database at {bcdata.PRIMARY_KEY_DB_URL}"
-        )
-        primary_keys = {}
-    return primary_keys
-
-
 def bc2pg(  # noqa: C901
     dataset,
     db_url,
@@ -148,9 +135,8 @@
         raise ValueError("Geometry type {geometry_type} is not supported")
 
     # if primary key is not supplied, use default (if present in list)
-    primary_keys = get_primary_keys()
-    if not primary_key and dataset.lower() in primary_keys:
-        primary_key = primary_keys[dataset.lower()]
+    if not primary_key and dataset.lower() in bcdata.primary_keys:
+        primary_key = bcdata.primary_keys[dataset.lower()]
 
     # fail if specified primary key is not in the table
     if primary_key and primary_key.upper() not in [
diff --git a/bcdata/bcdc.py b/bcdata/bcdc.py
index 13f549d..c40bbff 100644
--- a/bcdata/bcdc.py
+++ b/bcdata/bcdc.py
@@ -36,7 +36,10 @@ def _package_show(package):
 
 @stamina.retry(on=requests.HTTPError, timeout=60)
 def _table_definition(table_name):
-    r = requests.get(BCDC_API_URL + "package_search", params={"q": table_name})
+    r = requests.get(
+        BCDC_API_URL + "package_search",
+        params={"q": "res_extras_object_name:" + table_name},
+    )
     if r.status_code != 200:
         log.warning(r.headers)
         if r.status_code in [400, 401, 404]:
@@ -66,7 +69,7 @@ def get_table_name(package):
     return layer_names[0]
 
 
-def get_table_definition(table_name):  # noqa: C901
+def get_table_definition(table_name):
     """
     Given a table/object name, search BCDC for the first package/resource with a
     matching "object_name", returns dict: {"comments": <>, "notes": <>, "schema": {} }
@@ -77,81 +80,46 @@
         raise ValueError(
             f"Only tables available via WFS are supported, {table_name} not found"
         )
+    # search the api for the provided table
     r = _table_definition(table_name)
+
+    # start with an empty table definition dict
+    table_definition = {
+        "description": None,
+        "comments": None,
+        "schema": [],
+        "primary_key": None,
+    }
+    # if there are no matching results, let the user know
     if r.json()["result"]["count"] == 0:
         log.warning(
             f"BC Data Catalouge API search provides no results for: {table_name}"
         )
-        return []
     else:
-        matches = []
         # iterate through results of search (packages)
         for result in r.json()["result"]["results"]:
-            notes = result["notes"]
+            # description is at top level, same for all resources
+            table_definition["description"] = result["notes"]
             # iterate through resources associated with each package
             for resource in result["resources"]:
-                # where to find schema details depends on format type
-                if resource["format"] == "wms":
-                    if urlparse(resource["url"]).path.split("/")[3] == table_name:
-                        if "object_table_comments" in resource.keys():
-                            table_comments = resource["object_table_comments"]
-                        else:
-                            table_comments = None
-                        # only add to matches if schema details found
-                        if "details" in resource.keys() and resource["details"] != "":
-                            table_details = resource["details"]
-                            matches.append((notes, table_comments, table_details))
-                        log.debug(resource)
-                # oracle sde format type
-                if resource["format"] == "oracle_sde":
-                    if resource["object_name"] == table_name:
-                        if "object_table_comments" in resource.keys():
-                            table_comments = resource["object_table_comments"]
-                        else:
-                            table_comments = None
-                        # only add to matches if schema details found
-                        if "details" in resource.keys() and resource["details"] != "":
-                            table_details = resource["details"]
-                            matches.append((notes, table_comments, table_details))
-                        log.debug(resource)
-
-                # multiple format resource
-                elif resource["format"] == "multiple":
-                    # if multiple format, check for table name match in this location
-                    if resource["preview_info"]:
-                        # check that layer_name key is present
-                        if "layer_name" in json.loads(resource["preview_info"]):
-                            # then check if it matches the table name
-                            if (
-                                json.loads(resource["preview_info"])["layer_name"]
-                                == table_name
-                            ):
-                                if "object_table_comments" in resource.keys():
-                                    table_comments = resource["object_table_comments"]
-                                else:
-                                    table_comments = None
-                                # only add to matches if schema details found
-                                if (
-                                    "details" in resource.keys()
-                                    and resource["details"] != ""
-                                ):
-                                    table_details = resource["details"]
-                                    matches.append(
-                                        (notes, table_comments, table_details)
-                                    )
-                                log.debug(resource)
-
-    # uniquify the result
-    if len(matches) > 0:
-        matched = list(set(matches))[0]
-        return {
-            "description": matched[0],  # notes=description
-            "comments": matched[1],
-            "schema": json.loads(matched[2]),
-        }
-    else:
-        raise ValueError(
-            f"BCDC search for {table_name} does not return a table schema"
-        )
+                # presume description and details are the same for all resources
+                # (below only retains the final schema/comments if there is more than one
+                # package with this information)
+                if "details" in resource.keys() and resource["details"] != "":
+                    table_definition["schema"] = json.loads(resource["details"])
+                    # look for comments only if details/schema is present
+                    if "object_table_comments" in resource.keys():
+                        table_definition["comments"] = resource["object_table_comments"]
+
+    if not table_definition["schema"]:
+        raise ValueError(
+            f"BC Data Catalogue API search provides no schema for: {table_name}"
+        )
+
+    # add primary key if present in bcdata.primary_keys
+    if table_name.lower() in bcdata.primary_keys:
+        table_definition["primary_key"] = bcdata.primary_keys[table_name.lower()]
+
+    return table_definition
 
 
diff --git a/bcdata/cli.py b/bcdata/cli.py
index d8cc7ac..073e66f 100644
--- a/bcdata/cli.py
+++ b/bcdata/cli.py
@@ -131,14 +131,9 @@ def info(dataset, indent, meta_member, verbose, quiet):
     verbosity = verbose - quiet
     configure_logging(verbosity)
     dataset = bcdata.validate_name(dataset)
-    info = {}
+    info = bcdata.get_table_definition(dataset)
     info["name"] = dataset
     info["count"] = bcdata.get_count(dataset)
-    table_definition = bcdata.get_table_definition(dataset)
-    info["description"] = table_definition["description"]
-    info["table_comments"] = table_definition["comments"]
-    info["schema"] = table_definition["schema"]
-
     if meta_member:
         click.echo(info[meta_member])
     else:
diff --git a/bcdata/wfs.py b/bcdata/wfs.py
index bbfd5d5..6c63662 100644
--- a/bcdata/wfs.py
+++ b/bcdata/wfs.py
@@ -223,9 +223,8 @@ def get_sortkey(self, table):
         """Check data for unique columns available for sorting paged requests"""
         columns = list(self.get_schema(table)["properties"].keys())
         # use known primary key if it is present in the bcdata repository
-        known_primary_keys = bcdata.get_primary_keys()
-        if table.lower() in known_primary_keys:
-            return known_primary_keys[table.lower()].upper()
+        if table.lower() in bcdata.primary_keys:
+            return bcdata.primary_keys[table.lower()].upper()
         # if pk not known, use OBJECTID as default sort key when present
         elif "OBJECTID" in columns:
             return "OBJECTID"
diff --git a/tests/test_bc2pg.py b/tests/test_bc2pg.py
index ce3623b..b33c693 100644
--- a/tests/test_bc2pg.py
+++ b/tests/test_bc2pg.py
@@ -126,8 +126,7 @@ def test_bc2pg_primary_key():
 
 
 def test_bc2pg_get_primary_keys():
-    primary_keys = bcdata.get_primary_keys()
-    assert primary_keys[ASSESSMENTS_TABLE] == "stream_crossing_id"
+    assert bcdata.primary_keys[ASSESSMENTS_TABLE] == "stream_crossing_id"
 
 
 def test_bc2pg_primary_key_default():
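
Usage sketch (not part of the patch): a minimal illustration of the public surface after this change, assuming the primary key JSON downloads successfully at import. The module-level bcdata.primary_keys dict replaces get_primary_keys(), and get_table_definition() now returns a single dict carrying description, comments, schema and primary_key. The table name below is illustrative only; substitute any object name available via WFS.

import json

import bcdata

# illustrative object name only
table = "whse_imagery_and_base_maps.gsr_airports_svw"

# primary_keys is a plain dict keyed by lowercase table name,
# populated from the GitHub-hosted JSON when bcdata is imported
print(bcdata.primary_keys.get(table))

# one call now returns description, comments, schema and primary_key together
defn = bcdata.get_table_definition(table)
print(defn["primary_key"])
print(json.dumps(defn["schema"], indent=2))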