From 59c118a9f1ab93285596aa427e42ec99c553d9af Mon Sep 17 00:00:00 2001
From: Simon Norris
Date: Thu, 15 Aug 2024 16:41:55 -0700
Subject: [PATCH] fix #193 and simplify get_table_definition function

---
 bcdata/__init__.py  |  16 ++++++-
 bcdata/bc2pg.py     |  18 +-------
 bcdata/bcdc.py      | 104 +++++++++++++++-----------------------------
 bcdata/cli.py       |   7 +--
 bcdata/wfs.py       |   5 +--
 tests/test_bc2pg.py |   3 +-
 6 files changed, 56 insertions(+), 97 deletions(-)

diff --git a/bcdata/__init__.py b/bcdata/__init__.py
index d9017d8..84aa727 100644
--- a/bcdata/__init__.py
+++ b/bcdata/__init__.py
@@ -1,4 +1,6 @@
-from .bc2pg import bc2pg, get_primary_keys
+import requests
+
+from .bc2pg import bc2pg
 from .bcdc import get_table_definition, get_table_name
 from .wcs import get_dem
 from .wfs import (
@@ -15,4 +17,14 @@
     "https://raw.githubusercontent.com/smnorris/bcdata/main/data/primary_keys.json"
 )
 
-__version__ = "0.11.1dev0"
+# BCDC does not indicate which column in the schema is the primary key.
+# In this absence, bcdata maintains its own dictionary of {table: primary_key},
+# served via github. The dict is downloaded once, when the module is imported.
+
+response = requests.get(PRIMARY_KEY_DB_URL)
+if response.status_code == 200:
+    primary_keys = response.json()
+else:
+    raise Exception(f"Failed to download primary key database at {PRIMARY_KEY_DB_URL}")
+
+__version__ = "0.12.0dev0"
diff --git a/bcdata/bc2pg.py b/bcdata/bc2pg.py
index f41289c..df4f1e1 100644
--- a/bcdata/bc2pg.py
+++ b/bcdata/bc2pg.py
@@ -34,19 +34,6 @@
 ]
 
 
-def get_primary_keys():
-    """download primary key data file"""
-    response = requests.get(bcdata.PRIMARY_KEY_DB_URL)
-    if response.status_code == 200:
-        primary_keys = response.json()
-    else:
-        log.warning(
-            f"Failed to download primary key database at {bcdata.PRIMARY_KEY_DB_URL}"
-        )
-        primary_keys = {}
-    return primary_keys
-
-
 def bc2pg(  # noqa: C901
     dataset,
     db_url,
@@ -148,9 +135,8 @@
         raise ValueError("Geometry type {geometry_type} is not supported")
 
     # if primary key is not supplied, use default (if present in list)
-    primary_keys = get_primary_keys()
-    if not primary_key and dataset.lower() in primary_keys:
-        primary_key = primary_keys[dataset.lower()]
+    if not primary_key and dataset.lower() in bcdata.primary_keys:
+        primary_key = bcdata.primary_keys[dataset.lower()]
 
     # fail if specified primary key is not in the table
     if primary_key and primary_key.upper() not in [
diff --git a/bcdata/bcdc.py b/bcdata/bcdc.py
index 13f549d..c40bbff 100644
--- a/bcdata/bcdc.py
+++ b/bcdata/bcdc.py
@@ -36,7 +36,10 @@ def _package_show(package):
 
 @stamina.retry(on=requests.HTTPError, timeout=60)
 def _table_definition(table_name):
-    r = requests.get(BCDC_API_URL + "package_search", params={"q": table_name})
+    r = requests.get(
+        BCDC_API_URL + "package_search",
+        params={"q": "res_extras_object_name:" + table_name},
+    )
     if r.status_code != 200:
         log.warning(r.headers)
         if r.status_code in [400, 401, 404]:
@@ -66,7 +69,7 @@ def get_table_name(package):
     return layer_names[0]
 
 
-def get_table_definition(table_name):  # noqa: C901
+def get_table_definition(table_name):
     """
     Given a table/object name, search BCDC for the first package/resource with a
     matching "object_name", returns dict: {"comments": <>, "notes": <>, "schema": {} }
@@ -77,81 +80,46 @@
         raise ValueError(
             f"Only tables available via WFS are supported, {table_name} not found"
         )
+    # search the api for the provided table
     r = _table_definition(table_name)
+
+    # start with an empty table definition dict
+    table_definition = {
+        "description": None,
+        "comments": None,
+        "schema": [],
+        "primary_key": None,
+    }
+    # if there are no matching results, let the user know
     if r.json()["result"]["count"] == 0:
         log.warning(
             f"BC Data Catalouge API search provides no results for: {table_name}"
         )
-        return []
     else:
-        matches = []
         # iterate through results of search (packages)
         for result in r.json()["result"]["results"]:
-            notes = result["notes"]
+            # description is at top level, same for all resources
+            table_definition["description"] = result["notes"]
             # iterate through resources associated with each package
             for resource in result["resources"]:
-                # where to find schema details depends on format type
-                if resource["format"] == "wms":
-                    if urlparse(resource["url"]).path.split("/")[3] == table_name:
-                        if "object_table_comments" in resource.keys():
-                            table_comments = resource["object_table_comments"]
-                        else:
-                            table_comments = None
-                        # only add to matches if schema details found
-                        if "details" in resource.keys() and resource["details"] != "":
-                            table_details = resource["details"]
-                            matches.append((notes, table_comments, table_details))
-                        log.debug(resource)
-                # oracle sde format type
-                if resource["format"] == "oracle_sde":
-                    if resource["object_name"] == table_name:
-                        if "object_table_comments" in resource.keys():
-                            table_comments = resource["object_table_comments"]
-                        else:
-                            table_comments = None
-                        # only add to matches if schema details found
-                        if "details" in resource.keys() and resource["details"] != "":
-                            table_details = resource["details"]
-                            matches.append((notes, table_comments, table_details))
-                        log.debug(resource)
-
-                # multiple format resource
-                elif resource["format"] == "multiple":
-                    # if multiple format, check for table name match in this location
-                    if resource["preview_info"]:
-                        # check that layer_name key is present
-                        if "layer_name" in json.loads(resource["preview_info"]):
-                            # then check if it matches the table name
-                            if (
-                                json.loads(resource["preview_info"])["layer_name"]
-                                == table_name
-                            ):
-                                if "object_table_comments" in resource.keys():
-                                    table_comments = resource["object_table_comments"]
-                                else:
-                                    table_comments = None
-                                # only add to matches if schema details found
-                                if (
-                                    "details" in resource.keys()
-                                    and resource["details"] != ""
-                                ):
-                                    table_details = resource["details"]
-                                    matches.append(
-                                        (notes, table_comments, table_details)
-                                    )
-                                log.debug(resource)
-
-    # uniquify the result
-    if len(matches) > 0:
-        matched = list(set(matches))[0]
-        return {
-            "description": matched[0],  # notes=description
-            "comments": matched[1],
-            "schema": json.loads(matched[2]),
-        }
-    else:
-        raise ValueError(
-            f"BCDC search for {table_name} does not return a table schema"
-        )
+                # presume description and details are the same for all resources
+                # (below only retains the final schema/comments if there is more than one
+                # package with this information)
+                if "details" in resource.keys() and resource["details"] != "":
+                    table_definition["schema"] = json.loads(resource["details"])
+                    # look for comments only if details/schema is present
+                    if "object_table_comments" in resource.keys():
+                        table_definition["comments"] = resource["object_table_comments"]
+
+    if not table_definition["schema"]:
+        raise ValueError(
+            f"BC Data Catalogue API search provides no schema for: {table_name}"
+        )
+
+    # add primary key if present in bcdata.primary_keys
+    if table_name.lower() in bcdata.primary_keys:
+        table_definition["primary_key"] = bcdata.primary_keys[table_name.lower()]
+
+    return table_definition
 
 
diff --git a/bcdata/cli.py b/bcdata/cli.py
index d8cc7ac..073e66f 100644
--- a/bcdata/cli.py
+++ b/bcdata/cli.py
@@ -131,14 +131,9 @@ def info(dataset, indent, meta_member, verbose, quiet):
     verbosity = verbose - quiet
     configure_logging(verbosity)
     dataset = bcdata.validate_name(dataset)
-    info = {}
+    info = bcdata.get_table_definition(dataset)
     info["name"] = dataset
     info["count"] = bcdata.get_count(dataset)
-    table_definition = bcdata.get_table_definition(dataset)
-    info["description"] = table_definition["description"]
-    info["table_comments"] = table_definition["comments"]
-    info["schema"] = table_definition["schema"]
-
     if meta_member:
         click.echo(info[meta_member])
     else:
diff --git a/bcdata/wfs.py b/bcdata/wfs.py
index bbfd5d5..6c63662 100644
--- a/bcdata/wfs.py
+++ b/bcdata/wfs.py
@@ -223,9 +223,8 @@ def get_sortkey(self, table):
         """Check data for unique columns available for sorting paged requests"""
         columns = list(self.get_schema(table)["properties"].keys())
         # use known primary key if it is present in the bcdata repository
-        known_primary_keys = bcdata.get_primary_keys()
-        if table.lower() in known_primary_keys:
-            return known_primary_keys[table.lower()].upper()
+        if table.lower() in bcdata.primary_keys:
+            return bcdata.primary_keys[table.lower()].upper()
         # if pk not known, use OBJECTID as default sort key when present
         elif "OBJECTID" in columns:
             return "OBJECTID"
diff --git a/tests/test_bc2pg.py b/tests/test_bc2pg.py
index ce3623b..b33c693 100644
--- a/tests/test_bc2pg.py
+++ b/tests/test_bc2pg.py
@@ -126,8 +126,7 @@ def test_bc2pg_primary_key():
 
 
 def test_bc2pg_get_primary_keys():
-    primary_keys = bcdata.get_primary_keys()
-    assert primary_keys[ASSESSMENTS_TABLE] == "stream_crossing_id"
+    assert bcdata.primary_keys[ASSESSMENTS_TABLE] == "stream_crossing_id"
 
 
 def test_bc2pg_primary_key_default():
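
Usage sketch (not part of the patch): a minimal illustration of the public surface after this change, assuming the primary key JSON downloads successfully at import. The module-level bcdata.primary_keys dict replaces get_primary_keys(), and get_table_definition() now returns a single dict carrying description, comments, schema and primary_key. The table name below is illustrative only; substitute any object name available via WFS.

import json

import bcdata

# illustrative object name only
table = "whse_imagery_and_base_maps.gsr_airports_svw"

# primary_keys is a plain dict keyed by lowercase table name,
# populated from the GitHub-hosted JSON when bcdata is imported
print(bcdata.primary_keys.get(table))

# one call now returns description, comments, schema and primary_key together
defn = bcdata.get_table_definition(table)
print(defn["primary_key"])
print(json.dumps(defn["schema"], indent=2))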