Skip to content

Commit

Permalink
handle mixed geometry type responses, tweak parse_config func to acce…
Browse files Browse the repository at this point in the history
…pt file path
  • Loading branch information
smnorris committed Sep 6, 2024
1 parent f88ed7e commit 0edccbe
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 9 deletions.
7 changes: 2 additions & 5 deletions src/fit_changedetector/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,12 @@ def process(
"""For each configured layer - download latest, detect changes, write to file"""
configure_logging((verbose - quiet))

with open(config_file, "r") as f:
config = json.load(f)

# parse config, returning a list of dicts defining layers to process
sources = fcd.parse_config(config)
sources = fcd.parse_config(config_file)

# if specified, download only specified layer
if layer:
config = [s for s in config if s["layer"] == layer]
config = [s for s in sources if s["layer"] == layer]
# alert if no sources match this schedule
if len(sources) == 0:
LOG.warning(f"No layer named {layer} found in {config}")
Expand Down
69 changes: 67 additions & 2 deletions src/fit_changedetector/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,31 @@
from geopandas import GeoDataFrame
import jsonschema
from pyproj import CRS
from shapely.geometry.linestring import LineString
from shapely.geometry.multilinestring import MultiLineString
from shapely.geometry.multipoint import MultiPoint
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry.point import Point
from shapely.geometry.polygon import Polygon


LOG = logging.getLogger(__name__)


def parse_config(config):
"""validate and parse config list of dicts"""
def parse_config(config_file):
"""validate and parse supplied config file"""

# read config if a test string is provided
if type(config_file) is str:
with open(config_file, "r") as f:
config = json.load(f)
# for testing, accept json dict
elif type(config_file) is list:
config = config_file
else:
raise ValueError(
"config_file must be a path to a file or a list of dicts (sources)"
)
# validate sources against schema doc
with open("source_schema.json", "r") as f:
schema = json.load(f)
Expand All @@ -42,6 +59,50 @@ def parse_config(config):
return parsed


def standardize_spatial_types(df):
    """
    Introspect and standardize the geometry types of a GeoDataFrame:

    - raise ValueError if any unsupported geometry type is present
      (eg GEOMETRYCOLLECTION, LINEARRING)
    - raise ValueError if geometries of mixed dimension are present
      (ie point and polygon)
    - promote all features to multipart if any multipart feature is found
      (drivers like .gdb do not support mixed single/multipart layers)

    :param df: GeoDataFrame with an active geometry column
    :return: GeoDataFrame with standardized geometry column
    """
    # geopandas does not seem to have a st_dimension function,
    # inspect the geometry types via upper-cased type-name strings
    types = {t.upper() for t in df.geometry.geom_type.unique()}
    valid_types = {
        "POINT",
        "LINESTRING",
        "POLYGON",
        "MULTIPOINT",
        "MULTILINESTRING",
        "MULTIPOLYGON",
    }
    unsupported = types - valid_types
    if unsupported:
        # fail for now, but maybe better would be to warn and drop rows
        # having the unsupported type?
        raise ValueError(f"Geometries of type {unsupported} are not supported")
    # strip the MULTI prefix to compare dimensions (POINT / LINESTRING / POLYGON);
    # mixed dimensions cannot be standardized into a single layer
    base_types = {t[5:] if t.startswith("MULTI") else t for t in types}
    if len(base_types) > 1:
        raise ValueError(
            f"Geometries of mixed dimension are not supported: {types}"
        )
    # promote all geometries to multipart if any multipart feature is found
    if types & {"MULTIPOINT", "MULTILINESTRING", "MULTIPOLYGON"}:
        LOG.info("Promoting all features to multipart")

        def _promote(geom):
            # wrap a singlepart geometry in the matching multipart type
            if isinstance(geom, Point):
                return MultiPoint([geom])
            if isinstance(geom, LineString):
                return MultiLineString([geom])
            if isinstance(geom, Polygon):
                return MultiPolygon([geom])
            return geom

        # single pass over the frame, writing to the active geometry column
        # (named "geom" in this pipeline)
        geom_column = df.geometry.name
        df[geom_column] = [_promote(g) for g in df[geom_column]]
    return df


def download(source):
"""
Download data, do some simple validation and standardization
Expand Down Expand Up @@ -110,7 +171,11 @@ def download(source):
# retain only columns noted in config and geom
df = df[list(cleaned_column_map.values()) + ["geom"]]

# check and fix spatial types
df = standardize_spatial_types(df)

# if primary key(s) provided, sort data by key(s)
pks = None
if source["primary_key"]:
pks = [cleaned_column_map[k] for k in source["primary_key"]]
df = df.sort_values(pks)
Expand Down
21 changes: 19 additions & 2 deletions tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def test_invalid_schedule(test_config_file):
fcd.parse_config(sources)


def test_clean_columns(test_config_file):
def test_clean_columns():
sources = [
{
"out_layer": "parks",
Expand All @@ -147,7 +147,7 @@ def test_clean_columns(test_config_file):
assert "airport_name_" in df.columns


def test_hash_pk(test_config_file):
def test_hash_pk():
sources = [
{
"out_layer": "parks",
Expand All @@ -171,3 +171,20 @@ def test_hash_pk(test_config_file):
source = fcd.parse_config(sources)[0]
df = fcd.download(source)
assert df["fcd_load_id"].iloc[0] == "51eac6b471a28"


def test_mixed_types():
    # a single configured layer pointing at a fixture with mixed
    # singlepart/multipart point geometries
    layer_config = [
        {
            "out_layer": "parks",
            "source": "tests/data/mixed_types.geojson",
            "protocol": "http",
            "fields": ["SOURCE_DATA_ID"],
            "schedule": "Q",
        }
    ]
    parsed_source = fcd.parse_config(layer_config)[0]
    downloaded = fcd.download(parsed_source)
    # after standardization, every feature should be promoted to multipart
    observed_types = [t.upper() for t in downloaded["geom"].geom_type.unique()]
    assert observed_types == ["MULTIPOINT"]

0 comments on commit 0edccbe

Please sign in to comment.