diff --git a/src/fit_changedetector/cli.py b/src/fit_changedetector/cli.py index 10d42c4..738df73 100644 --- a/src/fit_changedetector/cli.py +++ b/src/fit_changedetector/cli.py @@ -129,15 +129,12 @@ def process( """For each configured layer - download latest, detect changes, write to file""" configure_logging((verbose - quiet)) - with open(config_file, "r") as f: - config = json.load(f) - # parse config, returning a list of dicts defining layers to process - sources = fcd.parse_config(config) + sources = fcd.parse_config(config_file) # if specified, download only specified layer if layer: - config = [s for s in config if s["layer"] == layer] + sources = [s for s in sources if s["layer"] == layer] # alert if no sources match this schedule if len(sources) == 0: - LOG.warning(f"No layer named {layer} found in {config}") + LOG.warning(f"No layer named {layer} found in {config_file}") diff --git a/src/fit_changedetector/download.py b/src/fit_changedetector/download.py index 7dff0f0..2d3a19d 100644 --- a/src/fit_changedetector/download.py +++ b/src/fit_changedetector/download.py @@ -10,14 +10,31 @@ from geopandas import GeoDataFrame import jsonschema from pyproj import CRS +from shapely.geometry.linestring import LineString +from shapely.geometry.multilinestring import MultiLineString +from shapely.geometry.multipoint import MultiPoint +from shapely.geometry.multipolygon import MultiPolygon +from shapely.geometry.point import Point +from shapely.geometry.polygon import Polygon LOG = logging.getLogger(__name__) -def parse_config(config): - """validate and parse config list of dicts""" +def parse_config(config_file): + """validate and parse supplied config file""" + # read config if a string (file path) is provided + if type(config_file) is str: + with open(config_file, "r") as f: + config = json.load(f) + # for testing, accept a list of dicts directly + elif type(config_file) is list: + config = config_file + else: + raise ValueError( + "config_file must be a path to a file or a list of dicts (sources)" + ) # validate sources against schema doc with 
open("source_schema.json", "r") as f: schema = json.load(f) @@ -42,6 +59,50 @@ def parse_config(config): return parsed +def standardize_spatial_types(df): + """ + introspect spatial types + - fail if multiple dimensions are found (ie point and poly) + - promote to multipart if any multipart feature is found + (drivers like .gdb do not support mixed-types) + """ + types = set([t.upper() for t in df.geometry.geom_type.unique()]) + # geopandas does not seem to have a st_dimension function, + # inspect the types with string comparison + valid_types = set( + [ + "POINT", + "LINESTRING", + "POLYGON", + "MULTIPOINT", + "MULTILINESTRING", + "MULTIPOLYGON", + ] + ) + if types.difference(valid_types): + raise ValueError( + f"Geometries of type {types.difference(valid_types)} are not supported" + ) + # fail for now but maybe better would be to warn and remove all rows having this type? + # df = df[[df["geom"].geom_type != t]] + # promote geometries to multipart if any multipart features are found + if set(types).intersection(set(("MULTIPOINT", "MULTILINESTRING", "MULTIPOLYGON"))): + LOG.info("Promoting all features to multipart") + df["geom"] = [ + MultiPoint([feature]) if isinstance(feature, Point) else feature + for feature in df["geom"] + ] + df["geom"] = [ + MultiLineString([feature]) if isinstance(feature, LineString) else feature + for feature in df["geom"] + ] + df["geom"] = [ + MultiPolygon([feature]) if isinstance(feature, Polygon) else feature + for feature in df["geom"] + ] + return df + + def download(source): """ Download data, do some simple validation and standardization @@ -110,7 +171,11 @@ def download(source): # retain only columns noted in config and geom df = df[list(cleaned_column_map.values()) + ["geom"]] + # check and fix spatial types + df = standardize_spatial_types(df) + # if primary key(s) provided, sort data by key(s) + pks = None if source["primary_key"]: pks = [cleaned_column_map[k] for k in source["primary_key"]] df = df.sort_values(pks) diff 
--git a/tests/test_download.py b/tests/test_download.py index 55eed7c..8b310a9 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -121,7 +121,7 @@ def test_invalid_schedule(test_config_file): fcd.parse_config(sources) -def test_clean_columns(test_config_file): +def test_clean_columns(): sources = [ { "out_layer": "parks", @@ -147,7 +147,7 @@ def test_clean_columns(test_config_file): assert "airport_name_" in df.columns -def test_hash_pk(test_config_file): +def test_hash_pk(): sources = [ { "out_layer": "parks", @@ -171,3 +171,20 @@ def test_hash_pk(test_config_file): source = fcd.parse_config(sources)[0] df = fcd.download(source) assert df["fcd_load_id"].iloc[0] == "51eac6b471a28" + + +def test_mixed_types(): + sources = [ + { + "out_layer": "parks", + "source": "tests/data/mixed_types.geojson", + "protocol": "http", + "fields": [ + "SOURCE_DATA_ID", + ], + "schedule": "Q", + } + ] + source = fcd.parse_config(sources)[0] + df = fcd.download(source) + assert [t.upper() for t in df["geom"].geom_type.unique()] == ["MULTIPOINT"]