Skip to content

Commit

Permalink
handle mixed geometry type responses, tweak parse_config func to acce…
Browse files Browse the repository at this point in the history
…pt file path
  • Loading branch information
smnorris committed Sep 6, 2024
1 parent f88ed7e commit 0edccbe
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 9 deletions.
7 changes: 2 additions & 5 deletions src/fit_changedetector/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,12 @@ def process(
"""For each configured layer - download latest, detect changes, write to file"""
configure_logging((verbose - quiet))

with open(config_file, "r") as f:
config = json.load(f)

# parse config, returning a list of dicts defining layers to process
sources = fcd.parse_config(config)
sources = fcd.parse_config(config_file)

# if specified, download only specified layer
if layer:
config = [s for s in config if s["layer"] == layer]
config = [s for s in sources if s["layer"] == layer]
# alert if no sources match this schedule
if len(sources) == 0:
LOG.warning(f"No layer named {layer} found in {config}")
Expand Down
69 changes: 67 additions & 2 deletions src/fit_changedetector/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,31 @@
from geopandas import GeoDataFrame
import jsonschema
from pyproj import CRS
from shapely.geometry.linestring import LineString
from shapely.geometry.multilinestring import MultiLineString
from shapely.geometry.multipoint import MultiPoint
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry.point import Point
from shapely.geometry.polygon import Polygon


LOG = logging.getLogger(__name__)


def parse_config(config):
"""validate and parse config list of dicts"""
def parse_config(config_file):
"""validate and parse supplied config file"""

# read config if a test string is provided
if type(config_file) is str:
with open(config_file, "r") as f:
config = json.load(f)
# for testing, accept json dict
elif type(config_file) is list:
config = config_file
else:
raise ValueError(
"config_file must be a path to a file or a list of dicts (sources)"
)
# validate sources against schema doc
with open("source_schema.json", "r") as f:
schema = json.load(f)
Expand All @@ -42,6 +59,50 @@ def parse_config(config):
return parsed


def standardize_spatial_types(df):
    """
    Introspect and standardize the geometry types of a GeoDataFrame:

    - raise ValueError if any unsupported geometry type is present
      (eg GEOMETRYCOLLECTION, LINEARRING)
    - raise ValueError if geometries of mixed dimension are present
      (ie point and polygon)
    - promote all features to multipart if any multipart feature is found
      (drivers like .gdb do not support mixed single/multipart layers)

    :param df: GeoDataFrame with an active geometry column
    :return: GeoDataFrame with standardized geometry column
    """
    # geopandas does not seem to have a st_dimension function,
    # inspect the geometry types via upper-cased type-name strings
    types = {t.upper() for t in df.geometry.geom_type.unique()}
    valid_types = {
        "POINT",
        "LINESTRING",
        "POLYGON",
        "MULTIPOINT",
        "MULTILINESTRING",
        "MULTIPOLYGON",
    }
    unsupported = types - valid_types
    if unsupported:
        # fail for now, but maybe better would be to warn and drop rows
        # having the unsupported type?
        raise ValueError(f"Geometries of type {unsupported} are not supported")
    # strip the MULTI prefix to compare dimensions (POINT / LINESTRING / POLYGON);
    # mixed dimensions cannot be standardized into a single layer
    base_types = {t[5:] if t.startswith("MULTI") else t for t in types}
    if len(base_types) > 1:
        raise ValueError(
            f"Geometries of mixed dimension are not supported: {types}"
        )
    # promote all geometries to multipart if any multipart feature is found
    if types & {"MULTIPOINT", "MULTILINESTRING", "MULTIPOLYGON"}:
        LOG.info("Promoting all features to multipart")

        def _promote(geom):
            # wrap a singlepart geometry in the matching multipart type
            if isinstance(geom, Point):
                return MultiPoint([geom])
            if isinstance(geom, LineString):
                return MultiLineString([geom])
            if isinstance(geom, Polygon):
                return MultiPolygon([geom])
            return geom

        # single pass over the frame, writing to the active geometry column
        # (named "geom" in this pipeline)
        geom_column = df.geometry.name
        df[geom_column] = [_promote(g) for g in df[geom_column]]
    return df


def download(source):
"""
Download data, do some simple validation and standardization
Expand Down Expand Up @@ -110,7 +171,11 @@ def download(source):
# retain only columns noted in config and geom
df = df[list(cleaned_column_map.values()) + ["geom"]]

# check and fix spatial types
df = standardize_spatial_types(df)

# if primary key(s) provided, sort data by key(s)
pks = None
if source["primary_key"]:
pks = [cleaned_column_map[k] for k in source["primary_key"]]
df = df.sort_values(pks)
Expand Down
21 changes: 19 additions & 2 deletions tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def test_invalid_schedule(test_config_file):
fcd.parse_config(sources)


def test_clean_columns(test_config_file):
def test_clean_columns():
sources = [
{
"out_layer": "parks",
Expand All @@ -147,7 +147,7 @@ def test_clean_columns(test_config_file):
assert "airport_name_" in df.columns


def test_hash_pk(test_config_file):
def test_hash_pk():
sources = [
{
"out_layer": "parks",
Expand All @@ -171,3 +171,20 @@ def test_hash_pk(test_config_file):
source = fcd.parse_config(sources)[0]
df = fcd.download(source)
assert df["fcd_load_id"].iloc[0] == "51eac6b471a28"


def test_mixed_types():
    # a single configured layer pointing at a fixture with mixed
    # singlepart/multipart point geometries
    layer_config = [
        {
            "out_layer": "parks",
            "source": "tests/data/mixed_types.geojson",
            "protocol": "http",
            "fields": ["SOURCE_DATA_ID"],
            "schedule": "Q",
        }
    ]
    parsed_source = fcd.parse_config(layer_config)[0]
    downloaded = fcd.download(parsed_source)
    # after standardization, every feature should be promoted to multipart
    observed_types = [t.upper() for t in downloaded["geom"].geom_type.unique()]
    assert observed_types == ["MULTIPOINT"]

0 comments on commit 0edccbe

Please sign in to comment.