-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
283 additions
and
217 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,5 +15,5 @@ poetry install | |
Build gazetteer | ||
|
||
```shell | ||
poetry run python -B build.py | ||
poetry run python -B src/main.py | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import logging | ||
import os | ||
|
||
from geopandas import GeoDataFrame, points_from_xy | ||
import pandas | ||
|
||
from config import CRS, DATA_PATH | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
GAZETTEER_FILEPATH = os.path.join(DATA_PATH, "gazetteer.json.zip") | ||
|
||
|
||
def field_to_frame(field: pandas.Series) -> pandas.DataFrame:
    """Split a comma-separated string column into one column per item.

    Missing values are filled with the sentinel string ``"tempvalue"``
    before splitting, so every row is a string when ``.str.split`` runs.
    The sentinel is left in the output (first column of the affected
    rows); callers are expected to map it back to a null.

    The returned frame keeps ``field``'s index. Without it the frame would
    get a fresh ``RangeIndex`` and an ``axis=1`` concat with columns taken
    from the same source frame would misalign rows whenever that frame
    does not carry a default index.

    Args:
        field: Series of comma-separated strings, possibly with nulls.

    Returns:
        A frame with integer column labels ``0..k-1`` (``k`` being the
        largest number of items found in any row), indexed like ``field``.
    """
    parts = field.fillna("tempvalue").str.split(",").tolist()
    return pandas.DataFrame(parts, index=field.index)
|
||
|
||
def explode_region_names(regions: GeoDataFrame) -> pandas.DataFrame:
    """Expand the ``VARNAME_1`` .. ``VARNAME_4`` columns into one column per name.

    Each source column is split with ``field_to_frame`` and its numbered
    output columns are prefixed with ``REGION_<column>_ALT``. The
    ``"tempvalue"`` placeholder that ``field_to_frame`` inserts for
    missing values is replaced with ``None`` in the combined frame.
    """
    source_columns = ("VARNAME_1", "VARNAME_2", "VARNAME_3", "VARNAME_4")

    exploded_frames = []
    for column in source_columns:
        exploded = field_to_frame(regions[column])
        exploded_frames.append(exploded.add_prefix(f"REGION_{column}_ALT"))

    combined = pandas.concat(exploded_frames, axis=1)
    return combined.replace("tempvalue", None)
|
||
|
||
def build_region_gazetteer(regions: GeoDataFrame) -> GeoDataFrame:
    """Assemble the region side of the gazetteer.

    The result combines, column-wise:
      * ``UID``, ``NAME_0`` and ``GID_0`` unchanged,
      * the ``NAME_*`` / ``NL_NAME_*`` columns prefixed with ``REGION_``,
      * the exploded name columns from ``explode_region_names``.

    The geometry is taken from ``regions`` and the frame is tagged with
    the module-level ``CRS``.
    """
    logger.debug("building regions gazetteer")

    identifier_columns = ["UID", "NAME_0", "GID_0"]
    name_columns = [
        "NAME_1",
        "NL_NAME_1",
        "NAME_2",
        "NL_NAME_2",
        "NAME_3",
        "NL_NAME_3",
        "NAME_4",
        "NAME_5",
    ]

    identifiers = regions.loc[:, identifier_columns]
    prefixed_names = regions.loc[:, name_columns].add_prefix("REGION_")
    exploded_names = explode_region_names(regions)

    attributes = pandas.concat(
        [identifiers, prefixed_names, exploded_names],
        axis=1,
    )
    return GeoDataFrame(attributes, crs=CRS, geometry=regions.geometry)
|
||
|
||
def build_place_gazetteer(places: pandas.DataFrame) -> GeoDataFrame:
    """Assemble the place side of the gazetteer as a point layer.

    Keeps ``latitude``, ``longitude``, ``city_name`` and ``city_asciiname``
    and appends one ``city_altname<i>`` column per entry obtained by
    splitting ``city_alternatenames`` with ``field_to_frame`` (its
    ``"tempvalue"`` placeholder is turned back into ``None``). Point
    geometries are built from the longitude/latitude columns and the
    frame is tagged with the module-level ``CRS``.
    """
    logger.debug("building places gazetteer")

    base_columns = ["latitude", "longitude", "city_name", "city_asciiname"]
    base = places.loc[:, base_columns]

    alternate_names = field_to_frame(places.city_alternatenames)
    alternate_names = alternate_names.replace("tempvalue", None)
    alternate_names = alternate_names.add_prefix("city_altname")

    attributes = pandas.concat([base, alternate_names], axis=1)
    return GeoDataFrame(
        attributes,
        crs=CRS,
        geometry=points_from_xy(places.longitude, places.latitude),
    )
|
||
|
||
def join_places_in_region(places: GeoDataFrame, regions: GeoDataFrame):
    """Attach region attributes to every place.

    Places are first matched to the region polygon they fall within. Any
    place left without a match is then matched to its nearest region.

    Args:
        places: Point layer of places.
        regions: Polygon layer of regions.

    Returns:
        The joined frame with a fresh ``RangeIndex`` and without the
        ``index_right`` helper column added by the spatial join.
    """
    logger.debug("executing point-in-polygon spatial join")
    joined = places.sjoin(regions, how="left", predicate="within")
    missing = joined.index_right.isna()
    if missing.any():
        # Re-join from the original place rows. Re-joining the already
        # joined rows would collide with the (all-null) region columns they
        # carry, the join would suffix them with _left/_right, and the
        # label-aligned assignment below would then find nothing to copy.
        # NOTE(review): assumes ``places`` has unique index labels.
        unmatched = places.loc[joined.index[missing]]
        nearest = unmatched.sjoin_nearest(regions, how="left")
        # Equidistant regions yield several rows per place; keep one so the
        # index stays unique and the assignment can align on it.
        nearest = nearest.loc[~nearest.index.duplicated(keep="first")]
        # Only the columns contributed by the join need filling in.
        region_columns = joined.columns.difference(places.columns)
        joined.loc[missing, region_columns] = nearest.loc[:, region_columns]
    return joined.drop(columns="index_right").reset_index(drop=True)
|
||
|
||
def build_gazetteer(geonames: pandas.DataFrame, gadm: GeoDataFrame) -> pandas.DataFrame:
    """Build the full gazetteer from the place and region inputs.

    Builds the place layer from ``geonames`` and the region layer from
    ``gadm``, joins them with ``join_places_in_region``, lower-cases all
    column names and returns the result without its ``geometry`` column.
    """
    logger.info("building gazetteer")

    place_layer = build_place_gazetteer(geonames)
    region_layer = build_region_gazetteer(gadm)

    gazetteer = join_places_in_region(place_layer, region_layer)
    gazetteer.columns = gazetteer.columns.str.lower()
    return gazetteer.drop(columns="geometry")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import logging.config | ||
import os | ||
import warnings | ||
|
||
import geopandas | ||
import pyogrio | ||
|
||
|
||
# Silence warnings process-wide. No category is given, so this hides every
# warning, not only UserWarning.
warnings.simplefilter("ignore")

# Default coordinate reference system for map projections.
CRS = "EPSG:4326"

# Use pyogrio as the default engine for reading/writing shapes.
geopandas.options.io_engine = "pyogrio"
# Skip GDAL's polygon pre-processing to save time when loading.
pyogrio.set_gdal_config_options(dict(OGR_ORGANIZE_POLYGONS="SKIP"))

# Data directory sits next to this module and is created on import.
DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
os.makedirs(DATA_PATH, exist_ok=True)
|
||
|
||
# Logging setup, applied on import through logging.config.dictConfig.
LOGGING_CONFIG = {
    "version": 1,
    # Keep loggers that were created before this configuration is applied.
    "disable_existing_loggers": False,
    "formatters": {
        "default": {
            "format": "%(asctime)s %(levelname)-8s %(name)s.%(funcName)s %(message)s",
        },
    },
    "handlers": {
        # Single handler writing to standard output.
        "consolehandler": {
            "class": "logging.StreamHandler",
            "stream": "ext://sys.stdout",
            "formatter": "default",
            "level": "DEBUG",
        },
    },
    "loggers": {
        # Has its own handler and does not propagate to the root logger.
        "gazetteer": {
            "handlers": ["consolehandler"],
            "level": "INFO",
            "propagate": False,
        },
        "pyproj": {"level": "WARNING"},
    },
    "root": {
        "handlers": ["consolehandler"],
        "level": "DEBUG",
    },
}
logging.config.dictConfig(LOGGING_CONFIG)
Oops, something went wrong.