diff --git a/README.md b/README.md
index 82256cf..cb7d4b5 100644
--- a/README.md
+++ b/README.md
@@ -15,5 +15,5 @@ poetry install
 Build gazetteer
 
 ```shell
-poetry run python -B build.py
+poetry run python -B src/main.py
 ```
diff --git a/build.py b/build.py
deleted file mode 100644
index c347642..0000000
--- a/build.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import argparse
-import logging
-import warnings
-import os
-
-from geopandas import GeoDataFrame, points_from_xy
-import pandas
-import pyogrio
-import requests
-from tqdm import tqdm
-
-
-# remove user warnings
-warnings.filterwarnings("ignore", category=UserWarning)
-pyogrio.set_gdal_config_options({"OGR_ORGANIZE_POLYGONS": "SKIP"})
-
-
-# data path
-DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
-os.makedirs(DATA_PATH, exist_ok=True)
-
-
-GEONAMES_CITIES_15K_REMOTE_URL = "http://download.geonames.org/export/dump/cities15000.zip"
-GADM_REMOTE_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_410-gdb.zip"
-GADM_FILE_NAME = os.path.basename(GADM_REMOTE_URL)
-GADM_LOCAL_ARCHIVE_PATH = os.path.join(DATA_PATH, GADM_FILE_NAME)
-# allow reading a compressed geodatabase with pyogrio
-GADM_LOCAL_PYOGRIO_PATH = (
-    f"zip:{GADM_LOCAL_ARCHIVE_PATH}!/{GADM_FILE_NAME.replace('-gdb.zip', '.gdb')}"
-)
-
-
-def download_geometries(url: str, output_path: str):
-    logging.debug(dict(source=url, destination=output_path))
-    with requests.get(url, stream=True) as r:
-        r.raise_for_status()
-        chunk_size = 1024
-        total_bytes = int(r.headers.get("content-length", 0))
-        with open(output_path, "wb") as f:
-            with tqdm(
-                unit="B",
-                unit_scale=True,
-                unit_divisor=chunk_size,
-                miniters=1,
-                desc=output_path,
-                total=total_bytes,
-            ) as p:
-                for chunk in r.iter_content(chunk_size=chunk_size * 10):
-                    f.write(chunk)
-                    p.update(len(chunk))
-
-
-def remove_regions(df: GeoDataFrame) -> GeoDataFrame:
-    logging.debug("remove unused GADM regions")
-    to_drop = ["Antarctica", "Caspian Sea"]
-    return df.loc[~df.NAME_0.isin(to_drop), :].copy()
-
-
-def update_polygon_meta(df: GeoDataFrame) -> GeoDataFrame:
-    logging.debug("update missing GADM region meta")
-    # Ukraine
-    df.loc[
-        (df.NAME_1 == "?").idxmax(), ["NAME_1", "NL_NAME_1", "HASC_1", "ENGTYPE_1"]
-    ] = ["Kiev City", "Київ", "UA.KC", "Independent City"]
-    return df
-
-
-def fill_missing_region_names(df: GeoDataFrame) -> GeoDataFrame:
-    logging.debug("fill missing GADM region names with country name")
-    regionless = df.NAME_1.isna()
-    df.loc[regionless, "NAME_1"] = df.loc[regionless, "NAME_0"]
-    return df
-
-
-def rename_region_name_attributes(df: GeoDataFrame) -> GeoDataFrame:
-    logging.debug("rename wrong GADM region name attributes")
-    mapping = {
-        "Apulia": "Puglia",
-        "Sicily": "Sicilia",
-    }
-    df.NAME_1 = df.NAME_1.replace(mapping)
-    return df
-
-
-def load_regions(filepath: str) -> GeoDataFrame:
-    logging.info("loading {}".format(filepath))
-    return pyogrio.read_dataframe(filepath)
-
-
-def build_regions(gadm: GeoDataFrame) -> GeoDataFrame:
-    logging.info("building regions")
-    return (
-        gadm
-        .pipe(remove_regions)  # remove inhabitated polygons e.g. Antartica
-        .pipe(update_polygon_meta)  # e.g. Ukraine has missing data
-        .pipe(fill_missing_region_names)  # small countries are regionless
-        .pipe(rename_region_name_attributes)  # rename regions using local knownledge
-    )
-
-
-def load_cities(path: str) -> pandas.DataFrame:
-    """
-    Geonames DB @ http://download.geonames.org/export/dump/cities15000.zip
-    """
-    logging.debug("loading geonamnes cities15000 points")
-    fields = [
-        # from cities15000.txt (Geonames)
-        "city_id",
-        "city_name",
-        "city_asciiname",
-        "city_alternatenames",
-        "latitude",
-        "longitude",
-        "feature_class",
-        "feature_code",
-        "country_a2",
-        "cc2",
-        "admin1",
-        "admin2",
-        "admin3",
-        "admin4",
-        "population",
-        "elevation",
-        "dem",
-        "timezone",
-        "modification_date",
-    ]
-    return pandas.read_csv(path, sep="\t", names=fields)
-
-
-def points_in_polygons_lookup(points, polygons) -> pandas.Series:
-    logging.debug("executing point-in-polygon spatial join")
-    # exec spatial join and return index_right i.e. the polygon ID containing the point
-    polygons_geo = polygons[["geometry"]]
-    # build geometry from latitude/longitude set
-    points_geo = GeoDataFrame(geometry=points_from_xy(points.longitude, points.latitude), crs=polygons.crs)
-    # spatial join concatenate features with predicate. index_right is the index of polygons containing the points
-    spatial_lookup = points_geo.sjoin(polygons_geo, how="left", predicate="within")["index_right"]
-    # if NaN in data not all points were within a polygon e.g. coastal cities off-shore
-    not_in_polygon = spatial_lookup.isna()
-    if not_in_polygon.any():
-        # nearest spatial join
-        spatial_lookup[not_in_polygon] = points_geo[not_in_polygon].sjoin_nearest(polygons_geo, how="left")["index_right"]
-    return spatial_lookup.astype(int)
-
-
-def build_place_gazetteer(points: pandas.DataFrame) -> pandas.DataFrame:
-    logging.debug("build place gazetteer")
-    # extract places
-    places = points.city_name.rename("place_name")
-    ascii_places = points.city_asciiname.rename("place_name_ascii")
-    # explode_alternate_names
-    alt_places = pandas.DataFrame(
-        points.city_alternatenames.fillna('tempvalue').str.split(",").tolist()
-    ).replace('tempvalue', None).add_prefix("place_name_alt")
-    # build gazetteer
-    return pandas.concat([points.latitude.astype(float), points.longitude.astype(float), places, ascii_places, alt_places], axis=1)
-
-
-def build_region_gazetteer(polygons: pandas.DataFrame) -> pandas.DataFrame:
-    logging.debug("build region gazetteer")
-    prefix = "region_"
-    # concatenate lower level first to be stacked right to places in next step
-    varnames = pandas.DataFrame()
-    for lev in range(4):
-        col = "VARNAME_{}".format(lev+1)
-        varnames = pandas.concat([
-            varnames,
-            pandas.DataFrame(
-                polygons[col].fillna('tempvalue').str.split("|").tolist()
-            ).add_prefix("".join([prefix, col, "_alt"]))
-        ], axis=1)
-    # build region gazetteer
-    gazetteer = pandas.concat([
-        varnames.replace('tempvalue', None),
-        polygons.loc[:, ["NAME_5","NAME_4","NL_NAME_3","NAME_3","NL_NAME_2","NAME_2","NL_NAME_1","NAME_1"]].add_prefix(prefix),
-        polygons.NAME_0.rename("country_name"),
-        polygons.GID_0.rename("country_iso3"),
-        polygons.UID,
-    ], axis=1)
-    # normalize column names
-    gazetteer.columns = gazetteer.columns.str.lower()
-    return gazetteer
-
-
-def build_gazetteer(points: pandas.DataFrame, polygons: pandas.DataFrame) -> None:
-    logging.info("building gazetteer")
-    places_gazetteer = build_place_gazetteer(points)
-    regions_gazetteer = build_region_gazetteer(polygons)
-    places_gazetteer["polygon_index"] = points_in_polygons_lookup(points, polygons)
-    # stack region names right to follow the place size logic place > region_N > Country
-    gazetteer = places_gazetteer.join(regions_gazetteer, on="polygon_index").drop(columns=["polygon_index"])
-    gazetteer.to_json(os.path.join(DATA_PATH, "gazetteer.json.zip"), orient="records", date_format=None, lines=True, force_ascii=False)
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Build gazetteer using GADM and Geonames data")
-    parser.add_argument("--debug", action="store_true", default=False)
-    args = parser.parse_args()
-    # setup root logger level
-    logging.root.setLevel(logging.DEBUG if args.debug else logging.INFO)
-    if not os.path.exists(GADM_LOCAL_ARCHIVE_PATH):
-        download_geometries(GADM_REMOTE_URL, GADM_LOCAL_ARCHIVE_PATH)
-    geonames_cities = load_cities(GEONAMES_CITIES_15K_REMOTE_URL)
-    gadm_regions = build_regions(load_regions(GADM_LOCAL_PYOGRIO_PATH))
-    # build gazetteer
-    build_gazetteer(geonames_cities, gadm_regions)
-
-
-if __name__ == "__main__":
-    main()
-
diff --git a/pyproject.toml b/pyproject.toml
index fd93b72..fcf2c12 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,18 +1,15 @@
 [tool.poetry]
 name = "gazetteer"
-version = "0.2.0"
+version = "0.3.0"
 description = "Gazetteer from GADM regions and Geonames cities 15K"
 authors = ["Emanuele Panizio "]
 readme = "README.md"
 
 [tool.poetry.dependencies]
 python = "^3.10"
-requests = "^2.31.0"
 geopandas = "^0.14.3"
 pandas = "^2.2.1"
 pyogrio = "^0.7.2"
-Rtree = "^1.2.0"
-tqdm = "^4.66.2"
 
 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.29.4"
diff --git a/src/build.py b/src/build.py
new file mode 100644
index 0000000..321201e
--- /dev/null
+++ b/src/build.py
@@ -0,0 +1,97 @@
+import logging
+import os
+
+from geopandas import GeoDataFrame, points_from_xy
+import pandas
+
+from config import CRS, DATA_PATH
+
+
+logger = logging.getLogger(__name__)
+
+
+GAZETTEER_FILEPATH = os.path.join(DATA_PATH, "gazetteer.json.zip")
+
+
+def field_to_frame(field: pandas.Series, sep: str = ",") -> pandas.DataFrame:
+    return pandas.DataFrame(field.fillna("tempvalue").str.split(sep).tolist())
+
+
+def explode_region_names(regions: GeoDataFrame) -> pandas.DataFrame:
+    fields = ["VARNAME_1", "VARNAME_2", "VARNAME_3", "VARNAME_4"]
+    return pandas.concat(
+        [
+            # GADM alternate names are pipe-separated, unlike Geonames ones
+            field_to_frame(regions[field], sep="|").add_prefix(f"REGION_{field}_ALT")
+            for field in fields
+        ],
+        axis=1,
+    ).replace("tempvalue", None)
+
+
+def build_region_gazetteer(regions: GeoDataFrame) -> GeoDataFrame:
+    logger.debug("building regions gazetteer")
+    return GeoDataFrame(
+        pandas.concat(
+            [
+                regions.loc[:, ["UID", "NAME_0", "GID_0"]],
+                regions.loc[
+                    :,
+                    [
+                        "NAME_1",
+                        "NL_NAME_1",
+                        "NAME_2",
+                        "NL_NAME_2",
+                        "NAME_3",
+                        "NL_NAME_3",
+                        "NAME_4",
+                        "NAME_5",
+                    ],
+                ].add_prefix("REGION_"),
+                explode_region_names(regions),
+            ],
+            axis=1,
+        ),
+        crs=CRS,
+        geometry=regions.geometry,
+    )
+
+
+def build_place_gazetteer(places: pandas.DataFrame) -> GeoDataFrame:
+    logger.debug("building places gazetteer")
+    features = ["latitude", "longitude", "city_name", "city_asciiname"]
+    return GeoDataFrame(
+        pandas.concat(
+            [
+                places.loc[:, features],
+                field_to_frame(places.city_alternatenames)
+                .replace("tempvalue", None)
+                .add_prefix("city_altname"),
+            ],
+            axis=1,
+        ),
+        crs=CRS,
+        geometry=points_from_xy(places.longitude, places.latitude),
+    )
+
+
+def join_places_in_region(places: GeoDataFrame, regions: GeoDataFrame):
+    logger.debug("executing point-in-polygon spatial join")
+    joined = places.sjoin(regions, how="left", predicate="within")
+    missing = joined.index_right.isna()
+    if missing.any():
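+        # fall back to a nearest spatial join for points outside every polygon
+        # (e.g. coastal cities off-shore); join on the place columns only, so the
+        # empty region columns from the first join cannot clash or misalign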
+        joined.loc[missing, :] = (
+            joined.loc[missing, places.columns]
+            .sjoin_nearest(regions, how="left")
+        )
+    return joined.drop(columns="index_right").reset_index(drop=True)
+
+
+def build_gazetteer(geonames: pandas.DataFrame, gadm: GeoDataFrame) -> pandas.DataFrame:
+    logger.info("building gazetteer")
+    gazetteer = join_places_in_region(
+        build_place_gazetteer(geonames),
+        build_region_gazetteer(gadm),
+    )
+    gazetteer.columns = gazetteer.columns.str.lower()
+    return gazetteer.drop(columns="geometry")
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..21ceff5
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,50 @@
+import logging.config
+import os
+import warnings
+
+import geopandas
+import pyogrio
+
+
+# silence warnings
+warnings.filterwarnings("ignore")
+
+# set default map projection (WGS84)
+CRS = "EPSG:4326"
+
+# set default shapes IO engine
+geopandas.options.io_engine = "pyogrio"
+# do not preprocess polygons to save time
+pyogrio.set_gdal_config_options({"OGR_ORGANIZE_POLYGONS": "SKIP"})
+
+
+DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
+os.makedirs(DATA_PATH, exist_ok=True)
+
+
+LOGGING_CONFIG = {
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "default": {
+            "format": "%(asctime)s %(levelname)-8s %(name)s.%(funcName)s %(message)s"
+        },
+    },
+    "handlers": {
+        "consolehandler": {
+            "level": "DEBUG",
+            "formatter": "default",
+            "class": "logging.StreamHandler",
+            "stream": "ext://sys.stdout",
+        },
+    },
+    "loggers": {
+        "gazetteer": {"handlers": ["consolehandler"], "level": "INFO", "propagate": False},
+        "pyproj": {"level": "WARNING"},
+    },
+    "root": {
+        "level": "DEBUG",
+        "handlers": ["consolehandler"],
+    },
+}
+logging.config.dictConfig(LOGGING_CONFIG)
diff --git a/src/gadm.py b/src/gadm.py
new file mode 100644
index 0000000..5e107bb
--- /dev/null
+++ b/src/gadm.py
@@ -0,0 +1,66 @@
+import logging
+import os
+
+import geopandas
+from geopandas import GeoDataFrame
+
+
+logger = logging.getLogger(__name__)
+
+
+GADM_REMOTE_URL = "https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_410-gdb.zip"
+
+
+def read_shapefile(filepath: str) -> geopandas.GeoDataFrame:
+    logger.debug("loading geometries from {}".format(filepath))
+    return geopandas.read_file(filepath)
+
+
+def build_gadm_filepath(url: str = GADM_REMOTE_URL) -> str:
+    return f"zip+{url}!/{os.path.basename(url).replace('-gdb.zip', '.gdb')}"
+
+
+def remove_regions(df: GeoDataFrame) -> GeoDataFrame:
+    logger.debug("remove unused GADM regions")
+    to_drop = ["Antarctica", "Caspian Sea"]
+    return df.loc[~df.NAME_0.isin(to_drop), :].copy()
+
+
+def update_unknown_region_meta(df: GeoDataFrame) -> GeoDataFrame:
+    logger.debug("update unknown region meta")
+    # Ukraine
+    df.loc[
+        (df.NAME_1 == "?").idxmax(), ["NAME_1", "NL_NAME_1", "HASC_1", "ENGTYPE_1"]
+    ] = ["Kiev City", "Київ", "UA.KC", "Independent City"]
+    return df
+
+
+def fill_missing_region_names(df: GeoDataFrame) -> GeoDataFrame:
+    logger.debug("fill missing GADM region names with country name")
+    regionless = df.NAME_1.isna()
+    df.loc[regionless, "NAME_1"] = df.loc[regionless, "NAME_0"]
+    return df
+
+
+def rename_region_name_attributes(df: GeoDataFrame) -> GeoDataFrame:
+    logger.debug("rename wrong GADM region name attributes")
+    mapping = {
+        "Apulia": "Puglia",
+        "Sicily": "Sicilia",
+    }
+    df.NAME_1 = df.NAME_1.replace(mapping)
+    return df
+
+
+def prepare_regions(regions: GeoDataFrame) -> GeoDataFrame:
+    return GeoDataFrame(
+        regions.pipe(remove_regions)  # remove uninhabited polygons e.g. Antarctica
+        .pipe(update_unknown_region_meta)  # e.g. Ukraine has missing data
+        .pipe(fill_missing_region_names)  # small countries are regionless
+        .pipe(rename_region_name_attributes)  # rename regions using local knowledge
+        .reset_index(drop=True)
+    )
+
+
+def load_gadm(url: str = GADM_REMOTE_URL) -> GeoDataFrame:
+    return prepare_regions(read_shapefile(build_gadm_filepath(url)))
diff --git a/src/geonames.py b/src/geonames.py
new file mode 100644
index 0000000..daf836b
--- /dev/null
+++ b/src/geonames.py
@@ -0,0 +1,34 @@
+import logging
+
+import pandas
+
+
+logger = logging.getLogger(__name__)
+
+
+GEONAMES_REMOTE_URL = "https://download.geonames.org/export/dump/cities15000.zip"
+
+
+def load_geonames(url: str = GEONAMES_REMOTE_URL) -> pandas.DataFrame:
+    fields = [
+        "city_id",
+        "city_name",
+        "city_asciiname",
+        "city_alternatenames",
+        "latitude",
+        "longitude",
+        "feature_class",
+        "feature_code",
+        "country_a2",
+        "cc2",
+        "admin1",
+        "admin2",
+        "admin3",
+        "admin4",
+        "population",
+        "elevation",
+        "dem",
+        "timezone",
+        "modification_date",
+    ]
+    return pandas.read_csv(url, sep="\t", names=fields)
diff --git a/src/main.py b/src/main.py
new file mode 100644
index 0000000..0add771
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,34 @@
+from argparse import ArgumentParser
+import logging
+
+from build import build_gazetteer, GAZETTEER_FILEPATH
+from gadm import GADM_REMOTE_URL, load_gadm
+from geonames import GEONAMES_REMOTE_URL, load_geonames
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = ArgumentParser(
+        description="Build gazetteer using GADM and Geonames data"
+    )
+    parser.add_argument("--debug", action="store_true", default=False)
+    parser.add_argument("--gadm-url", default=GADM_REMOTE_URL, help="GADM archive URL (default: %(default)s)")
+    parser.add_argument("--geonames-url", default=GEONAMES_REMOTE_URL, help="Geonames cities15000 URL (default: %(default)s)")
+    args = parser.parse_args()
+    # setup root logger level so module loggers inherit it
+    logging.root.setLevel(logging.DEBUG if args.debug else logging.INFO)
+    build_gazetteer(
+        load_geonames(args.geonames_url),
+        load_gadm(args.gadm_url),
+    ).to_json(
+        GAZETTEER_FILEPATH,
+        force_ascii=False,
+        lines=True,
+        orient="records",
+    )
+    logger.info("done")
+
+
+if __name__ == "__main__":
+    main()
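After the refactor the build is driven by `src/main.py` (see the README hunk above). The sketch below is a minimal, hypothetical example of inspecting the file it writes; the path assumes the default `DATA_PATH` from `src/config.py` (a `data/` directory created next to the sources, so the output lands at `src/data/gazetteer.json.zip`), and the column names are the lowercased fields produced by `build_gazetteer`.

```python
# Minimal sketch (not part of the diff): read back the gazetteer written by
# `poetry run python -B src/main.py`. Path and columns are assumptions based on
# src/config.py (DATA_PATH) and src/build.py (lowercased GADM/Geonames fields).
import pandas

gazetteer = pandas.read_json(
    "src/data/gazetteer.json.zip",  # zip-compressed, newline-delimited JSON records
    orient="records",
    lines=True,
)
print(gazetteer.loc[:, ["city_name", "region_name_1", "name_0", "gid_0"]].head())
```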