Skip to content

Commit

Permalink
organize code into modules
Browse files Browse the repository at this point in the history
  • Loading branch information
panc86 committed May 30, 2024
1 parent 4a69ff7 commit 8860085
Show file tree
Hide file tree
Showing 8 changed files with 283 additions and 217 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ poetry install
Build gazetteer

```shell
poetry run python -B build.py
poetry run python -B src/main.py
```
212 changes: 0 additions & 212 deletions build.py

This file was deleted.

5 changes: 1 addition & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
[tool.poetry]
name = "gazetteer"
version = "0.2.0"
version = "0.3.0"
description = "Gazetteer from GADM regions and Geonames cities 15K"
authors = ["Emanuele Panizio <panc86@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
requests = "^2.31.0"
geopandas = "^0.14.3"
pandas = "^2.2.1"
pyogrio = "^0.7.2"
Rtree = "^1.2.0"
tqdm = "^4.66.2"

[tool.poetry.group.dev.dependencies]
ipykernel = "^6.29.4"
Expand Down
97 changes: 97 additions & 0 deletions src/build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import logging
import os

from geopandas import GeoDataFrame, points_from_xy
import pandas

from config import CRS, DATA_PATH


# Module-level logger, named after this module's import name.
logger = logging.getLogger(__name__)


# Path of the zipped JSON gazetteer file inside the configured data directory.
# Not referenced by the functions visible in this module.
GAZETTEER_FILEPATH = os.path.join(DATA_PATH, "gazetteer.json.zip")


def field_to_frame(field: pandas.Series) -> pandas.DataFrame:
    """Split a comma-separated text column into one column per item.

    Missing entries are replaced with the placeholder string ``"tempvalue"``
    before splitting, so every row yields a list; callers are expected to
    replace the placeholder afterwards. Rows with fewer items than the widest
    row are padded with missing values.

    Args:
        field: Series of comma-separated strings (may contain missing values).

    Returns:
        A DataFrame with integer column labels ``0..k-1`` and the same index
        as ``field``.
    """
    items = field.fillna("tempvalue").str.split(",")
    # Keep the caller's index: the result is concatenated column-wise with
    # other frames, and a fresh RangeIndex would misalign the rows whenever
    # ``field`` is not indexed 0..n-1.
    return pandas.DataFrame(items.tolist(), index=field.index)


def explode_region_names(regions: GeoDataFrame) -> pandas.DataFrame:
    """Expand the ``VARNAME_*`` columns into one column per alternative name.

    Each of ``VARNAME_1`` .. ``VARNAME_4`` is split on commas with
    ``field_to_frame`` and its columns are labelled
    ``REGION_<field>_ALT<n>``. The per-field frames are placed side by side
    and the ``"tempvalue"`` placeholder is replaced with ``None``.
    """
    exploded = []
    for field in ("VARNAME_1", "VARNAME_2", "VARNAME_3", "VARNAME_4"):
        alternatives = field_to_frame(regions[field])
        exploded.append(alternatives.add_prefix(f"REGION_{field}_ALT"))
    side_by_side = pandas.concat(exploded, axis=1)
    return side_by_side.replace("tempvalue", None)


def build_region_gazetteer(regions: GeoDataFrame) -> GeoDataFrame:
    """Assemble the region side of the gazetteer.

    Combines, column-wise, the identifier columns (``UID``, ``NAME_0``,
    ``GID_0``), the region name columns prefixed with ``REGION_``, and the
    exploded alternative names, then attaches the geometry of ``regions``
    using the configured CRS.
    """
    logger.debug("building regions gazetteer")
    identifier_columns = ["UID", "NAME_0", "GID_0"]
    name_columns = [
        "NAME_1",
        "NL_NAME_1",
        "NAME_2",
        "NL_NAME_2",
        "NAME_3",
        "NL_NAME_3",
        "NAME_4",
        "NAME_5",
    ]
    identifiers = regions.loc[:, identifier_columns]
    names = regions.loc[:, name_columns].add_prefix("REGION_")
    alternative_names = explode_region_names(regions)
    attributes = pandas.concat([identifiers, names, alternative_names], axis=1)
    return GeoDataFrame(attributes, crs=CRS, geometry=regions.geometry)


def build_place_gazetteer(places: pandas.DataFrame) -> GeoDataFrame:
    """Assemble the place side of the gazetteer.

    Keeps the coordinate and city name columns, adds one ``city_altname<n>``
    column per comma-separated entry of ``city_alternatenames``, and builds
    point geometries from longitude/latitude using the configured CRS.
    """
    logger.debug("building places gazetteer")
    core = places.loc[:, ["latitude", "longitude", "city_name", "city_asciiname"]]
    alternate_names = field_to_frame(places["city_alternatenames"])
    alternate_names = alternate_names.replace("tempvalue", None)
    alternate_names = alternate_names.add_prefix("city_altname")
    attributes = pandas.concat([core, alternate_names], axis=1)
    points = points_from_xy(places["longitude"], places["latitude"])
    return GeoDataFrame(attributes, crs=CRS, geometry=points)


def join_places_in_region(places: GeoDataFrame, regions: GeoDataFrame):
    """Attach region attributes to each place with a spatial join.

    A place that lies within a region polygon takes that region's attributes.
    A place that lies within no polygon is matched to its nearest region
    instead.

    Args:
        places: Point geometries with place attributes. Assumed to have a
            unique index and no column names in common with ``regions``
            other than the geometry column -- TODO confirm against callers.
        regions: Polygon geometries with region attributes.

    Returns:
        The places left-joined with the region attributes, without the
        ``index_right`` helper column and with a fresh 0..n-1 index.
    """
    logger.debug("executing point-in-polygon spatial join")
    joined = places.sjoin(regions, how="left", predicate="within")
    # Positional mask: ``joined`` can repeat index labels when a place lies
    # within several polygons, so avoid label-aligned boolean indexing.
    missing = joined["index_right"].isna().to_numpy()
    if missing.any():
        # Run the fallback on the original place rows. Re-joining rows taken
        # from ``joined`` would carry the (empty) region columns along; they
        # clash with the columns of ``regions``, get ``_left``/``_right``
        # suffixes, and no longer line up with ``joined`` on assignment,
        # which leaves the region attributes unfilled.
        unmatched = places.loc[joined.index[missing]]
        nearest = unmatched.sjoin_nearest(regions, how="left")
        # Equidistant regions yield several rows per place; keep the first so
        # the assignment below aligns on a unique index.
        nearest = nearest.loc[~nearest.index.duplicated(keep="first")]
        # Only the columns contributed by the join need filling in.
        region_columns = nearest.columns.difference(places.columns)
        joined.loc[missing, region_columns] = nearest.loc[:, region_columns]
    return joined.drop(columns="index_right").reset_index(drop=True)


def build_gazetteer(
    geonames: pandas.DataFrame, gadm: GeoDataFrame
) -> pandas.DataFrame:
    """Build the gazetteer table from places and regions.

    Builds the place and region frames, joins each place to a region,
    lower-cases every column name, and returns the result without the
    ``geometry`` column.
    """
    logger.info("building gazetteer")
    places = build_place_gazetteer(geonames)
    regions = build_region_gazetteer(gadm)
    located = join_places_in_region(places, regions)
    located.columns = located.columns.str.lower()
    return located.drop(columns="geometry")
50 changes: 50 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import logging.config
import os
import warnings

import geopandas
import pyogrio


# Silence warnings process-wide. No category is passed, so this ignores
# every warning category, not only UserWarning.
warnings.filterwarnings("ignore")

# Default coordinate reference system applied to the geometries.
CRS = "EPSG:4326"

# set default shapes IO engine
geopandas.options.io_engine = "pyogrio"
# do not preprocess polygons to save time
pyogrio.set_gdal_config_options({"OGR_ORGANIZE_POLYGONS": "SKIP"})


# Data directory located next to this module; created at import time if it
# does not exist yet.
DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
os.makedirs(DATA_PATH, exist_ok=True)


# Logging setup in the ``logging.config.dictConfig`` schema: one stdout
# handler shared by the root logger (DEBUG) and the "gazetteer" logger (INFO,
# not propagated); "pyproj" is limited to WARNING and above.
# NOTE(review): loggers obtained with ``logging.getLogger(__name__)`` only
# pick up the "gazetteer" settings if their module name is "gazetteer" or
# starts with "gazetteer." -- confirm how the modules are imported; otherwise
# they fall through to the root logger at DEBUG.
LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {
            "format": "%(asctime)s %(levelname)-8s %(name)s.%(funcName)s %(message)s"
        },
    },
    "handlers": {
        "consolehandler": {
            "level": "DEBUG",
            "formatter": "default",
            "class": "logging.StreamHandler",
            "stream": "ext://sys.stdout",
        },
    },
    "loggers": {
        "gazetteer": {"handlers": ["consolehandler"], "level": "INFO", "propagate": False},
        "pyproj": {"level": "WARNING"}
    },
    "root": {
        "level": "DEBUG",
        "handlers": ["consolehandler"]
    }
}
# Applied at import time, so importing this module configures logging.
logging.config.dictConfig(LOGGING_CONFIG)
Loading

0 comments on commit 8860085

Please sign in to comment.