Skip to content

Commit

Permalink
Gazetteer improvements:
Browse files Browse the repository at this point in the history
- gazetteer is released as a JSON file to preserve the file structure when reading it
- fill NaN with a temporary placeholder value that is then replaced with None
- cache the data folder during build to speed up releases
  • Loading branch information
panc86 committed Oct 3, 2023
1 parent 7475db4 commit c186d05
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 11 deletions.
16 changes: 13 additions & 3 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,19 @@ jobs:
run: |
python -m pip install --upgrade pip
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# files in ./data do not change unless the
# repo URL is updated and a new file is downloaded
- name: Cache Polygons
id: cache-polygons
uses: actions/cache@v3
with:
path: ./data
key: ${{ runner.os }}-polygons

- name: Build gazetteer
run: |
mkdir ./data
mkdir -p ./data
python build.py
- name: Changelog
uses: scottbrenner/generate-changelog-action@master
Expand All @@ -51,8 +61,8 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./data/gazetteer.csv.zip
asset_name: gazetteer.csv.zip
asset_path: ./data/gazetteer.json.zip
asset_name: gazetteer.json.zip
asset_content_type: application/zip
- name: Upload GADM Regions Asset
id: upload-region-release-asset
Expand Down
18 changes: 10 additions & 8 deletions build.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ def build_place_gazetteer(points: pandas.DataFrame) -> pandas.DataFrame:
ascii_places = points.city_asciiname.rename("place_name_ascii")
# explode_alternate_names
alt_places = pandas.DataFrame(
points.city_alternatenames.fillna('').str.split(",").tolist()
).add_prefix("place_name_alt")
points.city_alternatenames.fillna('tempvalue').str.split(",").tolist()
).replace('tempvalue', None).add_prefix("place_name_alt")
# build gazetteer
return pandas.concat([points.latitude.astype(float), points.longitude.astype(float), places, ascii_places, alt_places], axis=1)

Expand All @@ -160,16 +160,18 @@ def build_region_gazetteer(polygons: pandas.DataFrame) -> pandas.DataFrame:
logging.debug("build region gazetteer")
prefix = "region_"
# concatenate lower levels first so they are stacked to the right of places in the next step
regions = pandas.DataFrame()
varnames = pandas.DataFrame()
for lev in range(4):
col = "VARNAME_{}".format(lev+1)
regions = pandas.concat([
regions,
pandas.DataFrame(polygons[col].fillna('').str.split("|").tolist()).add_prefix("".join([prefix, col, "_alt"]))
varnames = pandas.concat([
varnames,
pandas.DataFrame(
polygons[col].fillna('tempvalue').str.split("|").tolist()
).add_prefix("".join([prefix, col, "_alt"]))
], axis=1)
# build region gazetteer
gazetteer = pandas.concat([
regions,
varnames.replace('tempvalue', None),
polygons.loc[:, ["NAME_5","NAME_4","NL_NAME_3","NAME_3","NL_NAME_2","NAME_2","NL_NAME_1","NAME_1"]].add_prefix(prefix),
polygons.NAME_0,
polygons.GID_0,
Expand All @@ -187,7 +189,7 @@ def build_gazetteer(points: pandas.DataFrame, polygons: pandas.DataFrame) -> Non
places_gazetteer["polygon_index"] = points_in_polygons_lookup(points, polygons)
# stack region names to the right, following the place-size order: place > region_N > Country
gazetteer = places_gazetteer.join(regions_gazetteer, on="polygon_index").drop(columns=["polygon_index"])
gazetteer.to_csv(os.path.join(DATA_PATH, "gazetteer.csv.zip"), index=False)
gazetteer.to_json(os.path.join(DATA_PATH, "gazetteer.json.zip"), orient="records", date_format=None, lines=True, force_ascii=False)


def build_regions(polygons: pandas.DataFrame) -> None:
Expand Down

0 comments on commit c186d05

Please sign in to comment.