Skip to content

Commit

Permalink
Gazetteer v0.1.1
Browse files Browse the repository at this point in the history
- release as zip file
- release gadm regions shapefile for products generations
- fix region naming issues
  • Loading branch information
panc86 committed Sep 30, 2023
1 parent 8ec6a53 commit d0f74e4
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 128 deletions.
22 changes: 16 additions & 6 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,23 @@ jobs:
${{ steps.Changelog.outputs.changelog }}
draft: false
prerelease: false
- name: Upload Release Asset
id: upload-release-asset
- name: Upload Gazetteer Asset
id: upload-gazetteer-release-asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.create_release.outputs.upload_url }} # This pulls from the CREATE RELEASE step above, referencing it's ID to get its outputs object, which include a `upload_url`. See this blog post for more info: https://jasonet.co/posts/new-features-of-github-actions/#passing-data-to-future-steps
asset_path: ./data/gazetteer.csv
asset_name: gazetteer.csv
asset_content_type: text/csv
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./data/gazetteer.csv.zip
asset_name: gazetteer.csv.zip
asset_content_type: application/zip
- name: Upload GADM Regions Asset
id: upload-region-release-asset
uses: actions/upload-release-asset@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
upload_url: ${{ steps.create_release.outputs.upload_url }}
asset_path: ./data/gadm41_regions.shp.zip
asset_name: gadm41_regions.shp.zip
asset_content_type: application/zip
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ __pycache__
*.egg-info
*.ipynb
*.ipynb_checkpoints
.pyenv
.pyve

# Release
data/
Expand Down
251 changes: 130 additions & 121 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,114 +1,99 @@
import logging
import os
import numpy
import pandas
import zipfile
import requests
import warnings

os.environ['USE_PYGEOS'] = '0'
import geopandas

from tqdm import tqdm

# paths
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
# remove user warnings
warnings.filterwarnings("ignore", category=UserWarning)

# set fiona logging level to error to reduce verbosity
logging.getLogger("fiona").setLevel(logging.ERROR)

def download_data(url, filepath):
# data path
DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


def download_geometries(url: str) -> str:
"""Download external data."""
path = os.path.join(DATA_PATH, os.path.basename(url))
logging.info("⏳ downloading from {url} to {path}".format(url=url, path=path))
if os.path.exists(path):
return path
with requests.get(url, stream=True) as r:
r.raise_for_status()
chunk_size = 1024
total_bytes = int(r.headers.get('content-length', 0))
with open(filepath, 'wb') as f:
with tqdm(unit='B', unit_scale=True, unit_divisor=chunk_size, miniters=1, desc=filepath, total=total_bytes) as p:
with open(path, 'wb') as f:
with tqdm(unit='B', unit_scale=True, unit_divisor=chunk_size, miniters=1, desc=path, total=total_bytes) as p:
for chunk in r.iter_content(chunk_size=chunk_size*10):
f.write(chunk)
p.update(len(chunk))
return path


def extract_data(filepath, fn):
# extract
def unzip_gadm(filepath):
with zipfile.ZipFile(filepath, 'r') as archive:
for f in archive.infolist():
f.filename = fn
archive.extract(f, path=data_dir)
archive.extractall(path=os.path.dirname(filepath))
logging.debug("unzipped {}".format(filepath))
return filepath.replace("-gdb.zip", ".gdb")


def load_gadm(path: str) -> pandas.DataFrame:
return geopandas.read_file(path, driver='FileGDB')


def remove_polygons(df: pandas.DataFrame) -> pandas.DataFrame:
"""Remove polygons without social media activity"""
to_drop = ["Antartica", "Caspian Sea"]
return df.loc[~df.NAME_0.isin(to_drop), :].copy()


def apply_polygon_names_mapping(polygons):
"""Fix wrong polygon name attributes"""
def update_polygon_meta(df: pandas.DataFrame) -> pandas.DataFrame:
"""Update missing metadata"""
# Ukraine
df.loc[(df.NAME_1 == "?").idxmax(), ["NAME_1","NL_NAME_1","HASC_1","ENGTYPE_1"]] = ["Kiev City", "Київ", "UA.KC", "Independent City"]
return df


def fill_missing_region_names(df: pandas.DataFrame) -> pandas.DataFrame:
"""Fill missing region names with country name"""
regionless = df.NAME_1.isna()
df.loc[regionless, "NAME_1"] = df.loc[regionless, "NAME_0"]
return df


def rename_region_name_attributes(df: pandas.DataFrame) -> pandas.DataFrame:
"""Rename wrong region name attributes"""
mapping = {
"Apulia": "Puglia",
"Sicily": "Sicilia",
}
polygons.NAME_1 = polygons.NAME_1.replace(mapping)
return polygons


def drop_polygons(polygons):
to_drop = [
"Antarctica",
]
indices = polygons[polygons.NAME_0.isin(to_drop)].index
return polygons.drop(indices)
df.NAME_1 = df.NAME_1.replace(mapping)
return df


def get_gadm_polygons():
"""Load polygons from GADM GeoPackage data. https://gadm.org/metadata.html"""
print("⏳ Fetching GADM polygons...")
dwn_filepath = os.path.join(data_dir, "gadm-gpkg.zip")
if not os.path.exists(dwn_filepath):
download_data("https://geodata.ucdavis.edu/gadm/gadm4.0/gadm404-gpkg.zip", dwn_filepath)
ext_filepath = os.path.join(data_dir, "gadm.gpkg")
if not os.path.exists(ext_filepath):
extract_data(dwn_filepath, "gadm.gpkg")
return geopandas.read_file(ext_filepath).replace(" ", numpy.nan).replace("?", numpy.nan).replace("n.a.", numpy.nan)
def download_and_prepare_gadm():
url = "https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_410-gdb.zip"
df = load_gadm(unzip_gadm(download_geometries(url)))
return (
df.pipe(remove_polygons) # remove inhabitated polygons e.g. Antartica
.pipe(update_polygon_meta) # e.g. Ukraine has missing data
.pipe(fill_missing_region_names) # small countries are regionless
.pipe(rename_region_name_attributes) # rename regions using local knownledge
)


def get_geonames_cities15k():
def download_geonames_cities_15k():
"""
# Geonames DB @ http://download.geonames.org/export/dump/cities15000.zip
The main 'geoname' table has the following fields :
---------------------------------------------------
geonameid : integer id of record in geonames database
name : name of geographical point (utf8) varchar(200)
asciiname : name of geographical point in plain ascii characters, varchar(200)
alternatenames : alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)
latitude : latitude in decimal degrees (wgs84)
longitude : longitude in decimal degrees (wgs84)
feature class : see http://www.geonames.org/export/codes.html, char(1)
feature code : see http://www.geonames.org/export/codes.html, varchar(10)
country a2 : ISO-3166 2-letter country code, 2 characters
cc2 : alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters
admin1 code : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
admin2 code : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)
admin3 code : code for third level administrative division, varchar(20)
admin4 code : code for fourth level administrative division, varchar(20)
population : bigint (8 byte int)
elevation : in meters, integer
dem : digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.
timezone : the iana timezone id (see file timeZone.txt) varchar(40)
modification date : date of last modification in yyyy-MM-dd format
```
### NOTES
Following field names have been modified for convenience and clarity
#### cities15000.txt
```
- geonameid > city_id
- name > city_name
- asciiname > city_asciiname
- alternatenames > city_alternatenames
Geonames DB @ http://download.geonames.org/export/dump/cities15000.zip
"""
print("⏳ Fetching (geonames) cities15k points...")
dwn_filepath = os.path.join(data_dir, "cities15000.zip")
if not os.path.exists(dwn_filepath):
download_data("http://download.geonames.org/export/dump/cities15000.zip", dwn_filepath)
ext_filepath = os.path.join(data_dir, "cities15000.txt")
if not os.path.exists(ext_filepath):
extract_data(dwn_filepath, "cities15000.txt")
fields = [
# from cities15000.txt (Geonames)
"city_id",
Expand All @@ -131,68 +116,92 @@ def get_geonames_cities15k():
"timezone",
"modification_date",
]
return pandas.read_csv(ext_filepath, sep="\t", names=fields)
path = download_geometries("http://download.geonames.org/export/dump/cities15000.zip")
cities = pandas.read_csv(path, sep="\t", names=fields)
return (
geopandas.GeoDataFrame(cities.drop(columns=["latitude", "longitude"]))
.set_geometry(geopandas.points_from_xy(cities.longitude, cities.latitude))
)


def build_places_gazetteer(points):
features = [
"latitude",
"longitude",
"city_name",
"city_alternatenames",
]
gazetteer = points.loc[:, features].rename(columns={"city_name": "place_name"})
altname = pandas.DataFrame(gazetteer.pop("city_alternatenames").fillna('').str.split(",").tolist())
altname.columns = ["place_name_alt{}".format(index) for index in range(altname.shape[1])]
return pandas.concat([gazetteer, altname], axis=1).convert_dtypes()
def points_in_polygons_lookup(points, polygons) -> pandas.Series:
logging.debug("⏳ executing point-in-polygon spatial join")
# exec spatial join and return index_right i.e. the polygon ID containing the point
points_geo = points[["geometry"]].set_crs(polygons.crs)
polygons_geo = polygons[["geometry"]]
spatial_lookup = points_geo.sjoin(polygons_geo, how="left", predicate="within")["index_right"]
# if NaN in data not all points were within a polygon e.g. coastal cities off-shore
not_in_polygon = spatial_lookup.isna()
if not_in_polygon.any():
spatial_lookup[not_in_polygon] = points_geo[not_in_polygon].sjoin_nearest(polygons_geo, how="left")["index_right"]
return spatial_lookup.astype(int)


def build_place_gazetteer(points: pandas.DataFrame) -> pandas.DataFrame:
# extract places
places = points.city_name.rename("place_name")
# explode_alternate_names
alt_places = pandas.DataFrame(
points.city_alternatenames.fillna('').str.split(",").tolist()
).add_prefix("place_name_alt")
# build gazetteer
return pandas.concat([points.geometry, places, alt_places], axis=1)


def build_regions_gazetteer(polygons):
gazetteer = pandas.DataFrame(index=polygons.index)
gazetteer["country_code"] = polygons.GID_0
gazetteer["country_name"] = polygons.NAME_0
def build_region_gazetteer(polygons: pandas.DataFrame) -> pandas.DataFrame:
prefix = "region_"
gazetteer = pandas.concat([gazetteer, polygons.loc[:, ["NAME_1", "NL_NAME_1", "NAME_2", "NL_NAME_2", "NAME_3", "NL_NAME_3", "NAME_4", "NAME_5"]].add_prefix(prefix)], axis=1)
gazetteer = pandas.concat([
polygons.GID_0,
polygons.NAME_0,
polygons.loc[:, ["NAME_1","NL_NAME_1","NAME_2","NL_NAME_2","NAME_3","NL_NAME_3","NAME_4","NAME_5"]
].add_prefix(prefix)], axis=1)

for lev in range(4):
col = "VARNAME_{}".format(lev+1)
altname = pandas.DataFrame(polygons[col].fillna('').str.split("|").tolist())
altname.columns = [prefix+"{col}_alt{index}".format(col=col, index=index) for index in range(altname.shape[1])]
level_prefix = "{prefix}_{col}_alt".format(prefix=prefix, col=col)
altname = pandas.DataFrame(polygons[col].fillna('').str.split("|").tolist()).add_prefix(level_prefix)
gazetteer = pandas.concat([gazetteer, altname], axis=1)

gazetteer.columns = gazetteer.columns.str.lower()
return gazetteer.convert_dtypes()
return gazetteer


def point_in_polygon(points, polygons):
print("⏳ Executing point-in-polygon spatial join...")
# exec spatial join and return index_right i.e. the polygon ID containing the point
joined = points.sjoin(polygons, how="left", predicate="within")
# spatially join points outside polygon with nearest polygon and update missing records
not_in_polygon_mask = joined.index_right.isna()
joined.loc[not_in_polygon_mask, :] = points[not_in_polygon_mask].sjoin_nearest(polygons, how="left")
return joined
def build_gazetteer(points: pandas.DataFrame, polygons: pandas.DataFrame) -> None:
places_gazetteer = build_place_gazetteer(points)
regions_gazetteer = build_region_gazetteer(polygons)
places_gazetteer["polygon_index"] = points_in_polygons_lookup(points, polygons)
gazetteer = places_gazetteer.join(regions_gazetteer, on="polygon_index").drop(columns=["polygon_index"])
gazetteer.to_csv(os.path.join(DATA_PATH, "gazetteer.csv.zip"), index=False)


def build_regions(polygons: pandas.DataFrame) -> None:
"""Build regions from dissolved polygons"""
features = ["geometry","UID","NAME_1","NL_NAME_1","NAME_0","GID_0"]
# aggregate regions using GADM NAME_1 level shape(3555, 6)
polygons.dissolve(
by="NAME_1", aggfunc='first', as_index=False, level=None, sort=True, observed=False, dropna=False
)[features].to_file(os.path.join(DATA_PATH, 'gadm41_regions.shp.zip'))


def main():
# GADM polygons
polygons = get_gadm_polygons().pipe(apply_polygon_names_mapping).pipe(drop_polygons)
polygons_geom = geopandas.GeoDataFrame(polygons.pop("geometry"))
# geonames (cities) points
cities = get_geonames_cities15k()
cities_geom = geopandas.GeoDataFrame(index=cities.index).set_geometry(geopandas.points_from_xy(cities.longitude, cities.latitude)).set_crs(polygons_geom.crs)
# spatial join
joined = point_in_polygon(cities_geom, polygons_geom)
# download and prepare shapes data
polygons = download_and_prepare_gadm()
points = download_geonames_cities_15k()
# build gazetteer
regions_g = build_regions_gazetteer(polygons)
places_g = build_places_gazetteer(cities)
# add polygon lookup index i.e. the id of the polygon containing the point
fn = os.path.join(data_dir, "gazetteer.csv")
places_g["polygon_index"] = joined.index_right.astype(int)
gazetteer = places_g.join(regions_g, on="polygon_index").drop(columns=["polygon_index"])
gazetteer.to_csv(fn, index=False)
print("⌛ {fn} built!".format(fn=fn))
build_gazetteer(points, polygons)
# build regions
build_regions(polygons)


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(
prog="Gazetteer",
description="Build a region shapefile and places gazetteer using GADM and Geonames data."
)
parser.add_argument(
"--debug", action="store_true", default=False, help="Enables debugging."
)
args = parser.parse_args()
logging.root.setLevel(logging.DEBUG if args.debug else logging.INFO)
main()

0 comments on commit d0f74e4

Please sign in to comment.