From d0f74e4a6881e6acb374e16dd440a21cf25e7b56 Mon Sep 17 00:00:00 2001
From: Emanuele Panizio
Date: Sat, 30 Sep 2023 23:11:45 +0200
Subject: [PATCH] Gazetteer v0.1.1

- release as zip file
- release GADM regions shapefile for product generation
- fix region naming issues
---
 .github/workflows/release.yaml |  22 ++-
 .gitignore                     |   2 +-
 build.py                       | 251 +++++++++++++++++----------------
 3 files changed, 147 insertions(+), 128 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 0f3d704..2ed313e 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -44,13 +44,23 @@ jobs:
           ${{ steps.Changelog.outputs.changelog }}
         draft: false
         prerelease: false
-    - name: Upload Release Asset
-      id: upload-release-asset
+    - name: Upload Gazetteer Asset
+      id: upload-gazetteer-release-asset
       uses: actions/upload-release-asset@v1
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       with:
-        upload_url: ${{ steps.create_release.outputs.upload_url }} # This pulls from the CREATE RELEASE step above, referencing its ID to get its outputs object, which includes an `upload_url`. See this blog post for more info: https://jasonet.co/posts/new-features-of-github-actions/#passing-data-to-future-steps
-        asset_path: ./data/gazetteer.csv
-        asset_name: gazetteer.csv
-        asset_content_type: text/csv
+        upload_url: ${{ steps.create_release.outputs.upload_url }}
+        asset_path: ./data/gazetteer.csv.zip
+        asset_name: gazetteer.csv.zip
+        asset_content_type: application/zip
+    - name: Upload GADM Regions Asset
+      id: upload-region-release-asset
+      uses: actions/upload-release-asset@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      with:
+        upload_url: ${{ steps.create_release.outputs.upload_url }}
+        asset_path: ./data/gadm41_regions.shp.zip
+        asset_name: gadm41_regions.shp.zip
+        asset_content_type: application/zip

diff --git a/.gitignore b/.gitignore
index 4883468..b8f325f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,7 @@ __pycache__
 *.egg-info
 *.ipynb
 *.ipynb_checkpoints
-.pyenv
+.pyve

 # Release
 data/

diff --git a/build.py b/build.py
index 419767e..61a33ca 100644
--- a/build.py
+++ b/build.py
@@ -1,114 +1,99 @@
+import logging
 import os
-import numpy
 import pandas
 import zipfile
 import requests
+import warnings

 os.environ['USE_PYGEOS'] = '0'
 import geopandas
 from tqdm import tqdm

-# paths
-data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+# silence user warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+# set fiona logging level to ERROR to reduce verbosity
+logging.getLogger("fiona").setLevel(logging.ERROR)


-def download_data(url, filepath):
+# data path
+DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+
+
+def download_geometries(url: str) -> str:
     """Download external data."""
+    path = os.path.join(DATA_PATH, os.path.basename(url))
+    if os.path.exists(path):
+        return path
+    logging.info("⏳ downloading from {url} to {path}".format(url=url, path=path))
     with requests.get(url, stream=True) as r:
         r.raise_for_status()
         chunk_size = 1024
         total_bytes = int(r.headers.get('content-length', 0))
-        with open(filepath, 'wb') as f:
-            with tqdm(unit='B', unit_scale=True, unit_divisor=chunk_size, miniters=1, desc=filepath, total=total_bytes) as p:
+        with open(path, 'wb') as f:
+            with tqdm(unit='B', unit_scale=True, unit_divisor=chunk_size, miniters=1, desc=path, total=total_bytes) as p:
                 for chunk in r.iter_content(chunk_size=chunk_size*10):
                     f.write(chunk)
                     p.update(len(chunk))
+    return path
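Note: `download_geometries` is intentionally idempotent; a minimal usage sketch, assuming `build.py` from this patch is importable from the repo root (the URL is the Geonames one the script itself uses):

```python
# Usage sketch; assumes build.py sits on the import path.
from build import download_geometries

zip_path = download_geometries("http://download.geonames.org/export/dump/cities15000.zip")
# A second call is a cheap no-op: the helper returns the cached file under data/.
assert zip_path == download_geometries("http://download.geonames.org/export/dump/cities15000.zip")
```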

-def extract_data(filepath, fn):
-    # extract
+def unzip_gadm(filepath):
     with zipfile.ZipFile(filepath, 'r') as archive:
-        for f in archive.infolist():
-            f.filename = fn
-            archive.extract(f, path=data_dir)
+        archive.extractall(path=os.path.dirname(filepath))
+    logging.debug("unzipped {}".format(filepath))
+    return filepath.replace("-gdb.zip", ".gdb")
+
+
+def load_gadm(path: str) -> geopandas.GeoDataFrame:
+    return geopandas.read_file(path, driver='FileGDB')
+
+
+def remove_polygons(df: pandas.DataFrame) -> pandas.DataFrame:
+    """Remove polygons without social media activity."""
+    to_drop = ["Antarctica", "Caspian Sea"]
+    return df.loc[~df.NAME_0.isin(to_drop), :].copy()


-def apply_polygon_names_mapping(polygons):
-    """Fix wrong polygon name attributes"""
+def update_polygon_meta(df: pandas.DataFrame) -> pandas.DataFrame:
+    """Update missing metadata."""
+    # Ukraine: the Kiev City record ships with placeholder values
+    df.loc[(df.NAME_1 == "?").idxmax(), ["NAME_1", "NL_NAME_1", "HASC_1", "ENGTYPE_1"]] = ["Kiev City", "Київ", "UA.KC", "Independent City"]
+    return df
+
+
+def fill_missing_region_names(df: pandas.DataFrame) -> pandas.DataFrame:
+    """Fill missing region names with the country name."""
+    regionless = df.NAME_1.isna()
+    df.loc[regionless, "NAME_1"] = df.loc[regionless, "NAME_0"]
+    return df
+
+
+def rename_region_name_attributes(df: pandas.DataFrame) -> pandas.DataFrame:
+    """Rename wrong region name attributes."""
     mapping = {
         "Apulia": "Puglia",
         "Sicily": "Sicilia",
     }
-    polygons.NAME_1 = polygons.NAME_1.replace(mapping)
-    return polygons
-
-
-def drop_polygons(polygons):
-    to_drop = [
-        "Antarctica",
-    ]
-    indices = polygons[polygons.NAME_0.isin(to_drop)].index
-    return polygons.drop(indices)
+    df.NAME_1 = df.NAME_1.replace(mapping)
+    return df


-def get_gadm_polygons():
-    """Load polygons from GADM GeoPackage data. https://gadm.org/metadata.html"""
-    print("⏳ Fetching GADM polygons...")
-    dwn_filepath = os.path.join(data_dir, "gadm-gpkg.zip")
-    if not os.path.exists(dwn_filepath):
-        download_data("https://geodata.ucdavis.edu/gadm/gadm4.0/gadm404-gpkg.zip", dwn_filepath)
-    ext_filepath = os.path.join(data_dir, "gadm.gpkg")
-    if not os.path.exists(ext_filepath):
-        extract_data(dwn_filepath, "gadm.gpkg")
-    return geopandas.read_file(ext_filepath).replace(" ", numpy.nan).replace("?", numpy.nan).replace("n.a.", numpy.nan)
+def download_and_prepare_gadm():
+    url = "https://geodata.ucdavis.edu/gadm/gadm4.1/gadm_410-gdb.zip"
+    df = load_gadm(unzip_gadm(download_geometries(url)))
+    return (
+        df.pipe(remove_polygons)              # remove uninhabited polygons, e.g. Antarctica
+        .pipe(update_polygon_meta)            # e.g. Ukraine has missing data
+        .pipe(fill_missing_region_names)      # small countries are regionless
+        .pipe(rename_region_name_attributes)  # rename regions using local knowledge
+    )
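Note: the `(df.NAME_1 == "?").idxmax()` lookup in `update_polygon_meta` above is a first-match idiom worth spelling out; a toy illustration with invented sample data:

```python
import pandas

df = pandas.DataFrame({"NAME_1": ["Kyiv Oblast", "?", "Lviv", "?"]})
# idxmax() on a boolean Series returns the label of the FIRST True,
# so only the first "?" row is updated.
first_unknown = (df.NAME_1 == "?").idxmax()
print(first_unknown)  # 1
# Pitfall: if no row matches, idxmax() still returns the first label (0).
```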


-def get_geonames_cities15k():
+def download_geonames_cities_15k():
     """
-    # Geonames DB @ http://download.geonames.org/export/dump/cities15000.zip
-
-    The main 'geoname' table has the following fields :
-    ---------------------------------------------------
-    geonameid : integer id of record in geonames database
-    name : name of geographical point (utf8) varchar(200)
-    asciiname : name of geographical point in plain ascii characters, varchar(200)
-    alternatenames : alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)
-    latitude : latitude in decimal degrees (wgs84)
-    longitude : longitude in decimal degrees (wgs84)
-    feature class : see http://www.geonames.org/export/codes.html, char(1)
-    feature code : see http://www.geonames.org/export/codes.html, varchar(10)
-    country a2 : ISO-3166 2-letter country code, 2 characters
-    cc2 : alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters
-    admin1 code : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
-    admin2 code : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80)
-    admin3 code : code for third level administrative division, varchar(20)
-    admin4 code : code for fourth level administrative division, varchar(20)
-    population : bigint (8 byte int)
-    elevation : in meters, integer
-    dem : digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.
-    timezone : the iana timezone id (see file timeZone.txt) varchar(40)
-    modification date : date of last modification in yyyy-MM-dd format
-    ```
-
-    ### NOTES
-
-    Following field names have been modified for convenience and clarity
-
-    #### cities15000.txt
-    ```
-    - geonameid > city_id
-    - name > city_name
-    - asciiname > city_asciiname
-    - alternatenames > city_alternatenames
+    Geonames DB @ http://download.geonames.org/export/dump/cities15000.zip
     """
-    print("⏳ Fetching (geonames) cities15k points...")
-    dwn_filepath = os.path.join(data_dir, "cities15000.zip")
-    if not os.path.exists(dwn_filepath):
-        download_data("http://download.geonames.org/export/dump/cities15000.zip", dwn_filepath)
-    ext_filepath = os.path.join(data_dir, "cities15000.txt")
-    if not os.path.exists(ext_filepath):
-        extract_data(dwn_filepath, "cities15000.txt")
     fields = [
         # from cities15000.txt (Geonames)
         "city_id",
@@ -131,68 +116,92 @@ def get_geonames_cities15k():
         "timezone",
         "modification_date",
     ]
-    return pandas.read_csv(ext_filepath, sep="\t", names=fields)
+    path = download_geometries("http://download.geonames.org/export/dump/cities15000.zip")
+    cities = pandas.read_csv(path, sep="\t", names=fields)
+    return (
+        geopandas.GeoDataFrame(cities.drop(columns=["latitude", "longitude"]))
+        .set_geometry(geopandas.points_from_xy(cities.longitude, cities.latitude))
+    )
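Note: the function now returns a GeoDataFrame instead of a plain frame; a self-contained toy sketch of the `points_from_xy` conversion it relies on (coordinates invented for the example):

```python
import pandas
import geopandas

cities = pandas.DataFrame({
    "city_name": ["Roma", "Palermo"],
    "latitude": [41.89, 38.12],
    "longitude": [12.48, 13.36],
})
# longitude is x, latitude is y
points = geopandas.GeoDataFrame(
    cities.drop(columns=["latitude", "longitude"]),
    geometry=geopandas.points_from_xy(cities.longitude, cities.latitude),
)
print(points.geometry.iloc[0])  # POINT (12.48 41.89)
```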


-def build_places_gazetteer(points):
-    features = [
-        "latitude",
-        "longitude",
-        "city_name",
-        "city_alternatenames",
-    ]
-    gazetteer = points.loc[:, features].rename(columns={"city_name": "place_name"})
-    altname = pandas.DataFrame(gazetteer.pop("city_alternatenames").fillna('').str.split(",").tolist())
-    altname.columns = ["place_name_alt{}".format(index) for index in range(altname.shape[1])]
-    return pandas.concat([gazetteer, altname], axis=1).convert_dtypes()
+def points_in_polygons_lookup(points, polygons) -> pandas.Series:
+    logging.debug("⏳ executing point-in-polygon spatial join")
+    # exec spatial join and return index_right, i.e. the ID of the polygon containing each point
+    points_geo = points[["geometry"]].set_crs(polygons.crs)
+    polygons_geo = polygons[["geometry"]]
+    spatial_lookup = points_geo.sjoin(polygons_geo, how="left", predicate="within")["index_right"]
+    # NaNs mean some points fell outside every polygon, e.g. coastal cities off-shore;
+    # fall back to the nearest polygon for those
+    not_in_polygon = spatial_lookup.isna()
+    if not_in_polygon.any():
+        spatial_lookup[not_in_polygon] = points_geo[not_in_polygon].sjoin_nearest(polygons_geo, how="left")["index_right"]
+    return spatial_lookup.astype(int)
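Note: the within-then-nearest fallback is the core of the new lookup; a self-contained toy version with two unit squares and one off-shore point (geometries invented for the example):

```python
import geopandas
from shapely.geometry import Point, box

polygons = geopandas.GeoDataFrame(geometry=[box(0, 0, 1, 1), box(2, 0, 3, 1)])
points = geopandas.GeoDataFrame(geometry=[Point(0.5, 0.5), Point(1.4, 0.5)])

lookup = points.sjoin(polygons, how="left", predicate="within")["index_right"]
missing = lookup.isna()
if missing.any():
    # (1.4, 0.5) sits in neither square; snap it to the nearest polygon.
    lookup[missing] = points[missing].sjoin_nearest(polygons, how="left")["index_right"]
print(lookup.astype(int).tolist())  # [0, 0]
```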


+def build_place_gazetteer(points: pandas.DataFrame) -> pandas.DataFrame:
+    # extract places
+    places = points.city_name.rename("place_name")
+    # explode comma-separated alternate names into numbered columns
+    alt_places = pandas.DataFrame(
+        points.city_alternatenames.fillna('').str.split(",").tolist()
+    ).add_prefix("place_name_alt")
+    # build gazetteer
+    return pandas.concat([points.geometry, places, alt_places], axis=1)


-def build_regions_gazetteer(polygons):
-    gazetteer = pandas.DataFrame(index=polygons.index)
-    gazetteer["country_code"] = polygons.GID_0
-    gazetteer["country_name"] = polygons.NAME_0
+def build_region_gazetteer(polygons: pandas.DataFrame) -> pandas.DataFrame:
     prefix = "region_"
-    gazetteer = pandas.concat([gazetteer, polygons.loc[:, ["NAME_1", "NL_NAME_1", "NAME_2", "NL_NAME_2", "NAME_3", "NL_NAME_3", "NAME_4", "NAME_5"]].add_prefix(prefix)], axis=1)
+    gazetteer = pandas.concat([
+        polygons.GID_0,
+        polygons.NAME_0,
+        polygons.loc[:, ["NAME_1", "NL_NAME_1", "NAME_2", "NL_NAME_2", "NAME_3", "NL_NAME_3", "NAME_4", "NAME_5"]].add_prefix(prefix),
+    ], axis=1)
     for lev in range(4):
         col = "VARNAME_{}".format(lev+1)
-        altname = pandas.DataFrame(polygons[col].fillna('').str.split("|").tolist())
-        altname.columns = [prefix+"{col}_alt{index}".format(col=col, index=index) for index in range(altname.shape[1])]
+        level_prefix = "{prefix}{col}_alt".format(prefix=prefix, col=col)
+        altname = pandas.DataFrame(polygons[col].fillna('').str.split("|").tolist()).add_prefix(level_prefix)
         gazetteer = pandas.concat([gazetteer, altname], axis=1)
-    gazetteer.columns = gazetteer.columns.str.lower()
-    return gazetteer.convert_dtypes()
+    return gazetteer


-def point_in_polygon(points, polygons):
-    print("⏳ Executing point-in-polygon spatial join...")
-    # exec spatial join and return index_right i.e. the polygon ID containing the point
-    joined = points.sjoin(polygons, how="left", predicate="within")
-    # spatially join points outside polygon with nearest polygon and update missing records
-    not_in_polygon_mask = joined.index_right.isna()
-    joined.loc[not_in_polygon_mask, :] = points[not_in_polygon_mask].sjoin_nearest(polygons, how="left")
-    return joined
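Note: both gazetteer builders explode delimiter-separated alternate names into numbered columns via `add_prefix`; a toy sketch of the pattern (values invented for the example):

```python
import pandas

varnames = pandas.Series(["Apulia|Pouilles", None, "Sicilia"])
# Ragged split lists become columns 0..n-1, padded with None;
# add_prefix turns the integer column labels into readable names.
alt = pandas.DataFrame(
    varnames.fillna("").str.split("|").tolist()
).add_prefix("region_VARNAME_1_alt")
print(alt.columns.tolist())  # ['region_VARNAME_1_alt0', 'region_VARNAME_1_alt1']
```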


+def build_gazetteer(points: pandas.DataFrame, polygons: pandas.DataFrame) -> None:
+    places_gazetteer = build_place_gazetteer(points)
+    regions_gazetteer = build_region_gazetteer(polygons)
+    places_gazetteer["polygon_index"] = points_in_polygons_lookup(points, polygons)
+    gazetteer = places_gazetteer.join(regions_gazetteer, on="polygon_index").drop(columns=["polygon_index"])
+    gazetteer.to_csv(os.path.join(DATA_PATH, "gazetteer.csv.zip"), index=False)
+
+
+def build_regions(polygons: pandas.DataFrame) -> None:
+    """Build regions from dissolved polygons."""
+    features = ["geometry", "UID", "NAME_1", "NL_NAME_1", "NAME_0", "GID_0"]
+    # aggregate polygons on the GADM NAME_1 level; resulting shape: (3555, 6)
+    polygons.dissolve(
+        by="NAME_1", aggfunc='first', as_index=False, dropna=False
+    )[features].to_file(os.path.join(DATA_PATH, 'gadm41_regions.shp.zip'))


 def main():
-    # GADM polygons
-    polygons = get_gadm_polygons().pipe(apply_polygon_names_mapping).pipe(drop_polygons)
-    polygons_geom = geopandas.GeoDataFrame(polygons.pop("geometry"))
-    # geonames (cities) points
-    cities = get_geonames_cities15k()
-    cities_geom = geopandas.GeoDataFrame(index=cities.index).set_geometry(geopandas.points_from_xy(cities.longitude, cities.latitude)).set_crs(polygons_geom.crs)
-    # spatial join
-    joined = point_in_polygon(cities_geom, polygons_geom)
+    # download and prepare shapes data
+    polygons = download_and_prepare_gadm()
+    points = download_geonames_cities_15k()
     # build gazetteer
-    regions_g = build_regions_gazetteer(polygons)
-    places_g = build_places_gazetteer(cities)
-    # add polygon lookup index i.e. the id of the polygon containing the point
-    fn = os.path.join(data_dir, "gazetteer.csv")
-    places_g["polygon_index"] = joined.index_right.astype(int)
-    gazetteer = places_g.join(regions_g, on="polygon_index").drop(columns=["polygon_index"])
-    gazetteer.to_csv(fn, index=False)
-    print("⌛ {fn} built!".format(fn=fn))
+    build_gazetteer(points, polygons)
+    # build regions
+    build_regions(polygons)


 if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        prog="Gazetteer",
+        description="Build a region shapefile and places gazetteer using GADM and Geonames data."
+    )
+    parser.add_argument(
+        "--debug", action="store_true", default=False, help="Enables debugging."
+    )
+    args = parser.parse_args()
+    logging.root.setLevel(logging.DEBUG if args.debug else logging.INFO)
     main()
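Note: `build_regions` leans on `GeoDataFrame.dissolve`; a toy demo of the merge semantics (geometries invented for the example). One caveat worth flagging: dissolving on `NAME_1` alone also merges identically named regions from different countries.

```python
import geopandas
from shapely.geometry import box

gdf = geopandas.GeoDataFrame({
    "NAME_1": ["Sicilia", "Sicilia", "Puglia"],
    "GID_0": ["ITA", "ITA", "ITA"],
    "geometry": [box(0, 0, 1, 1), box(1, 0, 2, 1), box(3, 0, 4, 1)],
})
# Rows sharing NAME_1 collapse into one row; geometries are unioned,
# non-geometry columns take the first value per group.
regions = gdf.dissolve(by="NAME_1", aggfunc="first", as_index=False)
print(len(regions))  # 2
```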