Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions prototype/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/.iqb
236 changes: 195 additions & 41 deletions prototype/natural_earth/extract_country_geojson.py
Original file line number Diff line number Diff line change
@@ -1,99 +1,253 @@
"""
Script to extract GeoJSON files from Natural Earth data.
Uses ne_10m_admin_1_states_provinces shapefile to get state/province boundaries
(one file per country, or one file per subdivision), and optionally the
ne_10m_populated_places_simple shapefile to export cities.
"""

import re
from pathlib import Path

import geopandas as gpd
import pycountry


def extract_geojsons(
cache_dir: str = "/data/cache/v0",
ne_file: str = "ne_10m_admin_0_countries.shp",
output_dir: str = "geojson_countries",
def extract_admin1_geojsons(
ne_file: str = "ne_10m_admin_1_states_provinces.shp",
output_dir: str = "geojson_admin1",
countries: list[str] | None = None,
) -> None:
"""
Extract country GeoJSONs from Natural Earth shapefile based on countries found in cache.
Extract admin1 (state/province) GeoJSONs from Natural Earth shapefile.

Args:
ne_file: Path to Natural Earth admin1 shapefile
output_dir: Output directory for GeoJSON files
countries: Optional list of 2-letter country codes to filter (e.g., ["US", "DE"])
If None, extracts all countries
"""
cache_path = Path(cache_dir)
ne_path = Path(ne_file)
output_path = Path(output_dir)

if not cache_path.exists():
print(f"❌ Cache directory not found: {cache_path}")
return

if not ne_path.exists():
print(f"❌ Natural Earth file not found: {ne_path}")
print(
"Download from: https://www.naturalearthdata.com/downloads/10m-cultural-vectors/"
)
return

output_path.mkdir(exist_ok=True, parents=True)

# Load Natural Earth shapefile
# Load Natural Earth admin1 shapefile
print(f"Loading {ne_path}...")
gdf = gpd.read_file(ne_path)
print(f"Loaded {len(gdf)} features")
print(f"Columns: {list(gdf.columns)}\n")

# Pattern: xx_yyyy_mm.json
pattern = re.compile(r"^([a-z]{2})_(\d{4})_(\d{1,2})\.json$", re.IGNORECASE)
# Key columns in admin1 shapefile:
# - iso_a2: 2-letter country code
# - iso_3166_2: Full ISO 3166-2 code (e.g., "US-CA")
# - name: Subdivision name
# - admin: Country name

# Get unique country codes from the data
available_countries = gdf["iso_a2"].dropna().unique()
print(f"Available countries in dataset: {len(available_countries)}")

# Find unique country codes in cache
country_codes = set()
for file_path in cache_path.glob("*.json"):
match = pattern.match(file_path.name)
if match:
country_codes.add(match.group(1).upper())
# Filter countries if specified
if countries:
country_codes = [c.upper() for c in countries]
else:
country_codes = sorted(available_countries)

print(f"Found {len(country_codes)} countries in cache\n")
print(f"Processing {len(country_codes)} countries\n")

# Extract each country from Natural Earth
# Extract subdivisions for each country
for code in sorted(country_codes):
if code == "-1" or code == "-99": # Skip invalid codes
continue

country = pycountry.countries.get(alpha_2=code)
if not country:
print(f"⚠ Unknown country code: {code}")
continue

# Filter by ISO_A2
country_gdf = gdf[gdf["ISO_A2"] == code]
# Filter by iso_a2
country_gdf = gdf[gdf["iso_a2"] == code].copy()

if len(country_gdf) > 0:
output_file = output_path / f"{country.alpha_3}.geojson"
# Save all subdivisions for this country in one file
output_file = output_path / f"{country.alpha_3}_admin1.geojson"
country_gdf.to_file(output_file, driver="GeoJSON")

size_kb = output_file.stat().st_size / 1024
print(f"✓ {country.name:30} → {output_file.name} ({size_kb:.1f} KB)")
subdivision_count = len(country_gdf)
print(
f"✓ {country.name:30} → {output_file.name} ({subdivision_count} subdivisions, {size_kb:.1f} KB)"
)
else:
print(f"⚠ {country.name} ({code}): No features found in NE data")
print(f"⚠ {country.name} ({code}): No subdivisions found in NE data")

print(f"\n✓ Done! Files saved to: {output_path.absolute()}")


def extract_admin1_by_subdivision(
ne_file: str = "ne_10m_admin_1_states_provinces.shp",
output_dir: str = "geojson_subdivisions",
countries: list[str] | None = None,
) -> None:
"""
Extract individual GeoJSON files for each subdivision.
Creates one file per subdivision (e.g., US-CA.geojson, US-NY.geojson).

Args:
ne_file: Path to Natural Earth admin1 shapefile
output_dir: Output directory for GeoJSON files
countries: Optional list of 2-letter country codes to filter
"""
ne_path = Path(ne_file)
output_path = Path(output_dir)

if not ne_path.exists():
print(f"❌ Natural Earth file not found: {ne_path}")
return

output_path.mkdir(exist_ok=True, parents=True)

print(f"Loading {ne_path}...")
gdf = gpd.read_file(ne_path)
print(f"Loaded {len(gdf)} features\n")

# Filter by countries if specified
if countries:
country_codes = [c.upper() for c in countries]
gdf = gdf[gdf["iso_a2"].isin(country_codes)]
print(f"Filtered to {len(gdf)} features for countries: {country_codes}\n")

# Extract each subdivision
count = 0
for _, row in gdf.iterrows():
iso_code = row.get("iso_3166_2")
if not iso_code or iso_code in ["-1", "-99"]:
continue

# Create a single-row GeoDataFrame
subdivision_gdf = gpd.GeoDataFrame([row], crs=gdf.crs)

# Clean filename (replace invalid chars)
safe_code = iso_code.replace("/", "-").replace("\\", "-")
output_file = output_path / f"{safe_code}.geojson"

subdivision_gdf.to_file(output_file, driver="GeoJSON")
count += 1

print(f"✓ Extracted {count} subdivision files to: {output_path.absolute()}")


def extract_cities_geojson(
ne_file: str = "ne_10m_populated_places_simple.shp",
output_file: str = "ne_cities.geojson",
) -> None:
"""
Extract cities from Natural Earth populated places shapefile to GeoJSON.

Args:
ne_file: Path to Natural Earth populated places shapefile
output_file: Output GeoJSON file path
"""
ne_path = Path(ne_file)
output_path = Path(output_file)

if not ne_path.exists():
print(f"❌ Natural Earth file not found: {ne_path}")
print(
"Download from: https://www.naturalearthdata.com/downloads/10m-cultural-vectors/"
)
return

output_path.parent.mkdir(exist_ok=True, parents=True)

print(f"Loading {ne_path}...")
gdf = gpd.read_file(ne_path)
print(f"Loaded {len(gdf)} cities")
print(f"Columns: {list(gdf.columns)}\n")

# Keep only essential columns to reduce file size
# Common columns: name, nameascii, latitude, longitude, iso_a2, adm1name, pop_max
keep_cols = [
"name",
"nameascii",
"latitude",
"longitude",
"iso_a2",
"adm1name",
"adm0name",
"pop_max",
"geometry",
]
available_cols = [c for c in keep_cols if c in gdf.columns]
gdf_slim = gdf[available_cols].copy()

# Save to GeoJSON
gdf_slim.to_file(output_path, driver="GeoJSON")

size_kb = output_path.stat().st_size / 1024
print(f"✓ Extracted {len(gdf_slim)} cities to: {output_path}")
print(f" Size: {size_kb:.1f} KB")
print(f" Columns: {available_cols}")


def _build_arg_parser():
    """Build the command-line parser for the extraction script."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract admin1 GeoJSONs from Natural Earth data"
    )
    parser.add_argument(
        "--ne-file",
        default="ne_10m_admin_1_states_provinces.shp",
        help="Natural Earth admin1 shapefile (.shp)",
    )
    parser.add_argument(
        "--output-dir", default="geojson_admin1", help="Output directory"
    )
    parser.add_argument(
        "--countries",
        nargs="+",
        help="Optional: specific country codes to extract (e.g., US DE FR)",
    )
    parser.add_argument(
        "--by-subdivision",
        action="store_true",
        help="Create individual files per subdivision instead of per country",
    )
    parser.add_argument(
        "--cities",
        action="store_true",
        help="Extract cities from populated places file instead of admin boundaries",
    )
    parser.add_argument(
        "--cities-file",
        default="ne_10m_populated_places_simple.shp",
        help="Natural Earth populated places shapefile (used with --cities)",
    )
    return parser


def main(argv: list[str] | None = None) -> None:
    """
    Parse command-line arguments and run the selected extraction.

    Args:
        argv: Argument list to parse; None means use sys.argv[1:]
    """
    args = _build_arg_parser().parse_args(argv)

    # --cities takes precedence over --by-subdivision when both are given.
    if args.cities:
        # Cities go to a "geojson_cities" directory next to the output dir.
        output_file = (
            Path(args.output_dir).parent / "geojson_cities" / "ne_cities.geojson"
        )
        extract_cities_geojson(
            ne_file=args.cities_file,
            output_file=str(output_file),
        )
    elif args.by_subdivision:
        extract_admin1_by_subdivision(
            ne_file=args.ne_file,
            output_dir=args.output_dir,
            countries=args.countries,
        )
    else:
        extract_admin1_geojsons(
            ne_file=args.ne_file,
            output_dir=args.output_dir,
            countries=args.countries,
        )


if __name__ == "__main__":
    main()
8 changes: 8 additions & 0 deletions prototype/natural_earth/geojson_admin1/ABW_admin1.geojson
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"type": "FeatureCollection",
"name": "ABW_admin1",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "featurecla": "Admin-1 states provinces", "scalerank": 10, "adm1_code": "ABW-5150", "diss_me": 5150, "iso_3166_2": "AW-X01~", "wikipedia": null, "iso_a2": "AW", "adm0_sr": 3, "name": "Aruba", "name_alt": null, "name_local": null, "type": null, "type_en": null, "code_local": null, "code_hasc": "AW.AA", "note": null, "hasc_maybe": null, "region": null, "region_cod": null, "provnum_ne": 0.0, "gadm_level": 0, "check_me": 0, "datarank": 10, "abbrev": null, "postal": null, "area_sqkm": 0.0, "sameascity": -99, "labelrank": 20, "name_len": 5, "mapcolor9": 2, "mapcolor13": 9, "fips": null, "fips_alt": null, "woe_id": 23424736.0, "woe_label": null, "woe_name": "Aruba", "latitude": 12.5147, "longitude": -69.9689, "sov_a3": "NL1", "adm0_a3": "ABW", "adm0_label": 6, "admin": "Aruba", "geonunit": "Aruba", "gu_a3": "ABW", "gn_id": -99.0, "gn_name": null, "gns_id": 0.0, "gns_name": null, "gn_level": 0.0, "gn_region": null, "gn_a1_code": "AW.", "region_sub": null, "sub_code": null, "gns_level": 0.0, "gns_lang": null, "gns_adm1": null, "gns_region": null, "min_label": 11.0, "max_label": 11.0, "min_zoom": 11.0, "wikidataid": "Q21203", "name_ar": "أروبا", "name_bn": "আরুবা", "name_de": "Aruba", "name_en": "Aruba", "name_es": "Aruba", "name_fr": "Aruba", "name_el": "Αρούμπα", "name_hi": "अरूबा", "name_hu": "Aruba", "name_id": "Aruba", "name_it": "Aruba", "name_ja": "アルバ", "name_ko": "아루바", "name_nl": "Aruba", "name_pl": "Aruba", "name_pt": "Aruba", "name_ru": "Аруба", "name_sv": "Aruba", "name_tr": "Aruba", "name_vi": "Aruba", "name_zh": "阿鲁巴", "ne_id": 1159315531, "name_he": "ארובה", "name_uk": "Аруба", "name_ur": "اروبا", "name_fa": "آروبا", "name_zht": "阿魯巴", "FCLASS_ISO": null, "FCLASS_US": null, "FCLASS_FR": null, "FCLASS_RU": null, "FCLASS_ES": null, "FCLASS_CN": null, "FCLASS_TW": null, "FCLASS_IN": null, "FCLASS_NP": null, "FCLASS_PK": null, "FCLASS_DE": null, "FCLASS_GB": null, "FCLASS_BR": null, "FCLASS_IL": null, "FCLASS_PS": null, 
"FCLASS_SA": null, "FCLASS_EG": null, "FCLASS_MA": null, "FCLASS_PT": null, "FCLASS_AR": null, "FCLASS_JP": null, "FCLASS_KO": null, "FCLASS_VN": null, "FCLASS_TR": null, "FCLASS_ID": null, "FCLASS_PL": null, "FCLASS_GR": null, "FCLASS_IT": null, "FCLASS_NL": null, "FCLASS_SE": null, "FCLASS_BD": null, "FCLASS_UA": null, "FCLASS_TLC": null }, "geometry": { "type": "Polygon", "coordinates": [ [ [ -69.996937628999945, 12.57758209800005 ], [ -69.936390753999945, 12.531724351000037 ], [ -69.924672003999945, 12.519232489000046 ], [ -69.915760870999918, 12.497015692000048 ], [ -69.880197719999899, 12.453558661000045 ], [ -69.876820441999939, 12.427394924000055 ], [ -69.888091600999928, 12.417669989000046 ], [ -69.908802863999938, 12.417792059000078 ], [ -69.930531378999945, 12.425970770000049 ], [ -69.945139126999948, 12.440375067000048 ], [ -69.924672003999945, 12.440375067000048 ], [ -69.924672003999945, 12.447211005000042 ], [ -69.958566860999952, 12.46320221600007 ], [ -70.02765865799995, 12.522935289000088 ], [ -70.048085089999915, 12.531154690000051 ], [ -70.058094855999911, 12.537176825000074 ], [ -70.062408006999931, 12.546820380000042 ], [ -70.060373501999948, 12.55695221600007 ], [ -70.05109615799995, 12.574042059000078 ], [ -70.048736131999931, 12.583726304000038 ], [ -70.052642381999931, 12.600002346000053 ], [ -70.059641079999949, 12.614243882000039 ], [ -70.061105923999946, 12.625392971000053 ], [ -70.048736131999931, 12.632147528000075 ], [ -70.007150844999899, 12.585516669000071 ], [ -69.996937628999945, 12.57758209800005 ] ] ] } }
]
}
41 changes: 41 additions & 0 deletions prototype/natural_earth/geojson_admin1/AFG_admin1.geojson

Large diffs are not rendered by default.

Loading
Loading