diff --git a/open_buildings/overture/add_columns.py b/open_buildings/overture/add_columns.py index a1451f5..b68f467 100644 --- a/open_buildings/overture/add_columns.py +++ b/open_buildings/overture/add_columns.py @@ -4,15 +4,17 @@ # parquet to geoparquet. +import glob import os -import duckdb -import time -import tempfile +import shutil import subprocess -import glob -from duckdb.typing import * +import tempfile +import time + +import duckdb import mercantile -import shutil +from duckdb.typing import * + def lat_lon_to_quadkey(lat: DOUBLE, lon: DOUBLE, level: INTEGER) -> VARCHAR: # Convert latitude and longitude to tile using mercantile @@ -44,6 +46,7 @@ def add_quadkey(con): ); """) + def add_country_iso(con, country_parquet_path): # Load country parquet file into duckdb con.execute(f"CREATE TABLE countries AS SELECT * FROM read_parquet('{country_parquet_path}')") @@ -88,8 +91,8 @@ def process_parquet_file(input_parquet_path, output_folder, country_parquet_path con.execute('LOAD spatial;') - # Load parquet file into duckdb - con.execute(f"CREATE TABLE buildings AS SELECT * FROM read_parquet('{input_parquet_path}')") + # NOTE: exclude names column because it's all NULL and causes InternalException: INTERNAL Error: Attempted to dereference unique_ptr that is NULL! + con.execute(f"CREATE OR REPLACE TABLE buildings AS SELECT * EXCLUDE(names) FROM read_parquet('{input_parquet_path}')") if add_quadkey_option: add_quadkey(con) @@ -126,7 +129,14 @@ def process_parquet_files(input_path, output_folder, country_parquet_path, overw process_parquet_file(input_path, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option, verbose) # Call the function - uncomment if you want to call this directly from python and put values in here. +import pathlib + +release_version = "overture_02-15" # Example version, adjust as necessary +data_dir = pathlib.Path.home() / "data" / "src" / f"{release_version}" / "theme=buildings" / "type=building" +out_dir = pathlib.Path.home() / "data" / "prc" / f"{release_version}" / "theme=buildings" / "type=building" + +input_path = data_dir / "part-00041-a34b09ea-399f-4872-b0b1-084a81bbb42f-c000.zstd.parquet" #input_path = '/Volumes/fastdata/overture/s3-data/buildings/' #output_folder = '/Volumes/fastdata/overture/refined-parquet/' #country_parquet_path = '/Volumes/fastdata/overture/countries.parquet' -#process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=True, add_country_iso_option=True) \ No newline at end of file +process_parquet_files(input_path, out_dir, "", overwrite=False, add_quadkey_option=True, add_country_iso_option=False) \ No newline at end of file diff --git a/open_buildings/overture/extract_admin_bounds.py b/open_buildings/overture/extract_admin_bounds.py new file mode 100644 index 0000000..42bd60d --- /dev/null +++ b/open_buildings/overture/extract_admin_bounds.py @@ -0,0 +1,93 @@ +import argparse +import pathlib + +import duckdb +import geopandas as gpd +from shapely import wkb + + +def main(admin_level, overture_dir, out_dir): + con = duckdb.connect(database=":memory:", read_only=False) + con.execute("INSTALL spatial;") + con.execute("LOAD spatial;") + + out_dir = pathlib.Path(out_dir).expanduser() + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + # Common part of the query + common_query = f""" + WITH admins_view AS ( + SELECT * FROM read_parquet('{str(overture_dir)}/theme=admins/type=*/*') + ) + SELECT + admins.id, + admins.isoCountryCodeAlpha2, + admins.names, + """ + + # Conditional part of the query based on admin_level + if admin_level == 2: + admin_level_query = "admins.isoSubCountryCode," + else: + admin_level_query = "" + + # Final part of the query + final_query = f""" + areas.areaGeometry as geometry + FROM admins_view AS admins + INNER JOIN ( + SELECT + id as areaId, + localityId, + geometry AS areaGeometry + FROM admins_view + ) AS areas ON areas.localityId = admins.id + WHERE admins.adminLevel = {admin_level}; + """ + + # Combine all parts to form the final query + query = common_query + admin_level_query + final_query + + # Execute the query + admins = con.execute(query).fetchdf() + + # Convert to GeoDataFrame and process geometry + admins = gpd.GeoDataFrame( + admins, + geometry=admins["geometry"].apply(lambda b: wkb.loads(bytes(b))), + crs="EPSG:4326", + ) + + # Process names and drop the original column + admins["primary_name"] = admins.names.map(lambda r: r["primary"]) + admins = admins.drop(columns=["names"]) + + # Write the output + outpath = out_dir / f"admin_bounds_level_{admin_level}.parquet" + print(f"Writing admin boundaries level {admin_level} to: {outpath}") + admins.to_parquet(outpath) + + +if __name__ == "__main__": + + # NOTE: href or dir? + # OVERTURE_HREF = str(pathlib.Path("~/data/src/overture/2024-02-15-alpha.0").expanduser()) + OVERTURE_HREF = "s3://overturemaps-us-west-2/release/2024-02-15-alpha.0" + OUT_DIR = pathlib.Path("~/data/prc/overture/2024-02-15").expanduser() + + parser = argparse.ArgumentParser(description="Process Overture admin level data.") + parser.add_argument("admin_level", type=int, help="The admin level to process") + parser.add_argument( + "-s", + "--source", + default=f"{OVERTURE_HREF}", + help="The source Overture directory", + ) + parser.add_argument( + "-o", "--output", default=f"{OUT_DIR}", help="The output directory" + ) + + args = parser.parse_args() + + main(args.admin_level, args.source, args.output)