From e26df7907ef60f96655550a01f044120b93ad99d Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 4 Nov 2023 00:10:26 +0100 Subject: [PATCH 1/2] Some performance quick wins for the geopandas implementation --- open_buildings/google/process.py | 15 ++++++++------- requirements.txt | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/open_buildings/google/process.py b/open_buildings/google/process.py index cf82333..d3a998c 100644 --- a/open_buildings/google/process.py +++ b/open_buildings/google/process.py @@ -210,14 +210,13 @@ def process_with_pandas( input_file_path, split_multipolygons, verbose, format, output_file_path ): df = pd.read_csv(input_file_path) - df['geometry'] = df['geometry'].apply(wkt.loads) + gs = gpd.GeoSeries.from_wkt(df['geometry']) - # Drop the 'latitude' and 'longitude' columns - df = df.drop(['latitude', 'longitude'], axis=1) + # Drop the 'latitude', 'longitude' and 'geometry' columns + df = df.drop(['latitude', 'longitude', 'geometry'], axis=1) # Convert the DataFrame to a GeoDataFrame - gdf = gpd.GeoDataFrame(df, geometry='geometry') - gdf.set_crs("EPSG:4326", inplace=True) + gdf = gpd.GeoDataFrame(df, geometry=gs, crs="EPSG:4326") # Create an empty GeoDataFrame for the output output_gdf = gpd.GeoDataFrame(columns=list(gdf.columns), crs=gdf.crs) @@ -295,11 +294,13 @@ def process_with_pandas( ) # Write the output GeoDataFrame to a file if format == 'fgb': - output_gdf.to_file(output_file_path, driver="FlatGeobuf") + output_gdf.to_file(output_file_path, driver="FlatGeobuf", engine="pyogrio") elif format == 'parquet': output_gdf.to_parquet(output_file_path, compression=PARQUET_COMPRESSION) elif format == 'gpkg': - output_gdf.to_file(output_file_path, driver='GPKG') + output_gdf.to_file( + output_file_path, driver='GPKG', engine="pyogrio", spatial_index=False + ) elif format == 'shp': output_gdf.to_file(output_file_path, driver='ESRI Shapefile') diff --git a/requirements.txt b/requirements.txt index 56110fd..520f6c2 100644 
--- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ click duckdb pandas geopandas +pyogrio shapely openlocationcode tabulate From d08dc002305e6d91e6e0c89ae4398006d0c62383 Mon Sep 17 00:00:00 2001 From: Pieter Roggemans Date: Sat, 4 Nov 2023 00:10:41 +0100 Subject: [PATCH 2/2] Use the pyogrio engine for Shapefile output in process.py --- open_buildings/google/process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_buildings/google/process.py b/open_buildings/google/process.py index d3a998c..64d5f40 100644 --- a/open_buildings/google/process.py +++ b/open_buildings/google/process.py @@ -302,7 +302,7 @@ def process_with_pandas( output_file_path, driver='GPKG', engine="pyogrio", spatial_index=False ) elif format == 'shp': - output_gdf.to_file(output_file_path, driver='ESRI Shapefile') + output_gdf.to_file(output_file_path, driver='ESRI Shapefile', engine="pyogrio") def process_with_ogr2ogr(