Skip to content

Commit

Permalink
Merge pull request #236 from tm-jc-nacpil/feat/exactextract_zonal_stats
Browse files Browse the repository at this point in the history
[WIP] Exactextract zonal stats implementation
  • Loading branch information
butchtm authored Jul 12, 2024
2 parents 8d3c6f3 + 6eb1b3e commit 93c8e12
Show file tree
Hide file tree
Showing 6 changed files with 1,511 additions and 78 deletions.
Binary file added data/ph_s5p_AER_AI_340_380.tiff
Binary file not shown.
7 changes: 6 additions & 1 deletion geowrangler/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,15 @@
'geowrangler/raster_to_dataframe.py'),
'geowrangler.raster_to_dataframe.read_bands': ( 'raster_to_dataframe.html#read_bands',
'geowrangler/raster_to_dataframe.py')},
'geowrangler.raster_zonal_stats': { 'geowrangler.raster_zonal_stats.create_raster_zonal_stats': ( 'raster_zonal_stats.html#create_raster_zonal_stats',
'geowrangler.raster_zonal_stats': { 'geowrangler.raster_zonal_stats._validate_aggs': ( 'raster_zonal_stats.html#_validate_aggs',
'geowrangler/raster_zonal_stats.py'),
'geowrangler.raster_zonal_stats.create_exactextract_zonal_stats': ( 'raster_zonal_stats.html#create_exactextract_zonal_stats',
'geowrangler/raster_zonal_stats.py'),
'geowrangler.raster_zonal_stats.create_raster_zonal_stats': ( 'raster_zonal_stats.html#create_raster_zonal_stats',
'geowrangler/raster_zonal_stats.py')},
'geowrangler.spatialjoin_highest_intersection': { 'geowrangler.spatialjoin_highest_intersection.get_highest_intersection': ( 'spatialjoin_highest_intersection.html#get_highest_intersection',
'geowrangler/spatialjoin_highest_intersection.py')},
'geowrangler.test_module': {},
'geowrangler.tile_clustering': { 'geowrangler.tile_clustering.TileClustering': ( 'tile_clustering.html#tileclustering',
'geowrangler/tile_clustering.py'),
'geowrangler.tile_clustering.TileClustering.__init__': ( 'tile_clustering.html#tileclustering.__init__',
Expand Down
176 changes: 174 additions & 2 deletions geowrangler/raster_zonal_stats.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: ../notebooks/03_raster_zonal_stats.ipynb.

# %% auto 0
__all__ = ['create_raster_zonal_stats']
__all__ = ['create_raster_zonal_stats', 'create_exactextract_zonal_stats']

# %% ../notebooks/03_raster_zonal_stats.ipynb 8
from pathlib import Path
from typing import Any, Dict, Union
from typing import Any, Dict, Union, List
import warnings
import json

import geopandas as gpd
import pandas as pd
import rasterio
import rasterstats as rs
from exactextract import exact_extract
from exactextract.raster import RasterioRasterSource

from .vector_zonal_stats import _expand_aggs, _fillnas, _fix_agg

Expand Down Expand Up @@ -98,3 +102,171 @@ def create_raster_zonal_stats(
aoi = aoi.merge(results, how="left", left_index=True, right_index=True)

return aoi

# %% ../notebooks/03_raster_zonal_stats.ipynb 30
def _validate_aggs(aggregation, band_count):
"Validate aggregations based on band count, dropping invalid entries"
aggregation_validated = []

# Iterate over the list of dictionaries
for agg in aggregation:
# Check if the 'band' value is less than or equal to band_count
if agg["band"] > band_count:
warnings.warn(
f"Band number in {json.dumps(agg)} is invalid. Aggregation will be skipped."
)
continue
aggregation_validated.append(agg)

return aggregation_validated

# %% ../notebooks/03_raster_zonal_stats.ipynb 31
def create_exactextract_zonal_stats(
aoi: Union[
str, gpd.GeoDataFrame
], # The area of interest geodataframe, or path to the vector file
data: Union[str, Path], # The path to the raster data file
aggregation: List[Dict[str, Any]], # List of agg specs,
include_cols: List[
str
] = None, # If not None, list of columns from input AOI to include in output
include_geom: bool = True, # If false, drop geometry column. include_cols takes priority.
extra_args: dict = dict(
strategy="feature-sequential", max_cells_in_memory=30000000
), # Extra arguments to pass to `exactextract.exact_extract(). Ignores output, include_geom, and include_cols.
) -> gpd.GeoDataFrame:
"""
Computes zonal statistics from raster data sources using vector areas of interest (AOI).
This function is a wrapper over the `exact_extract` method from the `exactextract` Python package,
designed for compatibility with other geowrangler modules. It takes a list of agg specs,
with each agg spec applied to a specific band.
See https://github.com/isciences/exactextract/blob/master/python/README.md for more details.
Parameters
----------
aoi : GeoDataframe
A GeoDataframe specifying geometries.
data : str
Path to the raster file.
aggregation: list[Dict]
list of aggregation specs, which are dictionaries which specify the ff.
- band (int): band number of the input raster to aggregate
- func (list[str]): list of operations to perform (i.e. "mean", "median", "count"). For a full list of
possible exactextract functions, see: https://github.com/isciences/exactextract/tree/master?tab=readme-ov-file#supported-statistics
- output (str or list[str], option): output columns for operation.
If None (default), the output column name will be given as band_<band number>_<func> for all functions.
If str, the output column name will be given as <output>_<func> for all functions.
If list(str), the list items will be used as the column names, in order of the passed funcs.
- nodata_val (int or float, optional): nodata value to use in calculation.
include_cols: list[str]
List of columns from AOI to include in output. If none (default), all columns are included
include_geom:
If False, drop geometry column. If used together with include_cols, include_cols takes priority.
extra_args : dict
Extra arguments to pass to exactextract.exact_extract(). "include_cols", "include_geom", and "output" arguments are ignored.
Example usage
--------
create_exactextract_zonal_stats(
aoi=aoi_gdf,
data="path/to/raster.tif",
aggregation=[
dict(band=1, func=["mean", "sum"], nodata_val=-9999) # default - band_1_mean, band_1_sum
dict(band=1, func=["mean", "sum"], output=["pop_mean", "pop_sum"]),
dict(band=2, func=["mean", "sum"]) # default - band_2_mean, band_2_sum
dict(band=2, func=["mean", "sum"], output="prefix") # prefix_mean, prefix_sum
],
)
"""
# TODO: implement NODATA handling after exactextract is updated in pypi

# Handle extra arguments to exactextract
if "weights" in extra_args:
warnings.warn(
"Input weights raster will be used for all passed aggregations. To use different weights for each aggregation, call `get_exactextract_zonal_stats` multiple times."
)
if "include_geom" in extra_args:
extra_args.pop("include_geom")
warnings.warn(
"include_geom in `extra_args` is ignored. Use `include_geom` in the function parameter instead."
)
if "include_cols" in extra_args:
extra_args.pop("include_cols")
warnings.warn(
"include_cols in `extra_args` is ignored. Use `include_cols` in the function parameter instead."
)
if "output" in extra_args:
extra_args.pop("output")
warnings.warn(
"output in `extra_args` is ignored. Output is set to 'pandas'. Refer to `exactextract.exact_extract()` to use other output options."
)

# Open
if isinstance(aoi, str) or isinstance(aoi, Path):
aoi = gpd.read_file(aoi)

# Open raster and run exactextract
with rasterio.open(data) as dst:
# Validate passed aggregations based on band count and compile
aggregation = _validate_aggs(aggregation, dst.count)
all_operations = set()
for agg in aggregation:
all_operations.update(agg["func"])
all_operations = sorted(all_operations)

# Check CRS alignment
if aoi.crs != dst.crs:
warnings.warn(
f"The CRS of the AOI ({aoi.crs}) and the raster data ({dst.crs}) do not match!"
)

# Run exactextract
results = exact_extract(
data, aoi, all_operations, output="pandas", **extra_args
)
# If input is single band, the output is processed to band_1_<func> for all funcs
# This matches default output in multiband case
if dst.count == 1:
results = results.rename(
columns={col: f"band_1_{col}" for col in results.columns}
)

# Create new columns as specified by agg specs
# Each renamed column will be a copy of its corresponding result column
out_cols = []
for agg in aggregation:
result_cols = [f"band_{agg['band']}_{func}" for func in agg["func"]]

if "output" not in agg:
agg_out_cols = result_cols

elif isinstance(agg["output"], str):
agg_out_cols = [f"{agg['output']}_{func}" for func in agg["func"]]

elif isinstance(agg["output"], list) and all(
isinstance(item, str) for item in agg.get("output")
):
assert len(agg["output"]) == len(
agg["func"]
), f"Output list only has {len(agg['output'])}, expected {len(agg['func'])}"
agg_out_cols = agg.get("output")

# Add output columns to result
for agg_out_col, result_col in zip(agg_out_cols, result_cols):
results[agg_out_col] = results[result_col]
out_cols.extend(agg_out_cols)
output_results = results.loc[:, out_cols]

# Apply include_cols and include_geom
if include_cols is not None:
aoi = aoi.loc[:, include_cols]
try:
active_geom_col = aoi.active_geometry_name
except:
active_geom_col = None
if (not include_geom) and (active_geom_col is not None):
aoi = aoi.drop(active_geom_col, axis=1)

return aoi.merge(output_results, how="left", left_index=True, right_index=True)
1,247 changes: 1,173 additions & 74 deletions notebooks/03_raster_zonal_stats.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion settings.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
language = English
status = 3
user = thinkingmachines
requirements = fastcore pandas numpy geopandas fastprogress h3 morecantile loguru rasterstats scikit-learn requests pyarrow
requirements = fastcore pandas numpy geopandas fastprogress h3 morecantile loguru rasterstats scikit-learn requests pyarrow exactextract
dev_requirements = nbdev jupyterlab matplotlib nbdime ipytest branca folium mapclassify pytest pytest-mock pytest-cov pytest-xdist black[jupyter]
readme_nb = readme.ipynb
allowed_metadata_keys =
Expand Down
157 changes: 157 additions & 0 deletions tests/test_raster_zonal_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,160 @@ def test_raster_zonal_stats_nodata_default():
),
)
assert results["population_count"].iloc[0] > 0

# exactextract tests
def test_create_exactextract_zonal_stats(simple_aoi):
terrain_file = "data/sample_terrain.tif"
results = rzs.create_exactextract_zonal_stats(
simple_aoi,
terrain_file,
aggregation=[
dict(band=1, func=["mean", "max", "min", "stdev"], output="elevation"),
dict(band=2, func=["mean", "max", "min", "stdev"], output="elevation")
],
)
assert list(results.columns.values) == [
"col1",
"lat0",
"lon0",
"lat1",
"lon1",
"lat2",
"lon2",
"lat3",
"lon3",
"geometry",
"elevation_mean",
"elevation_max",
"elevation_min",
"elevation_stdev",
]

def test_create_exactextract_zonal_stats_from_file():
terrain_file = "data/sample_terrain.tif"
file_aoi = "data/simple_aoi.geojson"
results = rzs.create_exactextract_zonal_stats(
file_aoi,
terrain_file,
aggregation=[dict(band=1, func=["mean", "min", "max", "stdev"], output="elevation")],
)
print(results)
assert list(results.columns.values) == [
"col1",
"lat0",
"lon0",
"lat1",
"lon1",
"lat2",
"lon2",
"lat3",
"lon3",
"geometry",
"elevation_mean",
"elevation_min",
"elevation_max",
"elevation_stdev",
]

def test_exactextract_zonal_stats_multiband():
aq_file = "data/ph_s5p_AER_AI_340_380.tiff"
file_aoi = "data/region3_admin.geojson"
file_aoi = gpd.read_file(file_aoi).to_crs("EPSG:3857")
results = rzs.create_exactextract_zonal_stats(
file_aoi,
aq_file,
aggregation=[
dict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
],
)
assert list(results.columns.values) == [
"Reg_Code",
"Reg_Name",
"Reg_Alt_Name",
"geometry",
"band_1_mean",
"band_1_sum",
"band_2_mean",
"band_2_sum",
"prefix_mean",
"prefix_sum",
"aer_ai_mean",
"aer_ai_sum",
"aer_ai_count"
]

def test_exactextract_zonal_stats_multiband_crs_mismatch():
aq_file = "data/ph_s5p_AER_AI_340_380.tiff"
file_aoi = "data/region3_admin.geojson"
file_aoi = gpd.read_file(file_aoi).to_crs("EPSG:4326")
results = rzs.create_exactextract_zonal_stats(
file_aoi,
aq_file,
aggregation=[
dict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
],
)
assert list(results.columns.values) == [
"Reg_Code",
"Reg_Name",
"Reg_Alt_Name",
"geometry",
"band_1_mean",
"band_1_sum",
"band_2_mean",
"band_2_sum",
"prefix_mean",
"prefix_sum",
"aer_ai_mean",
"aer_ai_sum",
"aer_ai_count"
]

def test_exactextract_geojson(simple_aoi):
"Check if output returns pandas output despite output option"
terrain_file = "data/sample_terrain.tif"
results = rzs.create_exactextract_zonal_stats(
simple_aoi,
terrain_file,
aggregation=[
dict(band=1, func=["sum"], output="elevation"),
],
extra_args = dict(output="geojson")
)
assert isinstance(results, (pd.DataFrame, gpd.GeoDataFrame))

def test_exactextract_include_cols(simple_aoi):
"Check if output returns specified columns"
terrain_file = "data/sample_terrain.tif"
results = rzs.create_exactextract_zonal_stats(
simple_aoi,
terrain_file,
aggregation=[
dict(band=1, func=["sum"], output="elevation"),
],
include_cols=["col1", "lat0"]
)
assert list(results.columns.values) == [
"col1",
"lat0",
"elevation_sum",
]

def test_exactextract_include_geom(simple_aoi):
"Check if output returns pandas output despite output option"
terrain_file = "data/sample_terrain.tif"
results = rzs.create_exactextract_zonal_stats(
simple_aoi,
terrain_file,
aggregation=[
dict(band=1, func=["sum"], output="elevation"),
],
extra_args = dict(include_geom=True)
)
assert isinstance(results, (pd.DataFrame, gpd.GeoDataFrame))

0 comments on commit 93c8e12

Please sign in to comment.