Merge pull request #236 from tm-jc-nacpil/feat/exactextract_zonal_stats

[WIP] Exactextract zonal stats implementation
thinkingmachines · Jul 12, 2024 · 93c8e12 · 93c8e12
2 parents 8d3c6f3 + 6eb1b3e
commit 93c8e12
Show file tree

Hide file tree

Showing 6 changed files with 1,511 additions and 78 deletions.
diff --git a/data/ph_s5p_AER_AI_340_380.tiff b/data/ph_s5p_AER_AI_340_380.tiff
diff --git a/geowrangler/_modidx.py b/geowrangler/_modidx.py
@@ -157,10 +157,15 @@
                                                                                                     'geowrangler/raster_to_dataframe.py'),
                                                  'geowrangler.raster_to_dataframe.read_bands': ( 'raster_to_dataframe.html#read_bands',
                                                                                                  'geowrangler/raster_to_dataframe.py')},
-            'geowrangler.raster_zonal_stats': { 'geowrangler.raster_zonal_stats.create_raster_zonal_stats': ( 'raster_zonal_stats.html#create_raster_zonal_stats',
+            'geowrangler.raster_zonal_stats': { 'geowrangler.raster_zonal_stats._validate_aggs': ( 'raster_zonal_stats.html#_validate_aggs',
+                                                                                                   'geowrangler/raster_zonal_stats.py'),
+                                                'geowrangler.raster_zonal_stats.create_exactextract_zonal_stats': ( 'raster_zonal_stats.html#create_exactextract_zonal_stats',
+                                                                                                                    'geowrangler/raster_zonal_stats.py'),
+                                                'geowrangler.raster_zonal_stats.create_raster_zonal_stats': ( 'raster_zonal_stats.html#create_raster_zonal_stats',
                                                                                                               'geowrangler/raster_zonal_stats.py')},
             'geowrangler.spatialjoin_highest_intersection': { 'geowrangler.spatialjoin_highest_intersection.get_highest_intersection': ( 'spatialjoin_highest_intersection.html#get_highest_intersection',
                                                                                                                                          'geowrangler/spatialjoin_highest_intersection.py')},
+            'geowrangler.test_module': {},
             'geowrangler.tile_clustering': { 'geowrangler.tile_clustering.TileClustering': ( 'tile_clustering.html#tileclustering',
                                                                                              'geowrangler/tile_clustering.py'),
                                              'geowrangler.tile_clustering.TileClustering.__init__': ( 'tile_clustering.html#tileclustering.__init__',

diff --git a/geowrangler/raster_zonal_stats.py b/geowrangler/raster_zonal_stats.py
@@ -1,16 +1,20 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../notebooks/03_raster_zonal_stats.ipynb.
 
 # %% auto 0
-__all__ = ['create_raster_zonal_stats']
+__all__ = ['create_raster_zonal_stats', 'create_exactextract_zonal_stats']
 
 # %% ../notebooks/03_raster_zonal_stats.ipynb 8
 from pathlib import Path
-from typing import Any, Dict, Union
+from typing import Any, Dict, Union, List
+import warnings
+import json
 
 import geopandas as gpd
 import pandas as pd
 import rasterio
 import rasterstats as rs
+from exactextract import exact_extract
+from exactextract.raster import RasterioRasterSource
 
 from .vector_zonal_stats import _expand_aggs, _fillnas, _fix_agg
 
@@ -98,3 +102,171 @@ def create_raster_zonal_stats(
     aoi = aoi.merge(results, how="left", left_index=True, right_index=True)
 
     return aoi
+
+# %% ../notebooks/03_raster_zonal_stats.ipynb 30
+def _validate_aggs(aggregation, band_count):
+    "Validate aggregations based on band count, dropping invalid entries"
+    aggregation_validated = []
+
+    # Iterate over the list of dictionaries
+    for agg in aggregation:
+        # Check if the 'band' value is less than or equal to band_count
+        if agg["band"] > band_count:
+            warnings.warn(
+                f"Band number in {json.dumps(agg)}  is invalid. Aggregation will be skipped."
+            )
+            continue
+        aggregation_validated.append(agg)
+
+    return aggregation_validated
+
+# %% ../notebooks/03_raster_zonal_stats.ipynb 31
+def create_exactextract_zonal_stats(
+    aoi: Union[
+        str, gpd.GeoDataFrame
+    ],  # The area of interest geodataframe, or path to the vector file
+    data: Union[str, Path],  # The path to the raster data file
+    aggregation: List[Dict[str, Any]],  # List of agg specs,
+    include_cols: List[
+        str
+    ] = None,  # If not None, list of columns from input AOI to include in output
+    include_geom: bool = True,  # If false, drop geometry column. include_cols takes priority.
+    extra_args: dict = dict(
+        strategy="feature-sequential", max_cells_in_memory=30000000
+    ),  # Extra arguments to pass to `exactextract.exact_extract(). Ignores output, include_geom, and include_cols.
+) -> gpd.GeoDataFrame:
+    """
+    Computes zonal statistics from raster data sources using vector areas of interest (AOI).
+
+    This function is a wrapper over the `exact_extract` method from the `exactextract` Python package,
+    designed for compatibility with other geowrangler modules. It takes a list of agg specs,
+    with each agg spec applied to a specific band.
+    See https://github.com/isciences/exactextract/blob/master/python/README.md for more details.
+
+    Parameters
+    ----------
+    aoi : GeoDataframe
+        A GeoDataframe specifying geometries.
+    data : str
+        Path to the raster file.
+    aggregation: list[Dict]
+        list of aggregation specs, which are dictionaries which specify the ff.
+        - band (int): band number of the input raster to aggregate
+        - func (list[str]): list of operations to perform (i.e. "mean", "median", "count"). For a full list of
+        possible exactextract functions, see: https://github.com/isciences/exactextract/tree/master?tab=readme-ov-file#supported-statistics
+        - output (str or list[str], option): output columns for operation.
+            If None (default), the output column name will be given as band_<band number>_<func> for all functions.
+            If str, the output column name will be given as <output>_<func> for all functions.
+            If list(str), the list items will be used as the column names, in order of the passed funcs.
+        - nodata_val (int or float, optional): nodata value to use in calculation.
+    include_cols: list[str]
+        List of columns from AOI to include in output. If none (default), all columns are included
+    include_geom:
+        If False, drop geometry column. If used together with include_cols, include_cols takes priority.
+    extra_args : dict
+        Extra arguments to pass to exactextract.exact_extract(). "include_cols", "include_geom", and "output" arguments are ignored.
+
+
+    Example usage
+    --------
+    create_exactextract_zonal_stats(
+        aoi=aoi_gdf,
+        data="path/to/raster.tif",
+        aggregation=[
+            dict(band=1, func=["mean", "sum"], nodata_val=-9999) # default - band_1_mean, band_1_sum
+            dict(band=1, func=["mean", "sum"], output=["pop_mean", "pop_sum"]),
+            dict(band=2, func=["mean", "sum"]) # default - band_2_mean, band_2_sum
+            dict(band=2, func=["mean", "sum"], output="prefix") # prefix_mean, prefix_sum
+        ],
+    )
+    """
+    # TODO: implement NODATA handling after exactextract is updated in pypi
+
+    # Handle extra arguments to exactextract
+    if "weights" in extra_args:
+        warnings.warn(
+            "Input weights raster will be used for all passed aggregations. To use different weights for each aggregation, call `get_exactextract_zonal_stats` multiple times."
+        )
+    if "include_geom" in extra_args:
+        extra_args.pop("include_geom")
+        warnings.warn(
+            "include_geom in `extra_args` is ignored. Use `include_geom` in the function parameter instead."
+        )
+    if "include_cols" in extra_args:
+        extra_args.pop("include_cols")
+        warnings.warn(
+            "include_cols in `extra_args` is ignored. Use `include_cols` in the function parameter instead."
+        )
+    if "output" in extra_args:
+        extra_args.pop("output")
+        warnings.warn(
+            "output in `extra_args` is ignored. Output is set to 'pandas'. Refer to `exactextract.exact_extract()` to use other output options."
+        )
+
+    # Open
+    if isinstance(aoi, str) or isinstance(aoi, Path):
+        aoi = gpd.read_file(aoi)
+
+    # Open raster and run exactextract
+    with rasterio.open(data) as dst:
+        # Validate passed aggregations based on band count and compile
+        aggregation = _validate_aggs(aggregation, dst.count)
+        all_operations = set()
+        for agg in aggregation:
+            all_operations.update(agg["func"])
+        all_operations = sorted(all_operations)
+
+        # Check CRS alignment
+        if aoi.crs != dst.crs:
+            warnings.warn(
+                f"The CRS of the AOI ({aoi.crs}) and the raster data ({dst.crs}) do not match!"
+            )
+
+        # Run exactextract
+        results = exact_extract(
+            data, aoi, all_operations, output="pandas", **extra_args
+        )
+        # If input is single band, the output is processed to band_1_<func> for all funcs
+        # This matches default output in multiband case
+        if dst.count == 1:
+            results = results.rename(
+                columns={col: f"band_1_{col}" for col in results.columns}
+            )
+
+    # Create new columns as specified by agg specs
+    # Each renamed column will be a copy of its corresponding result column
+    out_cols = []
+    for agg in aggregation:
+        result_cols = [f"band_{agg['band']}_{func}" for func in agg["func"]]
+
+        if "output" not in agg:
+            agg_out_cols = result_cols
+
+        elif isinstance(agg["output"], str):
+            agg_out_cols = [f"{agg['output']}_{func}" for func in agg["func"]]
+
+        elif isinstance(agg["output"], list) and all(
+            isinstance(item, str) for item in agg.get("output")
+        ):
+            assert len(agg["output"]) == len(
+                agg["func"]
+            ), f"Output list only has {len(agg['output'])}, expected {len(agg['func'])}"
+            agg_out_cols = agg.get("output")
+
+        # Add output columns to result
+        for agg_out_col, result_col in zip(agg_out_cols, result_cols):
+            results[agg_out_col] = results[result_col]
+        out_cols.extend(agg_out_cols)
+    output_results = results.loc[:, out_cols]
+
+    # Apply include_cols and include_geom
+    if include_cols is not None:
+        aoi = aoi.loc[:, include_cols]
+    try:
+        active_geom_col = aoi.active_geometry_name
+    except:
+        active_geom_col = None
+    if (not include_geom) and (active_geom_col is not None):
+        aoi = aoi.drop(active_geom_col, axis=1)
+
+    return aoi.merge(output_results, how="left", left_index=True, right_index=True)
diff --git a/notebooks/03_raster_zonal_stats.ipynb b/notebooks/03_raster_zonal_stats.ipynb
diff --git a/settings.ini b/settings.ini
@@ -27,7 +27,7 @@ keywords = nbdev jupyter notebook python
 language = English
 status = 3
 user = thinkingmachines
-requirements = fastcore pandas numpy geopandas fastprogress h3 morecantile loguru rasterstats scikit-learn requests pyarrow
+requirements = fastcore pandas numpy geopandas fastprogress h3 morecantile loguru rasterstats scikit-learn requests pyarrow exactextract
 dev_requirements = nbdev jupyterlab matplotlib nbdime ipytest branca folium mapclassify pytest pytest-mock pytest-cov pytest-xdist black[jupyter]
 readme_nb = readme.ipynb
 allowed_metadata_keys = 

diff --git a/tests/test_raster_zonal_stats.py b/tests/test_raster_zonal_stats.py
@@ -131,3 +131,160 @@ def test_raster_zonal_stats_nodata_default():
         ),
     )
     assert results["population_count"].iloc[0] > 0
+
+# exactextract tests
+def test_create_exactextract_zonal_stats(simple_aoi):
+    terrain_file = "data/sample_terrain.tif"
+    results = rzs.create_exactextract_zonal_stats(
+        simple_aoi,
+        terrain_file,
+        aggregation=[
+            dict(band=1, func=["mean", "max", "min", "stdev"], output="elevation"),
+            dict(band=2, func=["mean", "max", "min", "stdev"], output="elevation")
+        ],    
+    )
+    assert list(results.columns.values) == [
+        "col1",
+        "lat0",
+        "lon0",
+        "lat1",
+        "lon1",
+        "lat2",
+        "lon2",
+        "lat3",
+        "lon3",
+        "geometry",
+        "elevation_mean",
+        "elevation_max",
+        "elevation_min",
+        "elevation_stdev",
+    ]
+
+def test_create_exactextract_zonal_stats_from_file():
+    terrain_file = "data/sample_terrain.tif"
+    file_aoi = "data/simple_aoi.geojson"
+    results = rzs.create_exactextract_zonal_stats(
+        file_aoi,
+        terrain_file,
+        aggregation=[dict(band=1, func=["mean", "min", "max", "stdev"], output="elevation")],
+    )
+    print(results)
+    assert list(results.columns.values) == [
+        "col1",
+        "lat0",
+        "lon0",
+        "lat1",
+        "lon1",
+        "lat2",
+        "lon2",
+        "lat3",
+        "lon3",
+        "geometry",
+        "elevation_mean",
+        "elevation_min",
+        "elevation_max",
+        "elevation_stdev",
+    ]
+
+def test_exactextract_zonal_stats_multiband():
+    aq_file = "data/ph_s5p_AER_AI_340_380.tiff"
+    file_aoi = "data/region3_admin.geojson"
+    file_aoi = gpd.read_file(file_aoi).to_crs("EPSG:3857")
+    results = rzs.create_exactextract_zonal_stats(
+        file_aoi,
+        aq_file,
+        aggregation=[
+            dict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
+            dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
+            dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
+            dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
+        ],
+    )
+    assert list(results.columns.values) == [
+        "Reg_Code",
+        "Reg_Name",
+        "Reg_Alt_Name",
+        "geometry",
+        "band_1_mean",
+        "band_1_sum",
+        "band_2_mean",
+        "band_2_sum",
+        "prefix_mean",
+        "prefix_sum",
+        "aer_ai_mean",
+        "aer_ai_sum",
+        "aer_ai_count"
+    ]
+
+def test_exactextract_zonal_stats_multiband_crs_mismatch():
+    aq_file = "data/ph_s5p_AER_AI_340_380.tiff"
+    file_aoi = "data/region3_admin.geojson"
+    file_aoi = gpd.read_file(file_aoi).to_crs("EPSG:4326")
+    results = rzs.create_exactextract_zonal_stats(
+        file_aoi,
+        aq_file,
+        aggregation=[
+            dict(band=1, func=["mean", "sum"], nodata_val=-9999), # default - band1_mean, band1_sum
+            dict(band=2, func=["mean", "sum"]), # default - band2_mean, band2_sum
+            dict(band=2, func=["mean", "sum"], output="prefix"), # prefix_mean, prefix_sum
+            dict(band=1, func=["mean", "sum", "count"], output=["aer_ai_mean", "aer_ai_sum", "aer_ai_count"]),
+        ],
+    )
+    assert list(results.columns.values) == [
+        "Reg_Code",
+        "Reg_Name",
+        "Reg_Alt_Name",
+        "geometry",
+        "band_1_mean",
+        "band_1_sum",
+        "band_2_mean",
+        "band_2_sum",
+        "prefix_mean",
+        "prefix_sum",
+        "aer_ai_mean",
+        "aer_ai_sum",
+        "aer_ai_count"
+    ]
+
+def test_exactextract_geojson(simple_aoi):
+    "Check if output returns pandas output despite output option"
+    terrain_file = "data/sample_terrain.tif"
+    results = rzs.create_exactextract_zonal_stats(
+        simple_aoi,
+        terrain_file,
+        aggregation=[
+            dict(band=1, func=["sum"], output="elevation"),
+        ],
+        extra_args = dict(output="geojson")
+    )
+    assert isinstance(results, (pd.DataFrame, gpd.GeoDataFrame))
+
+def test_exactextract_include_cols(simple_aoi):
+    "Check if output returns specified columns"
+    terrain_file = "data/sample_terrain.tif"
+    results = rzs.create_exactextract_zonal_stats(
+        simple_aoi,
+        terrain_file,
+        aggregation=[
+            dict(band=1, func=["sum"], output="elevation"),
+        ],
+        include_cols=["col1", "lat0"]
+    )
+    assert list(results.columns.values) == [
+        "col1",
+        "lat0",
+        "elevation_sum",
+    ]
+
+def test_exactextract_include_geom(simple_aoi):
+    "Check if output returns pandas output despite output option"
+    terrain_file = "data/sample_terrain.tif"
+    results = rzs.create_exactextract_zonal_stats(
+        simple_aoi,
+        terrain_file,
+        aggregation=[
+            dict(band=1, func=["sum"], output="elevation"),
+        ],
+        extra_args = dict(include_geom=True)
+    )
+    assert isinstance(results, (pd.DataFrame, gpd.GeoDataFrame))