perf: zonal stats

nicolasK · nicolasK · commit 97fe2f63f30c · 2024-08-09T17:39:18.000+02:00
diff --git a/earthdaily/earthdatastore/cube_utils/__init__.py b/earthdaily/earthdatastore/cube_utils/__init__.py
@@ -7,7 +7,7 @@
 from rasterio.enums import Resampling
 from shapely.geometry import box
 from .geometry_manager import GeometryManager
-from ._zonal import zonal_stats, zonal_stats_numpy
+from ._zonal import zonal_stats
 from .harmonizer import Harmonizer
 from .asset_mapper import AssetMapper
 import rioxarray
diff --git a/earthdaily/earthdatastore/cube_utils/_zonal.py b/earthdaily/earthdatastore/cube_utils/_zonal.py
@@ -9,7 +9,8 @@
 import numpy as np
 import xarray as xr
 import tqdm
-
+import logging
+import time
 from . import custom_reducers
 from .preprocessing import rasterize
 from scipy.sparse import csr_matrix
@@ -47,197 +48,134 @@ def _rasterize(gdf, dataset, all_touched=False):
     yx_pos = _indices_sparse(feats)
     return feats, yx_pos
 
-
-def zonal_stats_numpy(
-    dataset,
+def _memory_time_chunks(dataset, memory=None):  # number of dates per batch for a memory budget in Mo
+    if memory is None:
+        import psutil  # only needed when the available memory has to be measured
+        memory = psutil.virtual_memory().available/1e6
+        logging.debug(f"Hoping to use a maximum memory {memory}Mo.")
+    nbytes_per_date = int(dataset.nbytes/1e6)/dataset.time.size*3  # Mo per date; x3 presumably headroom - TODO confirm
+    max_time_chunks = max(1, int(np.arange(0,memory,nbytes_per_date+0.1).size))  # never 0: it is the arange step below
+    time_chunks = int(dataset.time.size/np.arange(0,dataset.time.size,max_time_chunks).size)  # dates / number of batches
+    logging.debug(f"Mo per date : {nbytes_per_date:0.2f}, total : {(nbytes_per_date*dataset.time.size):0.2f}.")
+    logging.debug(f"Time chunks : {time_chunks} (on {dataset.time.size} time).")
+    return time_chunks
+
+def _zonal_stats_numpy(
+    dataset:xr.Dataset,
     positions,
-    gdf,
-    operations=dict(mean=np.nanmean),
-    all_touched=False,
-):
-    def _get_field_dataset(positions, dataset):
-        for idx, pos in enumerate(positions):
-            if pos.size == 0:
-                continue
-            pos_xr = dict(
-                x=xr.DataArray(pos[1], dims="z"), y=xr.DataArray(pos[0], dims="z")
-            )
-            yield idx, dataset.isel(**pos_xr)
-
-    def _zonal_stats_from_field(dc_field, operations, idx):
-        return xr.concat(
-            [
-                getattr(dc_field, reducer)("z").expand_dims(
-                    feature=[idx], zonal_stats=[reducer]
-                )
-                for reducer in operations.keys()
-            ],
-            dim="zonal_stats",
-        )
-
-    def compute_zonal_stats_apply_ufunc(dataset, positions, reducers):
+    reducers:list=['mean'],
+    all_touched=False):
+    """Reduce the pixels listed in each entry of ``positions`` with every reducer; adds dims (feature, zonal_statistics)."""
+    def _zonal_stats_ufunc(dataset, positions, reducers):
         zs = []
         for idx in range(len(positions)):
             field_stats = []
             for reducer in reducers:
                 field_arr = dataset[..., *positions[idx]]
-                field_arr = reducer(field_arr, axis=-1)
+                func = f'nan{reducer}' if hasattr(np,f"nan{reducer}") else reducer  # prefer the NaN-aware numpy variant
+                field_arr = getattr(np, func)(field_arr, axis=-1)
                 field_stats.append(field_arr)
             field_stats = np.asarray(field_stats)
             zs.append(field_stats)
         zs = np.asarray(zs)
-        zs = zs.swapaxes(-2, 0)
+        zs = np.moveaxis(zs, (0, 1), (-2, -1))  # (feature, stat, ...) -> (..., feature, stat), any number of leading dims
         return zs
-
-    result = xr.apply_ufunc(
-        compute_zonal_stats_apply_ufunc,
-        dataset.to_dataarray(dim="band"),
+    # With dask="parallelized" every output core dim that is not an input dim must be sized,
+    # under its own name, in output_sizes; a key that is not an output core dim is rejected.
+    dask_ufunc = "parallelized"
+    # y and x are consumed by the reduction; feature and zonal_statistics are created.
+    zs = xr.apply_ufunc(
+        _zonal_stats_ufunc,
+        dataset,
         vectorize=False,
-        dask="forbidden",
-        input_core_dims=[["band", "y", "x"]],
-        output_core_dims=[["zonal_stats", "feature", "band"]],
+        dask=dask_ufunc,
+        input_core_dims=[["y","x"]],
+        output_core_dims=[["feature", "zonal_statistics"]],
         exclude_dims=set(["x", "y"]),
         output_dtypes=[float],
-        output_sizes=dict(feature=len(positions), zonal_stats=len(operations.values())),
-        kwargs=dict(reducers=operations.values(), positions=positions),
+        kwargs=dict(reducers=reducers, positions=positions),
+        dask_gufunc_kwargs={"allow_rechunk":True,
+                            "output_sizes":dict(feature=len(positions), zonal_statistics=len(reducers))}
     )
+    # Only the local name is released here; the caller keeps its own reference.
     del dataset
-    return result.to_dataset(dim="band")
-    # zs = []
-    # for idx, dc_field in tqdm.tqdm(_get_field_dataset(positions, dataset),total=gdf.shape[0], mininterval=1, desc="Zonal stats"):
-    #     zs.append(_zonal_stats_from_field(dc_field, operations, idx))
-    # zs = xr.concat(zs, dim='feature')
-    # return zs.transpose("feature", "time", "zonal_stats")
-
-
-def zonal_stats(
-    dataset,
-    gdf,
-    operations: list = ["mean"],
-    all_touched=False,
-    method="geocube",
-    verbose=False,
-    raise_missing_geometry=False,
-):
+
+    return zs
+
+def zonal_stats(dataset:xr.Dataset,
+                geoms,
+                method:str="numpy",
+                smart_load:bool=False,
+                memory:int = None,
+                reducers:list=['mean'],
+                all_touched = True):
     """
-
+    Compute zonal statistics of ``dataset`` over ``geoms`` with the "numpy" or "xvec" method.
 
     Parameters
     ----------
     dataset : xr.Dataset
         DESCRIPTION.
-    gdf : gpd.GeoDataFrame
+    geoms : gpd.GeoDataFrame
         DESCRIPTION.
-    operations : TYPE, list.
-        DESCRIPTION. The default is ["mean"].
-    all_touched : TYPE, optional
-        DESCRIPTION. The default is False.
-    method : TYPE, optional
-        DESCRIPTION. The default is "geocube".
-    verbose : TYPE, optional
-        DESCRIPTION. The default is False.
-    raise_missing_geometry : TYPE, optional
-        DESCRIPTION. The default is False.
-
-    Raises
+    method : str
+        "xvec" or "numpy"; anything else raises NotImplementedError. The default is "numpy".
+    smart_load : bool
+        "numpy" only: load batches of dates in memory and loop over them. The default is False.
+    memory : int, optional
+        "numpy" with smart_load only: memory budget in Mo used to size the
+        batches. The default is None (the memory currently available).
+    reducers : list, optional
+        Any np.nan function ("mean" is "np.nanmean"). The default is ['mean'].
+    all_touched : bool, optional
+        Forwarded to the rasterization of ``geoms``. The default is True.
+
+    Yields
     ------
-    ValueError
-        DESCRIPTION.
-    NotImplementedError
-        DESCRIPTION.
-
-    Returns
-    -------
-    TYPE
+    zs : xr.Dataset
         DESCRIPTION.
 
     """
-
-    if method == "geocube":
-        from geocube.api.core import make_geocube
-        from geocube.rasterize import rasterize_image
-
-        def custom_rasterize_image(all_touched=all_touched, **kwargs):
-            return rasterize_image(all_touched=all_touched, **kwargs)
-
-        gdf["tmp_index"] = np.arange(gdf.shape[0])
-        out_grid = make_geocube(
-            gdf,
-            measurements=["tmp_index"],
-            like=dataset,  # ensure the data are on the same grid
-            rasterize_function=custom_rasterize_image,
-        )
-        cube = dataset.groupby(out_grid.tmp_index)
-        zonal_stats = xr.concat(
-            [getattr(cube, operation)() for operation in operations], dim="stats"
-        )
-        zonal_stats["stats"] = operations
-
-        if zonal_stats["tmp_index"].size != gdf.shape[0]:
-            index_list = [
-                gdf.index[i] for i in zonal_stats["tmp_index"].values.astype(np.int16)
-            ]
-            if raise_missing_geometry:
-                diff = gdf.shape[0] - len(index_list)
-                raise ValueError(
-                    f'{diff} geometr{"y is" if diff==1 else "ies are"} missing in the zonal stats. This can be due to too small geometries, duplicated...'
-                )
+    # Each batch of dates is reduced and yielded on its own; the caller concatenates them on "time".
+    def _loop_time_chunks(dataset, method, smart_load, time_chunks):
+        logging.debug(f"Batching every {time_chunks} dates ({np.ceil(dataset.time.size/time_chunks).astype(int)} loops).")
+        for time_idx in tqdm.trange(0,dataset.time.size,time_chunks):
+            isel_time = np.arange(time_idx,np.min((time_idx+time_chunks,dataset.time.size)))
+            ds = dataset.copy().isel(time=isel_time)
+            if smart_load:
+                t0 = time.time()
+                ds = ds.load()
+                logging.debug(f'Subdataset of {ds.time.size} dates loaded in memory in {(time.time()-t0):0.2f}s.')
+            t0 = time.time()
+            # positions and reducers are read from the enclosing zonal_stats scope.
+            zs = _zonal_stats_numpy(ds,
+                                   positions,
+                                   reducers)
+            zs = zs.load()
+            del ds
+            logging.debug(f'Zonal stats computed in {(time.time()-t0):0.2f}s.')
+            yield zs
+    # Crop to the bounding box of the geometries (in the dataset CRS) before any computation.
+    t_start = time.time()
+    dataset = dataset.rio.clip_box(*geoms.to_crs(dataset.rio.crs).total_bounds)
+    if method == 'numpy':
+        feats, yx_pos = _rasterize(geoms, dataset, all_touched=all_touched)
+        positions = [np.asarray(yx_pos[i + 1]) for i in range(geoms.shape[0]) if i + 1 < len(yx_pos)]  # labels past the last rasterized one have no entry
+        positions = [position for position in positions if position.size>0]  # drop geometries covering no pixel
+        del feats,yx_pos
+        if smart_load:
+            time_chunks = _memory_time_chunks(dataset, memory)
+            zs = xr.concat(list(_loop_time_chunks(dataset, method, smart_load, time_chunks)), dim="time")
         else:
-            index_list = list(gdf.index)
-        zonal_stats["tmp_index"] = index_list
-        return zonal_stats.rename(dict(tmp_index="feature"))
-
-    tqdm_bar = tqdm.tqdm(total=gdf.shape[0])
-
-    if dataset.rio.crs != gdf.crs:
-        Warning(
-            f"Different projections. Reproject vector to EPSG:{dataset.rio.crs.to_epsg()}."
-        )
-        gdf = gdf.to_crs(dataset.rio.crs)
-
-    zonal_ds_list = []
-
-    dataset = dataset.rio.clip_box(*gdf.to_crs(dataset.rio.crs).total_bounds)
-
-    if method == "optimized":
-        feats, yx_pos = _rasterize(gdf, dataset, all_touched=all_touched)
-
-        for gdf_idx in tqdm.trange(gdf.shape[0], disable=not verbose):
-            tqdm_bar.update(1)
-            if gdf_idx + 1 >= len(yx_pos):
-                continue
-            yx_pos_idx = yx_pos[gdf_idx + 1]
-            if np.asarray(yx_pos_idx).size == 0:
-                continue
-            datacube_spatial_subset = dataset.isel(
-                x=xr.DataArray(yx_pos_idx[1], dims="xy"),
-                y=xr.DataArray(yx_pos_idx[0], dims="xy"),
-            )
-            del yx_pos_idx
-            zonal_ds_list.append(
-                datacube_time_stats(datacube_spatial_subset, operations).expand_dims(
-                    dim={"feature": [gdf.iloc[gdf_idx].name]}
-                )
-            )
-
-        del yx_pos, feats
-
-    elif method == "standard":
-        for idx_gdb, feat in tqdm.tqdm(
-            gdf.iterrows(), total=gdf.shape[0], disable=not verbose
-        ):
-            tqdm_bar.update(1)
-            if feat.geometry.geom_type == "MultiPolygon":
-                shapes = feat.geometry.geoms
-            else:
-                shapes = [feat.geometry]
-            datacube_spatial_subset = dataset.rio.clip(shapes, all_touched=all_touched)
-
-            zonal_feat = datacube_time_stats(
-                datacube_spatial_subset, operations
-            ).expand_dims(dim={"feature": [feat.name]})
-
-            zonal_ds_list.append(zonal_feat)
-    else:
-        raise NotImplementedError('method available are : "standard" or "optimized"')
-    return xr.concat(zonal_ds_list, dim="feature")
+            zs = _zonal_stats_numpy(dataset,
+                                   positions,
+                                   reducers)
+        zs = zs.assign_coords(zonal_statistics=reducers)
+    elif method == "xvec":
+        import xvec  # imported for the .xvec accessor used on the next line
+        zs = dataset.xvec.zonal_stats(geoms.to_crs(dataset.rio.crs).geometry, y_coords='y',x_coords='x', stats=reducers,
+                                      method="rasterize", all_touched=all_touched)
+    else:
+        raise NotImplementedError(f'method available are : "numpy" or "xvec", got {method!r}.')
+    logging.info(f"Zonal stats method {method} tooks {time.time()-t_start}s.")
+    return zs