From 4c68ca0fafce1f03677515e5b48ddb395e30561a Mon Sep 17 00:00:00 2001 From: iosefa Date: Sun, 16 Mar 2025 20:08:09 -1000 Subject: [PATCH 1/3] Add optional figure title parameter to plot_2d function This update introduces the `fig_title` parameter to the `plot_2d` function, allowing custom titles for visualizations. If not provided, a default title based on x and y dimensions is generated. This enhances flexibility and improves customization in plots. --- pyforestscan/visualize.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pyforestscan/visualize.py b/pyforestscan/visualize.py index 676502d..630d17d 100644 --- a/pyforestscan/visualize.py +++ b/pyforestscan/visualize.py @@ -2,7 +2,7 @@ import numpy as np -def plot_2d(points, x_dim='X', y_dim='Z', color_map='viridis', alpha=1.0, point_size=1, fig_size=None): +def plot_2d(points, x_dim='X', y_dim='Z', color_map='viridis', alpha=1.0, point_size=1, fig_size=None, fig_title=None): """ Plots a 2D scatter plot from a set of points with customizable dimensions, colors, and sizes. @@ -41,12 +41,15 @@ def plot_2d(points, x_dim='X', y_dim='Z', color_map='viridis', alpha=1.0, point_ scale_factor = max_fig_size / max(fig_size) fig_size = (fig_size[0] * scale_factor, fig_size[1] * scale_factor) + if fig_title is None: + fig_title = f'{x_dim} vs {y_dim} Colored by Height Above Ground' + plt.figure(figsize=fig_size) plt.scatter(x, y, c=colors, cmap=color_map, alpha=alpha, s=point_size) plt.xlabel(x_dim) plt.ylabel(y_dim) - plt.title(f'{x_dim} vs {y_dim} Colored by Height Above Ground') + plt.title(fig_title) plt.colorbar(label='Height Above Ground (m)') plt.show() From 6c852a900c609d0e07a7ca4fb7e2c9bb57a52ebf Mon Sep 17 00:00:00 2001 From: iosefa Date: Sat, 22 Mar 2025 20:26:10 -1000 Subject: [PATCH 2/3] Refactor plotting functions for enhanced flexibility. Improved `plot_2d` to support slicing and customizable colorbars, while enhancing `plot_pad` with collapsing and dynamic titles. Added parameter validation and detailed documentation for better usability and clarity. --- pyforestscan/visualize.py | 194 +++++++++++++++++++++++++------------- 1 file changed, 128 insertions(+), 66 deletions(-) diff --git a/pyforestscan/visualize.py b/pyforestscan/visualize.py index 630d17d..fd7b6c2 100644 --- a/pyforestscan/visualize.py +++ b/pyforestscan/visualize.py @@ -2,47 +2,75 @@ import numpy as np -def plot_2d(points, x_dim='X', y_dim='Z', color_map='viridis', alpha=1.0, point_size=1, fig_size=None, fig_title=None): +def plot_2d(points, x_dim='X', + y_dim='Z', color_by='HeightAboveGround', + color_map='viridis', colorbar_label=None, + alpha=1.0, point_size=1, + fig_size=None, fig_title=None, + slice_dim=None, slice_val=0.0, + slice_tolerance=5 +): """ - Plots a 2D scatter plot from a set of points with customizable dimensions, colors, and sizes. - - :param points: A DataFrame containing point data with dimensions and 'HeightAboveGround'. - :type points: pandas.DataFrame - :param x_dim: The dimension for the X-axis ('X', 'Y', 'Z', or 'HeightAboveGround'). - :type x_dim: str - :param y_dim: The dimension for the Y-axis ('X', 'Y', 'Z', or 'HeightAboveGround'). - :type y_dim: str - :param color_map: The colormap used to color the points according to 'HeightAboveGround'. - :type color_map: str - :param alpha: The transparency level of the points. - :type alpha: float - :param point_size: The size of each point in the scatter plot. - :type point_size: float - :param fig_size: The size of the figure as a tuple (width, height). 
If None, it is computed based on the aspect ratio. - :type fig_size: tuple or None - :return: None - :rtype: None - :raises ValueError: If the provided dimensions are not valid. + Plots a 2D scatter plot of data points with customizable axes, coloring, and display settings. + + Args: + points (np.ndarray): A structured array with named columns like 'X','Y','Z','Classification', etc. + x_dim (str): The column name for the horizontal axis. Defaults to 'X'. + y_dim (str): The column name for the vertical axis. Defaults to 'Z'. + color_by (str): The column name used to color the points. Defaults to 'HeightAboveGround'. + If 'HeightAboveGround' doesn't exist, pick e.g. 'Classification' or another column. + color_map (str): A valid matplotlib colormap name. Defaults to 'viridis'. + colorbar_label (str): Label for the colorbar. If None, uses color_by. + alpha (float): Transparency of the points (0 to 1). Defaults to 1.0 (opaque). + point_size (int): Size of the scatter points. Defaults to 1. + fig_size (tuple): (width, height) in inches. If None, auto-derive from data aspect ratio. + fig_title (str): Title of the plot. If None, auto-generated. + slice_dim (str): If provided, dimension to "fix" at a slice_val (e.g. 'Y'). + slice_val (float): The coordinate value at which to slice the slice_dim dimension. + slice_tolerance (float): Allowed absolute difference when matching slice_val. + (Helps with floating-point comparisons.) + + Returns: + None + + Raises: + ValueError: If the `x_dim` or `y_dim` parameter is not one of ['X', 'Y', 'Z', + 'HeightAboveGround']. """ valid_dims = ['X', 'Y', 'Z', 'HeightAboveGround'] if x_dim not in valid_dims or y_dim not in valid_dims: raise ValueError(f"Invalid dimensions. Choose from: {valid_dims}") + required_dims = [x_dim, y_dim, color_by] + if slice_dim is not None: + required_dims.append(slice_dim) + + for dim in required_dims: + if dim not in points.dtype.names: + raise ValueError(f"'{dim}' not found in array dtype names: {points.dtype.names}") + + if slice_dim and slice_dim in points.dtype.names: + mask = np.isclose(points[slice_dim], slice_val, atol=slice_tolerance, rtol=0) + points = points[mask] + + if colorbar_label is None: + colorbar_label = color_by + x = points[x_dim] y = points[y_dim] - colors = points['HeightAboveGround'] + colors = points[color_by] if fig_size is None: aspect_ratio = (np.max(x) - np.min(x)) / (np.max(y) - np.min(y)) fig_size = (10 * aspect_ratio, 10) - max_fig_size = 20 # inches + max_fig_size = 20 if max(fig_size) > max_fig_size: scale_factor = max_fig_size / max(fig_size) fig_size = (fig_size[0] * scale_factor, fig_size[1] * scale_factor) if fig_title is None: - fig_title = f'{x_dim} vs {y_dim} Colored by Height Above Ground' + fig_title = f'{x_dim} vs {y_dim} Colored by {color_by}' plt.figure(figsize=fig_size) @@ -50,9 +78,10 @@ def plot_2d(points, x_dim='X', y_dim='Z', color_map='viridis', alpha=1.0, point_ plt.xlabel(x_dim) plt.ylabel(y_dim) plt.title(fig_title) - plt.colorbar(label='Height Above Ground (m)') + plt.colorbar(label=colorbar_label) plt.show() + def plot_metric(title, metric, extent, metric_name=None, cmap='viridis', fig_size=None): """ Plots a given metric using the provided data and configuration. 
@@ -96,57 +125,85 @@ def plot_metric(title, metric, extent, metric_name=None, cmap='viridis', fig_siz plt.ylabel('Y') plt.show() -def plot_pad(pad, slice_index, axis='x', cmap='viridis', hag_values=None, horizontal_values=None): + +def plot_pad(pad, slice_index=None, axis='x', cmap='viridis', + hag_values=None, horizontal_values=None, title=None): """ - Plots a 2D slice of Plant Area Density (PAD) data with dZ HAG on the Y-axis. - - :param pad: numpy.ndarray - The 3D Plant Area Density data with shape (X, Y, HAG). - :param slice_index: int - The index of the slice to be plotted along the specified axis. - :param axis: str, optional - The axis along which to slice the data. Default is 'x'. Choose from 'x', 'y'. - :param cmap: str, optional - The colormap to be used for plotting. Default is 'viridis'. - :param hag_values: numpy.ndarray, optional - Array of HAG (Height Above Ground) values corresponding to the third dimension. - If None, will use indices. - :param horizontal_values: numpy.ndarray, optional - Array of horizontal (X or Y) values corresponding to the first or second dimension. - If None, will use indices. - :raises ValueError: If an invalid axis is provided or slice_index is out of bounds. - :return: None - :rtype: None + Plots the plant area density (PAD) data as a 2D image visualization. + + This function projects or slices 3D PAD data based on the specified axis and + optional slice index. It visualizes the density using a colormap and displays + the data with respect to the specified coordinates and labels. + + Args: + pad: A 3D numpy array representing PAD values (shape: dZ x Y x X). + slice_index: Optional; An integer specifying the index of the 2D slice + along the given axis. If None, the PAD data will be collapsed along + the axis using the maximum value. + axis: A string specifying the axis ('x' or 'y') to use for slicing + or projection. Defaults to 'x'. + cmap: A string specifying the colormap to use for visualization. + Defaults to 'viridis'. + hag_values: Optional; A 1D numpy array representing height above ground + (dZ) values. If None, an array from 0 to the size of the dZ axis + will be used. + horizontal_values: Optional; A 1D numpy array representing horizontal + axis values (X or Y, depending on `axis`). If None, an array from + 0 to the corresponding size of the horizontal axis will be used. + title: Optional; A string specifying the title of the plot. If None, an + appropriate default title will be generated based on the input + parameters. + + Returns: + None. The function visualizes the PAD data using matplotlib. + + Raises: + ValueError: If `axis` is not 'x' or 'y'. + ValueError: If `slice_index` is out of the valid range for the specified + axis. + ValueError: If the length of `horizontal_values` does not match the + dimension of the specified axis. """ - # Validate the axis parameter + # Validate axis if axis not in ['x', 'y']: raise ValueError(f"Invalid axis: '{axis}'. 
Choose from 'x' or 'y'.") + hag_values = hag_values if hag_values is not None else np.arange(pad.shape[2]) + if axis == 'x': - if slice_index < 0 or slice_index >= pad.shape[0]: - raise ValueError(f"slice_index {slice_index} out of range for axis 'x' with size {pad.shape[0]}") - pad_2d = pad[slice_index, :, :] - horizontal_axis_label = 'Y' - horizontal_axis_values = horizontal_values if horizontal_values is not None else np.arange(pad.shape[1]) - else: # axis == 'y' - if slice_index < 0 or slice_index >= pad.shape[1]: - raise ValueError(f"slice_index {slice_index} out of range for axis 'y' with size {pad.shape[1]}") - pad_2d = pad[:, slice_index, :] - horizontal_axis_label = 'X' - horizontal_axis_values = horizontal_values if horizontal_values is not None else np.arange(pad.shape[0]) + if slice_index is None: + pad_2d = pad.max(axis=0) + horizontal_axis_label = 'Y' + horizontal_count = pad.shape[1] + if horizontal_values is not None and len(horizontal_values) != horizontal_count: + raise ValueError("Length of horizontal_values does not match the Y dimension of pad.") + horizontal_axis_values = horizontal_values if horizontal_values is not None else np.arange(horizontal_count) + + else: + if slice_index < 0 or slice_index >= pad.shape[0]: + raise ValueError(f"slice_index {slice_index} out of range for axis 'x' with size {pad.shape[0]}") + pad_2d = pad[slice_index, :, :] + horizontal_axis_label = 'Y' + horizontal_axis_values = horizontal_values if horizontal_values is not None else np.arange(pad.shape[1]) - hag_values = hag_values if hag_values is not None else np.arange(pad.shape[2]) + else: # axis == 'y' + if slice_index is None: + pad_2d = np.nanmax(pad, axis=1) + horizontal_axis_label = 'X' + horizontal_count = pad.shape[0] + if horizontal_values is not None and len(horizontal_values) != horizontal_count: + raise ValueError("Length of horizontal_values does not match the X dimension of pad.") + horizontal_axis_values = horizontal_values if horizontal_values is not None else np.arange(horizontal_count) + + else: + if slice_index < 0 or slice_index >= pad.shape[1]: + raise ValueError(f"slice_index {slice_index} out of range for axis 'y' with size {pad.shape[1]}") + pad_2d = pad[:, slice_index, :] + horizontal_axis_label = 'X' + horizontal_axis_values = horizontal_values if horizontal_values is not None else np.arange(pad.shape[0]) pad_2d = pad_2d.T - if horizontal_values is not None: - if axis == 'x' and len(horizontal_values) != pad.shape[1]: - raise ValueError("Length of horizontal_values does not match the Y dimension of pad.") - if axis == 'y' and len(horizontal_values) != pad.shape[0]: - raise ValueError("Length of horizontal_values does not match the X dimension of pad.") - else: - horizontal_axis_values = np.arange(pad.shape[1]) if axis == 'x' else np.arange(pad.shape[0]) - plt.figure(figsize=(10, 6)) img = plt.imshow( pad_2d, @@ -161,7 +218,12 @@ def plot_pad(pad, slice_index, axis='x', cmap='viridis', hag_values=None, horizo aspect='auto' ) plt.colorbar(img, label='PAD') - plt.title(f'Plant Area Density (PAD) - {horizontal_axis_label} vs dZ at Slice {slice_index}') + if title is None: + if slice_index is None: + title = f'Plant Area Density (PAD) - Collapsed by max along axis {axis}' + else: + title = f'Plant Area Density (PAD) - {horizontal_axis_label} vs dZ at slice {slice_index}' + plt.title(title) plt.xlabel(horizontal_axis_label) plt.ylabel('dZ') plt.tight_layout() From f59831c63ca472ca21b81daecb12eff607037232 Mon Sep 17 00:00:00 2001 From: iosefa Date: Sat, 22 Mar 2025 
20:26:47 -1000 Subject: [PATCH 3/3] Handle NaN values in PAD and PAI calculations Updated PAD calculation to set columns with zero returns to NaN and adjusted PAI to handle NaN values properly. Modified tests to reflect these changes and ensure consistency. Removed redundant content from the example notebook for clarity and brevity. --- ...rted-importing-preprocessing-dtm-chm.ipynb | 18 +++---- pyforestscan/calculate.py | 17 +++++-- pyforestscan/filters.py | 49 +++++++++++++------ tests/test_calculate.py | 4 +- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/docs/examples/getting-started-importing-preprocessing-dtm-chm.ipynb b/docs/examples/getting-started-importing-preprocessing-dtm-chm.ipynb index bcaa467..7bd8613 100644 --- a/docs/examples/getting-started-importing-preprocessing-dtm-chm.ipynb +++ b/docs/examples/getting-started-importing-preprocessing-dtm-chm.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2024-09-20T09:26:40.870714Z", @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -138,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -172,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -207,7 +207,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.7" + "version": "3.12.9" } }, "nbformat": 4, diff --git a/pyforestscan/calculate.py b/pyforestscan/calculate.py index 76f7ee6..1d2918a 100644 --- a/pyforestscan/calculate.py +++ b/pyforestscan/calculate.py @@ -100,11 +100,13 @@ def calculate_pad(voxel_returns, voxel_height, beer_lambert_constant=None): """ Calculate the Plant Area Density (PAD) using the Beer-Lambert Law. - :param voxel_returns: numpy array representing the returns from each voxel (x, y, z). + :param voxel_returns: numpy array of shape (X, Y, Z) representing + the returns in each voxel column. :param voxel_height: float, height of each voxel. - :param beer_lambert_constant: Optional Beer-Lambert constant, defaults to 1 if not provided. + :param beer_lambert_constant: optional float. If not provided, defaults to 1. - :return: numpy array containing PAD values for each voxel. + :return: A numpy array containing PAD values for each voxel (same shape as voxel_returns). + Columns that have zero returns across all Z are set to NaN. 
""" shots_in = np.cumsum(voxel_returns[::-1], axis=2)[::-1] shots_through = shots_in - voxel_returns @@ -117,6 +119,9 @@ def calculate_pad(voxel_returns, voxel_height, beer_lambert_constant=None): pad = np.log(division_result) * (1 / (beer_lambert_constant or 1) / voxel_height) pad = np.where(np.isfinite(pad) & (pad > 0), pad, 0) + sum_across_z = np.sum(voxel_returns, axis=2, keepdims=True) + empty_mask = (sum_across_z == 0) + pad = np.where(empty_mask, np.nan, pad) return pad @@ -136,7 +141,11 @@ def calculate_pai(pad, min_height=1, max_height=None): if min_height >= max_height: raise ValueError("Minimum height index must be less than maximum height index.") - pai = np.nansum(pad[:, :, min_height:max_height], axis=2) + slice_3d = pad[:, :, min_height:max_height] + + sum_vals = np.nansum(slice_3d, axis=2) + count_non_nan = np.sum(~np.isnan(slice_3d), axis=2) + pai = np.where(count_non_nan == 0, np.nan, sum_vals) return pai diff --git a/pyforestscan/filters.py b/pyforestscan/filters.py index eb48890..64adfa5 100644 --- a/pyforestscan/filters.py +++ b/pyforestscan/filters.py @@ -47,22 +47,31 @@ def filter_select_ground(arrays): return pipeline.arrays -def remove_outliers_and_clean(arrays, mean_k=8, multiplier=3.0): +def remove_outliers_and_clean(arrays, mean_k=8, multiplier=3.0, remove=False): """ - Removes statistical outliers from the point cloud to enhance data quality. - - :param arrays: list - List of point cloud arrays obtained from read_lidar. - :type point_cloud_arrays: list - :param mean_k: int, number of nearest neighbors for outlier removal. - :param multiplier: float, standard deviation multiplier for outlier removal. - :return: list - Cleaned array of point cloud data without outliers. - :rtype: list - :raises RuntimeError: - If there's an error during the pipeline execution. - :raises ValueError: - If no data is returned after outlier removal. + Processes input arrays by removing statistical outliers and optionally cleans + the data, filtering out specific classifications. + By default, this function only labels outliers as "7" (outlier). + + # todo: this function will be renamed in a future release. + + Args: + arrays: The input arrays to process, typically a list of structured arrays or + similar data types. + mean_k (int, optional): The number of neighbors to analyze for each point for + statistical outlier removal. Default is 8. + multiplier (float, optional): The multiplication factor for the standard deviation + threshold to classify an outlier. Default is 3.0. + remove (bool, optional): If True, additional cleaning is performed to remove points + specifically classified as outliers. Defaults to False. + + Returns: + list: A list of processed arrays after outlier removal, and optionally after + cleaning, if specified. + + Raises: + RuntimeError: If the outlier removal pipeline fails during execution. + ValueError: If no data is returned after applying outlier removal. 
""" pipeline_stages = [ _filter_statistical_outlier(mean_k=mean_k, multiplier=multiplier) @@ -73,7 +82,15 @@ def remove_outliers_and_clean(arrays, mean_k=8, multiplier=3.0): except RuntimeError as e: raise RuntimeError(f"Outlier Removal Pipeline Failed: {e}") - processed_arrays = pipeline.arrays + if remove: + clean_arrays = [] + for arr in pipeline.arrays: + mask = (arr["Classification"] != 7) + cleaned = arr[mask] + clean_arrays.append(cleaned) + processed_arrays = clean_arrays + else: + processed_arrays = pipeline.arrays if not processed_arrays: raise ValueError("No data returned after outlier removal.") diff --git a/tests/test_calculate.py b/tests/test_calculate.py index dbeda26..91024c6 100644 --- a/tests/test_calculate.py +++ b/tests/test_calculate.py @@ -140,8 +140,8 @@ def test_calculate_pad_with_zero_shots(): pad = calculate_pad(voxel_returns, voxel_height) assert isinstance(pad, np.ndarray) assert pad.shape == voxel_returns.shape - # PAD should be zero where shots_through == 0 - assert np.all(pad == 0) + # PAD should be nan where shots_through == 0 + assert np.all(np.isnan(pad)) def test_calculate_pad_with_inf_nan_values():