Updated docstrings and comments for release

dwest77a · dwest77a · commit 19e0ab366fd5 · 2024-08-23T15:59:54.000+01:00
diff --git a/XarrayActive/active_chunk.py b/XarrayActive/active_chunk.py
@@ -4,7 +4,7 @@
 
 class ActiveOptionsContainer:
     """
-    Container for ActiveOptions properties.
+    Container for ActiveOptions properties. Only for use within XarrayActive.
     """
     @property
     def active_options(self):
@@ -39,19 +39,24 @@ def _set_active_options(self, chunks={}, chunk_limits=True):
         self._active_chunks = chunks
         self._chunk_limits = chunk_limits
 
-# Holds all Active routines.
 class ActiveChunk:
+    """
+    Container class for all Active-required methods to perform on each chunk. 
+    All active-per-chunk content should be found here.
+    """
 
-    description = "Container class for Active routines performed on each chunk. All active-per-chunk content can be found here."
+    description = "Container class for Active routines performed on each chunk."
     
     def _post_process_data(self, data):
-        # Perform any post-processing steps on the data here
+        """
+        Perform any post-processing steps on the data here.
+        """
         return data
 
     def _standard_sum(self, axes=None, skipna=None, **kwargs):
         """
-        Standard Mean routine matches the normal routine for dask, required at this
-        stage if Active mean not available.
+        Standard sum routine, matching the normal routine for dask. Required at this
+        stage if Active mean/sum is not available.
         """
 
         arr = np.array(self)
@@ -62,12 +67,31 @@ def _standard_sum(self, axes=None, skipna=None, **kwargs):
         return total
     
     def _standard_max(self, axes=None, skipna=None, **kwargs):
+        """
+        Standard max routine used if Active is not available; a warning will be given.
+        Kwargs may need to be added here.
+        """
         return np.max(self, axis=axes)
     
     def _standard_min(self, axes=None, skipna=None, **kwargs):
+        """
+        Standard min routine used if Active is not available; a warning will be given.
+        Kwargs may need to be added here.
+        """
         return np.min(self, axis=axes)
 
     def _numel(self, method, axes=None):
+        """
+        Number of elements remaining after a reduction, to allow
+        dask to combine reductions from all different chunks.
+        Example:
+            (2,3,4) chunk reduced along its last dimension. Will
+            give a (2,3) array where each value is 4 - the
+            length of the dimension along which the reduction
+            took place.
+
+        """
+        # Applied reduction across all axes
         if not axes:
             return self.size
         
@@ -98,20 +122,20 @@ def active_method(self, method, axis=None, skipna=None, **kwargs):
             'max' : self._standard_max,
             'min' : self._standard_min
         }
-        ret = None
+        partial = None
         n = self._numel(method, axes=axis)
 
         try:
             from activestorage.active import Active
         except ImportError:
             # Unable to import Active package. Default to using normal mean.
             print("ActiveWarning: Unable to import active module - defaulting to standard method.")
-            ret = {
+            partial = {
                 'n': n,
                 'total': standard_methods[method](axes=axis, skipna=skipna, **kwargs)
             }
 
-        if not ret:
+        if not partial:
             
             # Create Active client
             active = Active(self.filename, self.address)
@@ -131,13 +155,14 @@ def active_method(self, method, axis=None, skipna=None, **kwargs):
                 data   = active[extent]
                 t = self._post_process_data(data) * n
 
-                ret = {
+                partial = {
                     'n': n,
                     'total': t
                 }
 
-        if not ret:
+        if not partial:
             # Experimental Recursive requesting to get each 1D column along the axes being requested.
+            # - May perform very poorly due to many requests for (1,1,X) shapes
             range_recursives = []
             for dim in range(self.ndim):
                 if dim not in axis:
@@ -147,17 +172,21 @@ def active_method(self, method, axis=None, skipna=None, **kwargs):
             results = np.array(self._get_elements(active, range_recursives, hyperslab=[]))
 
             t = self._post_process_data(results) * n
-            ret = {
+            partial = {
                 'n': n,
                 'total': t
             }
 
         if method == 'mean':
-            return ret
+            return partial
         else:
-            return ret['total']/ret['n']
+            return partial['total']/partial['n']
 
     def _get_elements(self, active, recursives, hyperslab=[]):
+        """
+        Recursive function to fetch and arrange the appropriate column slices
+        from Active.
+        """
         dimarray = []
         if not len(recursives) > 0:
 
diff --git a/XarrayActive/active_dask.py b/XarrayActive/active_dask.py
@@ -6,6 +6,8 @@
 
 from .active_chunk import ActiveChunk
 
+## Partition Methods are the first step in the Dask Reductions.
+
 def partition_mean(arr, *args, **kwargs):
     return partition_method(arr, 'mean', *args, **kwargs)
     
@@ -29,6 +31,10 @@ def partition_method(arr, method, *args, **kwargs):
         # Computing meta - dask operation not fully utilised.
         return None
 
+## Combining results from Partition methods
+# - Dask built-in mean-agg and mean-combine for mean.
+# - Min/Max/Sum require simple functions for combine/aggregation.
+
 def general_combine(pairs, axis=None):
     if not isinstance(pairs, list):
         pairs = [pairs]
@@ -49,6 +55,7 @@ class DaskActiveArray(da.Array):
 
     @property
     def is_active(self):
+        # Quick way of distinguishing from Dask Array
         return True
 
     def copy(self):
diff --git a/XarrayActive/active_xarray.py b/XarrayActive/active_xarray.py
@@ -14,53 +14,18 @@ class ActiveDataArray(DataArray):
     # No additional properties
     __slots__ = ()
 
-    def mean(
-        self,
-        *args,
-        **kwargs,
-    ):
-        
-        return self._active_op(
-            dataarray_active_mean,
-            *args,
-            **kwargs,
-        )
+    # Override Xarray DataArray standard functions in favour of Active enabled ones.
+    def mean(self, *args,**kwargs):
+        return self._active_op(dataarray_active_mean, *args, **kwargs)
     
-    def max(
-        self,
-        *args,
-        **kwargs,
-    ):
-        
-        return self._active_op(
-            dataarray_active_max,
-            *args,
-            **kwargs,
-        )
+    def max(self, *args,**kwargs):
+        return self._active_op(dataarray_active_max, *args, **kwargs)
     
-    def min(
-        self,
-        *args,
-        **kwargs,
-    ):
-        
-        return self._active_op(
-            dataarray_active_min,
-            *args,
-            **kwargs,
-        )
+    def min(self, *args,**kwargs):
+        return self._active_op(dataarray_active_min, *args, **kwargs)
     
-    def sum(
-        self,
-        *args,
-        **kwargs,
-    ):
-        
-        return self._active_op(
-            dataarray_active_sum,
-            *args,
-            **kwargs,
-        )
+    def sum(self, *args,**kwargs):
+        return self._active_op(dataarray_active_sum, *args, **kwargs)
     
     def _active_op(
         self,
@@ -72,7 +37,7 @@ def _active_op(
         **kwargs,
     ):
         """
-        Reduce this DataArray's data by applying an operation along some dimension(s).
+        Reduce this DataArray's data by applying an ``active`` operation along some dimension(s).
 
         Parameters
         ----------
@@ -96,21 +61,16 @@ def _active_op(
         Returns
         -------
         reduced : DataArray
-            New DataArray with ``max`` applied to its data and the
+            New DataArray with reduction applied to its data and the
             indicated dimension(s) removed
 
-        See Also
-        --------
-        numpy.max
-        dask.array.max
         """
         return self.reduce(
             op,
             dim=dim,
             skipna=skipna,
             keep_attrs=keep_attrs,
-            **kwargs,
-        )
+            **kwargs)
         
 class ActiveDataset(Dataset):
 
@@ -150,6 +110,7 @@ def _construct_dataarray(self, name):
             fastpath=True
         )
     
+## DataArray methods to apply to the DaskActiveArray
 def dataarray_active_mean(array, *args, **kwargs):
     return dataarray_active_method(array, 'mean', *args, **kwargs)
 
@@ -164,7 +125,7 @@ def dataarray_active_sum(array, *args, **kwargs):
 
 def dataarray_active_method(array: DaskActiveArray, method: str, axis=None, skipna=None, **kwargs):
     """
-    Function provided to dask reduction, activates the ``active_mean`` method of the ``DaskActiveArray``.
+    Function provided to dask reduction, activates the ``active`` methods of the ``DaskActiveArray``.
 
     :param array:       (obj) A DaskActiveArray object which has additional methods enabling Active operations.
 
@@ -183,6 +144,7 @@ def dataarray_active_method(array: DaskActiveArray, method: str, axis=None, skip
         'sum': array.active_sum
     }
 
+    # On failure of the Active method, can use Duck methods instead - normal behaviour.
     duck_methods = {
         'mean': duck_array_ops.mean,
         'max': duck_array_ops.max,
diff --git a/XarrayActive/datastore.py b/XarrayActive/datastore.py
@@ -19,18 +19,28 @@ class ActiveDataStore(NetCDF4DataStore, ActiveOptionsContainer):
 
     def get_variables(self):
         """
+        Override normal store behaviour to allow opening some variables 'actively'
         """
         return FrozenDict(
             (k, self.open_variable(k, v)) for k, v in self.ds.variables.items()
         )
     
     def open_variable(self, name: str, var):
+        """
+        Allow opening some variables 'actively', if they are not a dimension (where
+        you'll want the whole array anyway) and where the active chunks are specified
+        - required by XarrayActive.
+        """
         if name in self.ds.dimensions or not self._active_chunks:
             return self.open_store_variable(name, var)
         else:
             return self.open_active_variable(name, var)
 
     def open_active_variable(self, name: str, var):
+        """
+        Utilise the ActiveArrayWrapper builder to obtain the data
+        lazily for this variable so active methods can be applied later.
+        """
         import netCDF4
 
         dimensions = var.dimensions
@@ -52,6 +62,7 @@ def open_active_variable(self, name: str, var):
             )
         )
         
+        # Everything after this point is normal store behaviour
         encoding   = {}
 
         if isinstance(var.datatype, netCDF4.EnumType):
diff --git a/XarrayActive/wrappers.py b/XarrayActive/wrappers.py
@@ -6,11 +6,9 @@
     get_chunk_positions,
     get_chunk_extent,
     get_dask_chunks,
-    combine_slices,
-    normalize_partition_chunks
+    combine_slices
 )
 from .active_chunk import (
-    ActiveChunk, 
     ActiveOptionsContainer
 )
 
@@ -23,8 +21,8 @@
 
 class ActivePartition(ArrayPartition):
     """
-    Combines ActiveChunk - active methods, and ArrayPartition - array methods
-    into a single ChunkWrapper class. 
+    Container for future ActivePartition behaviour. This class may not be needed
+    unless additional behaviour is added.
     """
     def copy(self, extent=None):
 
@@ -66,6 +64,7 @@ def __init__(
         super().__init__(shape, units=units, dtype=dtype)
 
         # Further work required to get this to work - 23/08/24
+
         #self._active_chunks = normalize_partition_chunks(
         #    self._active_chunks,
         #    self.shape,
@@ -102,7 +101,7 @@ def __array__(self, *args, **kwargs):
             return self._variable
         else:
 
-            # for every dask chunk return a smaller object with the right extent.
+            # For every dask chunk return a smaller object with the right extent.
             # Create a chunk_shape tuple from chunks and _variable (figure out which chunk and which axis, divide etc.)
             # Define a subarray for each chunk, with appropriate index.