closes #147, Added support for splitting with function and sklearn estimator. BREAKING CHANGE: renamed apply_partition to split_by_partition for method name consistency.

elphick · elphick · commit eb0c68ba05eb · 2024-05-16T08:02:47.000+08:00
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,3 +1,13 @@
+Mass_Composition 0.6.0 (2024-05-16)
+===================================
+
+Feature
+-------
+
+- Added support for splitting with a function or an sklearn estimator.
+- BREAKING CHANGE: renamed apply_partition to split_by_partition for method name consistency; its keyword argument definition is now partition_definition. (#147)
+
+
 Mass_Composition 0.5.2 (2024-05-16)
 ===================================
 
diff --git a/elphick/mass_composition/datasets/sample_data.py b/elphick/mass_composition/datasets/sample_data.py
@@ -115,7 +115,7 @@ def size_by_assay_2() -> pd.DataFrame:
     """
     mc_size: MassComposition = MassComposition(size_by_assay(), name='feed')
     partition = partial(napier_munn, d50=0.150, ep=0.1, dim='size')
-    mc_coarse, mc_fine = mc_size.apply_partition(definition=partition, name_1='coarse', name_2='fine')
+    mc_coarse, mc_fine = mc_size.split_by_partition(partition_definition=partition, name_1='coarse', name_2='fine')
     fs: Flowsheet = Flowsheet().from_streams([mc_size, mc_coarse, mc_fine])
     return fs.to_dataframe()
 
@@ -125,7 +125,7 @@ def size_by_assay_3() -> pd.DataFrame:
     """
     mc_size: MassComposition = MassComposition(size_by_assay(), name='feed')
     partition = partial(napier_munn, d50=0.150, ep=0.1, dim='size')
-    mc_coarse, mc_fine = mc_size.apply_partition(definition=partition, name_1='coarse', name_2='fine')
+    mc_coarse, mc_fine = mc_size.split_by_partition(partition_definition=partition, name_1='coarse', name_2='fine')
     # add error to the coarse stream to create an imbalance
     df_coarse_2 = mc_coarse.data.to_dataframe().apply(lambda x: np.random.normal(loc=x, scale=np.std(x)))
     mc_coarse_2: MassComposition = MassComposition(data=df_coarse_2, name='coarse')
@@ -156,7 +156,7 @@ def iron_ore_met_sample_data() -> pd.DataFrame:
 def demo_size_network() -> Flowsheet:
     mc_size: MassComposition = MassComposition(size_by_assay(), name='size sample')
     partition = partial(perfect, d50=0.150, dim='size')
-    mc_coarse, mc_fine = mc_size.apply_partition(definition=partition)
+    mc_coarse, mc_fine = mc_size.split_by_partition(partition_definition=partition)
     mc_coarse.name = 'coarse'
     mc_fine.name = 'fine'
     fs: Flowsheet = Flowsheet().from_streams([mc_size, mc_coarse, mc_fine])
diff --git a/elphick/mass_composition/mass_composition.py b/elphick/mass_composition/mass_composition.py
@@ -495,6 +495,8 @@ def split(self,
 
         A simple mass split maintaining the same composition
 
+        See also: split_by_partition, split_by_function, split_by_estimator
+
         Args:
             fraction: A constant in the range [0.0, 1.0]
             name_1: The name of the reference stream created by the split
@@ -515,18 +517,18 @@ def split(self,
 
         return out, comp
 
-    def apply_partition(self,
-                        definition: Callable,
-                        name_1: Optional[str] = None,
-                        name_2: Optional[str] = None) -> Tuple['MassComposition', 'MassComposition']:
+    def split_by_partition(self,
+                           partition_definition: Callable,
+                           name_1: Optional[str] = None,
+                           name_2: Optional[str] = None) -> Tuple['MassComposition', 'MassComposition']:
         """Partition the object along a given dimension.
 
         This method applies the defined separation resulting in two new objects.
 
-        See also: split
+        See also: split, split_by_function, split_by_estimator
 
         Args:
-            definition: A partition function that defines the efficiency of separation along a dimension
+            partition_definition: A partition function that defines the efficiency of separation along a dimension
             name_1: The name of the reference stream created by the split
             name_2: The name of the complement stream created by the split
 
@@ -536,7 +538,7 @@ def apply_partition(self,
         out = deepcopy(self)
         comp = deepcopy(self)
 
-        xr_ds_1, xr_ds_2 = self._data.mc.apply_partition(definition=definition)
+        xr_ds_1, xr_ds_2 = self._data.mc.split_by_partition(partition_definition=partition_definition)
 
         out._data = xr_ds_1
         comp._data = xr_ds_2
@@ -545,6 +547,67 @@ def apply_partition(self,
 
         return out, comp
 
+    def split_by_function(self,
+                          split_function: Callable,
+                          name_1: Optional[str] = None,
+                          name_2: Optional[str] = None) -> Tuple['MassComposition', 'MassComposition']:
+        """Split an object using a function.
+
+        This method applies the function to self, resulting in two new objects. The object returned with name_1
+        is the result of the function.  The object returned with name_2 is the complement.
+
+        See also: split, split_by_estimator, split_by_partition
+
+        Args:
+            split_function: Any function that transforms the dataframe from a MassComposition object into a new
+             dataframe with values representing a new (output) stream.  The returned dataframe structure must be
+             identical to the input dataframe.
+            name_1: The name of the stream created by the function
+            name_2: The name of the complement stream created by the split, which is calculated automatically.
+
+        Returns:
+            tuple of two MassComposition objects: the result of the function and its complement
+        """
+        out_data: pd.DataFrame = split_function(self.data.to_dataframe())
+        out: MassComposition = MassComposition(name=name_1, constraints=self.constraints, data=out_data)
+        comp: MassComposition = self.sub(other=out, name=name_2)
+
+        self._post_process_split(out, comp, name_1, name_2)
+
+        return out, comp
+
+    def split_by_estimator(self,
+                           estimator: 'sklearn.base.BaseEstimator',
+                           name_1: Optional[str] = None,
+                           name_2: Optional[str] = None) -> Tuple['MassComposition', 'MassComposition']:
+        """Split an object using a sklearn estimator.
+
+        This method applies the estimator to self, resulting in two new objects. The object returned with name_1
+        is the result of the estimator.predict() method.  The object returned with name_2 is the complement.
+
+        See also: split, split_by_function, split_by_partition
+
+        Args:
+            estimator: Any sklearn estimator whose predict method maps the dataframe from a MassComposition object
+             to values representing a new (output) stream.  The prediction may be a dataframe or a numpy array,
+             and must have the same structure (rows and columns) as the input dataframe.
+            name_1: The name of the stream created by the estimator.
+            name_2: The name of the complement stream created by the split, which is calculated automatically.
+
+        Returns:
+            tuple of two MassComposition objects: the result of estimator.predict() and its complement
+        """
+        out_data: Union[pd.DataFrame, np.ndarray] = estimator.predict(self.data.to_dataframe())
+        if isinstance(out_data, np.ndarray):
+            out_data = pd.DataFrame(out_data, index=self.data.to_dataframe().index,
+                                    columns=self.data.to_dataframe().columns)
+        out: MassComposition = MassComposition(name=name_1, constraints=self.constraints, data=out_data)
+        comp: MassComposition = self.sub(other=out, name=name_2)
+
+        self._post_process_split(out, comp, name_1, name_2)
+
+        return out, comp
+
     def calculate_partition(self, ref: 'MassComposition') -> pd.DataFrame:
         """Calculate the partition of the ref stream relative to self"""
         self._check_one_dim_interval()
@@ -578,7 +641,7 @@ def resample_1d(self, interval_edges: Union[Iterable, int],
                                                             include_original_edges=include_original_edges)
 
         obj: MassComposition = MassComposition(df_upsampled, name=self.name)
-        obj.nodes = self._nodes
+        obj._nodes = self._nodes
         obj.constraints = self.constraints
         return obj
 
@@ -991,7 +1054,7 @@ def __sub__(self, other: 'MassComposition') -> 'MassComposition':
         res: MassComposition = MassComposition(name=xr_sub.mc.name, constraints=self.constraints)
         res.set_data(data=xr_sub, constraints=self.constraints)
 
-        res.nodes = [self._nodes[1], random_int()]
+        res._nodes = [self._nodes[1], random_int()]
         return res
 
     def __truediv__(self, other: 'MassComposition') -> 'MassComposition':
@@ -1050,6 +1113,7 @@ def _post_process_split(self, obj_1, obj_2, name_1, name_2):
         obj_2._nodes = [self._nodes[1], random_int()]
         obj_1._name = name_1
         obj_2._name = name_2
+
         return obj_1, obj_2
 
     def _intervals_to_columns(self, interval_index: pd.IntervalIndex) -> pd.DataFrame:
diff --git a/elphick/mass_composition/mc_xarray.py b/elphick/mass_composition/mc_xarray.py
@@ -269,15 +269,15 @@ def split(self, fraction: float) -> Tuple[xr.Dataset, xr.Dataset]:
 
         return out._obj, comp._obj
 
-    def apply_partition(self, definition: Callable) -> Tuple[xr.Dataset, xr.Dataset]:
+    def split_by_partition(self, partition_definition: Callable) -> Tuple[xr.Dataset, xr.Dataset]:
         """Partition the object along a given dimension.
 
         This method applies the defined partition resulting in two new objects.
 
         See also: split
 
         Args:
-            definition: A partition function that defines the efficiency of separation along a dimension
+            partition_definition: A partition function that defines the efficiency of separation along a dimension
 
         Returns:
             tuple of two datasets, the first defined by the function, the other the complement
@@ -288,13 +288,13 @@ def apply_partition(self, definition: Callable) -> Tuple[xr.Dataset, xr.Dataset]
         out = deepcopy(self)
         comp = deepcopy(self)
 
-        if not isinstance(definition, Callable):
+        if not isinstance(partition_definition, Callable):
             raise TypeError("The definition is not a callable function")
-        if 'dim' not in definition.keywords.keys():
+        if 'dim' not in partition_definition.keywords.keys():
             raise NotImplementedError("The callable function passed does not have a dim")
 
-        dim = definition.keywords['dim']
-        definition.keywords.pop('dim')
+        dim = partition_definition.keywords['dim']
+        partition_definition.keywords.pop('dim')
         if isinstance(self._obj[dim].data[0], pd.Interval):
             if dim == 'size':
                 x = mean_size(pd.arrays.IntervalArray(self._obj[dim].data))
@@ -306,7 +306,7 @@ def apply_partition(self, definition: Callable) -> Tuple[xr.Dataset, xr.Dataset]
                                  'not an interval. This is not typical usage.  It is assumed that the '
                                  'dimension data represents the centre/mean, and not an edge like '
                                  'retained or passing.')
-        pn = definition(x)
+        pn = partition_definition(x)
         if not ((dim in self._obj.dims) and (len(self._obj.dims) == 1)):
             # TODO: Set the dim to match the partition if it does not already
             # obj_mass = obj_mass.swap_dims(dim=)
diff --git a/elphick/mass_composition/stream.py b/elphick/mass_composition/stream.py
@@ -1,4 +1,4 @@
-from typing import Optional, Callable
+from typing import Optional, Callable, Generator
 
 from elphick.mass_composition import MassComposition
 
@@ -47,17 +47,18 @@ def split(self, fraction: float,
         mc1, mc2 = super().split(fraction, name_1, name_2)
         return Stream.from_mass_composition(mc1), Stream.from_mass_composition(mc2)
 
-    def apply_partition(self, definition: Callable,
-                        name_1: Optional[str] = None, name_2: Optional[str] = None) -> tuple['Stream', 'Stream']:
+    def split_by_partition(self, partition_definition: Callable,
+                           name_1: Optional[str] = None, name_2: Optional[str] = None) -> Generator[
+        'Stream', None, None]:
         """
         Partition the object along a given dimension.
 
         This method applies the defined separation resulting in two new objects.
 
-        See also: split
+        See also: split, split_by_function, split_by_estimator
 
         Args:
-            definition: A partition function that defines the efficiency of separation along a dimension
+            partition_definition: A partition function that defines the efficiency of separation along a dimension
             name_1: The name of the reference stream created by the split
             name_2: The name of the complement stream created by the split
 
@@ -66,8 +67,56 @@ def apply_partition(self, definition: Callable,
 
 
         """
-        mcs = super().apply_partition(definition, name_1, name_2)
-        return (Stream.from_mass_composition(mc) for mc in mcs)
+        streams = super().split_by_partition(partition_definition, name_1, name_2)
+        return (Stream.from_mass_composition(stream) for stream in streams)
+
+    def split_by_function(self, split_function: Callable,
+                          name_1: Optional[str] = None,
+                          name_2: Optional[str] = None) -> Generator['Stream', None, None]:
+        """Split an object using a function.
+
+        This method applies the function to self, resulting in two new objects. The object returned with name_1
+        is the result of the function.  The object returned with name_2 is the complement.
+
+        See also: split, split_by_estimator, split_by_partition
+
+        Args:
+            split_function: Any function that transforms the dataframe from a MassComposition object into a new
+             dataframe with values representing a new (output) stream.  The returned dataframe structure must be
+             identical to the input dataframe.
+            name_1: The name of the stream created by the function
+            name_2: The name of the complement stream created by the split, which is calculated automatically.
+
+        Returns:
+            A generator yielding two Streams: the result of the function, then its complement.
+
+
+        """
+        streams = super().split_by_function(split_function, name_1, name_2)
+        return (Stream.from_mass_composition(stream) for stream in streams)
+
+    def split_by_estimator(self, estimator: 'sklearn.base.BaseEstimator',
+                           name_1: Optional[str] = None,
+                           name_2: Optional[str] = None) -> Generator['Stream', None, None]:
+        """Split an object using a sklearn estimator.
+
+        This method applies the estimator to self, resulting in two new objects. The object returned with name_1
+        is the result of the estimator.predict() method.  The object returned with name_2 is the complement.
+
+        See also: split, split_by_function, split_by_partition
+
+        Args:
+            estimator: Any sklearn estimator whose predict method maps the dataframe from a MassComposition object
+             to values representing a new (output) stream.  The prediction may be a dataframe or a numpy array,
+             and must have the same structure (rows and columns) as the input dataframe.
+            name_1: The name of the stream created by the estimator.
+            name_2: The name of the complement stream created by the split, which is calculated automatically.
+
+        Returns:
+            A generator yielding two Streams: the result of estimator.predict(), then its complement.
+        """
+        streams = super().split_by_estimator(estimator, name_1, name_2)
+        return (Stream.from_mass_composition(stream) for stream in streams)
 
     def add(self, other: 'Stream', name: Optional[str] = None) -> 'Stream':
         """
diff --git a/examples/109_split_and_partition.py b/examples/109_split_and_partition.py
@@ -101,7 +101,7 @@
 #
 # Separate the object using the defined partition
 
-mc_coarse, mc_fine = mc_size.apply_partition(definition=partition)
+mc_coarse, mc_fine = mc_size.split_by_partition(partition_definition=partition)
 mc_coarse.name = 'coarse'
 mc_fine.name = 'fine'
 
diff --git a/examples/113_partition_models.py b/examples/113_partition_models.py
@@ -54,7 +54,7 @@
 # %%
 # Separate the object using the defined partitions.  UF = Underflow, OF = Overflow
 
-mc_uf, mc_of = mc_feed.apply_partition(definition=part_cyclone, name_1='underflow', name_2='overflow')
+mc_uf, mc_of = mc_feed.split_by_partition(partition_definition=part_cyclone, name_1='underflow', name_2='overflow')
 fs: Flowsheet = Flowsheet().from_streams([mc_feed, mc_uf, mc_of])
 
 fig = fs.table_plot(table_pos='left',
diff --git a/examples/114_compare_partitions.py b/examples/114_compare_partitions.py
@@ -79,9 +79,9 @@
 #
 # Separate the object using the defined partitions
 
-mc_ideal_coarse, mc_ideal_fine = mc_ideal_feed.apply_partition(definition=part_ideal,
-                                                               name_1='ideal_coarse', name_2='ideal_fine')
-mc_sim_coarse, mc_sim_fine = mc_sim_feed.apply_partition(definition=part_sim, name_1='sim_coarse', name_2='sim_fine')
+mc_ideal_coarse, mc_ideal_fine = mc_ideal_feed.split_by_partition(partition_definition=part_ideal,
+                                                                  name_1='ideal_coarse', name_2='ideal_fine')
+mc_sim_coarse, mc_sim_fine = mc_sim_feed.split_by_partition(partition_definition=part_sim, name_1='sim_coarse', name_2='sim_fine')
 
 
 fs: Flowsheet = Flowsheet().from_streams([mc_size, mc_ideal_feed, mc_sim_feed,
diff --git a/examples/202_interval_data_advanced.py b/examples/202_interval_data_advanced.py
@@ -47,7 +47,7 @@
 # We partially initialise a partition function, and split the feed stream accordingly.
 
 partition = partial(napier_munn, d50=0.150, ep=0.05, dim='size')
-mc_oversize, mc_undersize = mc_feed.apply_partition(definition=partition, name_1='OS', name_2='US')
+mc_oversize, mc_undersize = mc_feed.split_by_partition(partition_definition=partition, name_1='OS', name_2='US')
 
 # %%
 # Drop the two size fractions from mc_fine that have near zero mass.
diff --git a/examples/400_mass_balancing.py b/examples/400_mass_balancing.py
@@ -60,7 +60,7 @@
 #
 # Separate the object using the defined partition
 
-mc_coarse, mc_fine = mc_size.apply_partition(definition=partition, name_1='coarse', name_2='fine')
+mc_coarse, mc_fine = mc_size.split_by_partition(partition_definition=partition, name_1='coarse', name_2='fine')
 
 fs: Flowsheet = Flowsheet().from_streams([mc_size, mc_coarse, mc_fine])
 print(fs.balanced)
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/tests/fixtures.py b/tests/fixtures.py
diff --git a/tests/test_101_network_indexes.py b/tests/test_101_network_indexes.py
diff --git a/tests/test_102_splitting.py b/tests/test_102_splitting.py

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@`
`101`	`101`	`#`
`102`	`102`	`# Separate the object using the defined partition`
`103`	`103`
`104`		`-mc_coarse, mc_fine = mc_size.apply_partition(definition=partition)`
	`104`	`+mc_coarse, mc_fine = mc_size.split_by_partition(partition_definition=partition)`
`105`	`105`	`mc_coarse.name = 'coarse'`
`106`	`106`	`mc_fine.name = 'fine'`
`107`	`107`
Original file line number	Diff line number	Diff line change
`@@ -60,7 +60,7 @@`
`60`	`60`	`#`
`61`	`61`	`# Separate the object using the defined partition`
`62`	`62`
`63`		`-mc_coarse, mc_fine = mc_size.apply_partition(definition=partition, name_1='coarse', name_2='fine')`
	`63`	`+mc_coarse, mc_fine = mc_size.split_by_partition(partition_definition=partition, name_1='coarse', name_2='fine')`
`64`	`64`
`65`	`65`	`fs: Flowsheet = Flowsheet().from_streams([mc_size, mc_coarse, mc_fine])`
`66`	`66`	`print(fs.balanced)`