Skip to content

Commit 99e754c

Browse files
committed
closes #173. Improved prefix management. Requested functionality already exists.
1 parent c4c0e8a commit 99e754c

File tree

7 files changed

+115
-41
lines changed

7 files changed

+115
-41
lines changed

CHANGELOG.rst

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,15 @@
1-
Mass_Composition 0.6.5 (2024-05-18)
1+
Mass_Composition 0.6.7 (2024-05-19)
2+
===================================
3+
4+
Feature
5+
-------
6+
7+
- Improved prefix management on mc init from dataframe
8+
- Added cleaning code to iron_ore_met_sample_data" method in sample_data
9+
- No changes need to provide additional features to split_by_estimator since attr vars are already
10+
passed to outputs of math operations. (#172)
11+
12+
Mass_Composition 0.6.6 (2024-05-18)
213
===================================
314

415
Feature

elphick/mass_composition/dag.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ def add_input(self, name: str) -> 'DAG':
6767

6868
def add_step(self, name: str, operation: Callable, streams: List[str], kwargs: dict = None,
6969
defined: bool = True) -> 'DAG':
70+
if name in self.all_nodes_:
71+
raise ValueError(f"A step with the name '{name}' already exists.")
7072
# Determine dependencies from the input streams
7173
dependencies = [self.stream_parent_node[stream] for stream in streams]
7274
self.graph.add_node(name, operation=operation, dependencies=dependencies, kwargs=kwargs, defined=defined)

elphick/mass_composition/datasets/sample_data.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,16 @@ def iron_ore_met_sample_data() -> pd.DataFrame:
154154
col.strip().lower().replace(' ', '_').replace('(', '').replace(')', '').replace('%', 'pct').replace('__', '_')
155155
for
156156
col in df_met.columns]
157+
158+
# clean up some values and types
159+
df_met = df_met.replace('-', np.nan).replace('#VALUE!', np.nan)
160+
head_cols: List[str] = [col for col in df_met.columns if 'head' in col]
161+
df_met[head_cols] = df_met[head_cols].astype('float64')
162+
df_met['bulk_hole_no'] = df_met['bulk_hole_no'].astype('category')
163+
df_met['sample_number'] = df_met['sample_number'].astype('int64')
164+
df_met.set_index('sample_number', inplace=True)
165+
166+
# moves suffixes to prefix
157167
df_met = df_met.pipe(_move_suffix_to_prefix, '_head')
158168
df_met = df_met.pipe(_move_suffix_to_prefix, '_lump')
159169
return df_met

elphick/mass_composition/mass_composition.py

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
import os
3+
from collections import Counter
34
from copy import deepcopy
45
from pathlib import Path
56
from typing import Dict, List, Optional, Union, Tuple, Iterable, Callable, Set, Literal, Any
@@ -75,14 +76,36 @@ def __init__(self,
7576
self.status: Optional[Status] = None
7677

7778
if data is not None:
79+
data = deepcopy(data) # preserve the incoming data variable.
7880
self.set_data(data, constraints=constraints)
7981

8082
@staticmethod
8183
def _strip_common_prefix(df: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
82-
common_prefix = os.path.commonprefix(df.columns.to_list())
83-
stripped_df = df.copy()
84-
stripped_df.columns = [col.replace(common_prefix, '') for col in df.columns]
85-
return stripped_df, common_prefix
84+
# Extract prefixes
85+
common_prefix = MassComposition.get_common_prefix(df.columns.to_list())
86+
87+
res = df
88+
# Create a copy of the dataframe and strip the most common prefix from column names
89+
if common_prefix:
90+
res = df.copy()
91+
res.columns = [col.replace(common_prefix + '_', '') if col.startswith(common_prefix) else col for col in
92+
df.columns]
93+
94+
return res, common_prefix
95+
96+
@staticmethod
97+
def get_common_prefix(columns: List[str]) -> str:
98+
prefixes = [col.split('_')[0] for col in columns]
99+
# Count the frequency of each prefix
100+
prefix_counter = Counter(prefixes)
101+
# Check if prefix_counter is not empty
102+
if prefix_counter:
103+
# Find the most common prefix
104+
common_prefix, freq = prefix_counter.most_common(1)[0]
105+
# Only return the prefix if its frequency is 3 or more
106+
if freq >= 3:
107+
return common_prefix
108+
return ""
86109

87110
def set_data(self, data: Union[pd.DataFrame, xr.Dataset],
88111
constraints: Optional[Dict[str, List]] = None):
@@ -104,7 +127,8 @@ def set_data(self, data: Union[pd.DataFrame, xr.Dataset],
104127
# seek a prefix to self assign the name
105128
data, common_prefix = self._strip_common_prefix(data)
106129
if common_prefix:
107-
self._specified_columns = {k: v.replace(common_prefix, '') for k, v in self._specified_columns.items()
130+
self._specified_columns = {k: v.replace(f"{common_prefix}_", '') for k, v in
131+
self._specified_columns.items()
108132
if v is not None}
109133

110134
self.variables = Variables(config=self.config['vars'],
@@ -630,24 +654,24 @@ def split_by_estimator(self,
630654
"""
631655
# Extract feature names from the estimator, and get the actual features
632656
feature_names: list[str] = list(extract_feature_names(estimator))
633-
features: pd.DataFrame = self._get_features(feature_names, extra_features, allow_prefix_mismatch)
657+
features: pd.DataFrame = self._get_features(feature_names, allow_prefix_mismatch=allow_prefix_mismatch,
658+
extra_features=extra_features)
634659

635660
# Apply the estimator
636661
estimates: pd.DataFrame = estimator.predict(X=features)
637662
if isinstance(estimates, np.ndarray):
638663
raise NotImplementedError("The estimator must return a DataFrame")
639664

640665
# Detect a possible prefix from the estimate columns
641-
features_prefix: str = os.path.commonprefix(features.columns.to_list())
642-
estimates_prefix: str = os.path.commonprefix(estimates.columns.to_list())
666+
features_prefix: str = self.get_common_prefix(features.columns.to_list())
667+
estimates_prefix: str = self.get_common_prefix(estimates.columns.to_list())
643668

644669
# If there is a prefix, check that it matches name_1, subject to allow_prefix_mismatch
645-
if estimates_prefix.strip(
646-
'_') and not allow_prefix_mismatch and name_1 and not name_1 == estimates_prefix.strip('_'):
670+
if estimates_prefix and not allow_prefix_mismatch and name_1 and not name_1 == estimates_prefix:
647671
raise ValueError(f"Common prefix mismatch: {features_prefix} and name_1: {name_1}")
648672

649673
# assign the output names, based on specified names, allow for prefix mismatch
650-
name_1 = name_1 if name_1 else estimates_prefix.strip('_')
674+
name_1 = name_1 if name_1 else estimates_prefix
651675

652676
if mass_recovery_column:
653677
# Transform the mass recovery to mass by applying the mass recovery to the dry mass of the input stream
@@ -661,7 +685,9 @@ def split_by_estimator(self,
661685
dry_mass_var].values / mass_recovery_max
662686
estimates.rename(columns={mass_recovery_column: dry_mass_var}, inplace=True)
663687

664-
estimates.columns = [f.replace(estimates_prefix, "") for f in estimates.columns]
688+
if estimates_prefix:
689+
col_name_map: dict[str, str] = {f: f.replace(estimates_prefix + '_', "") for f in estimates.columns}
690+
estimates.rename(columns=col_name_map, inplace=True)
665691

666692
out: MassComposition = MassComposition(name=name_1, constraints=self.constraints, data=estimates)
667693
comp: MassComposition = self.sub(other=out, name=name_2)
@@ -671,7 +697,7 @@ def split_by_estimator(self,
671697
return out, comp
672698

673699
def _get_features(self, feature_names: List[str], allow_prefix_mismatch: bool,
674-
extra_features: Optional[pd.DataFrame] = None,) -> pd.DataFrame:
700+
extra_features: Optional[pd.DataFrame] = None, ) -> pd.DataFrame:
675701
"""
676702
This method checks if the feature names required by an estimator are present in the data. If not, it tries to
677703
match the feature names by considering a common prefix. If a match is found, the columns in the data are renamed
@@ -696,16 +722,16 @@ def _get_features(self, feature_names: List[str], allow_prefix_mismatch: bool,
696722
feature_name_map = {name.lower(): name for name in feature_names}
697723

698724
df_features: pd.DataFrame = self.data.to_dataframe()
699-
if extra_features:
725+
if extra_features is not None:
700726
df_features = pd.concat([df_features, extra_features], axis=1)
701727

702728
missing_features = set(f.lower() for f in feature_names) - set(c.lower() for c in df_features.columns)
703729

704730
if missing_features:
705731
prefix: str = f"{self.name}_"
706-
common_prefix: str = os.path.commonprefix(feature_names)
707-
if common_prefix and common_prefix != prefix and allow_prefix_mismatch:
708-
prefix = common_prefix
732+
common_prefix: str = self.get_common_prefix(feature_names)
733+
if common_prefix and common_prefix + '_' != prefix and allow_prefix_mismatch:
734+
prefix = common_prefix + '_'
709735

710736
# create a map to support renaming the columns
711737
prefixed_feature_map: dict[str, str] = {f: feature_name_map.get(f"{prefix}{f.lower()}") for f in

examples/504_dag_with_estimator.py

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
# This import at the top to guard against the estimator extras not being installed
1818
from elphick.mass_composition.utils.sklearn import PandasPipeline
1919

20-
import numpy as np
2120
import pandas as pd
2221
import plotly
2322
from sklearn.ensemble import RandomForestRegressor
@@ -39,18 +38,19 @@
3938
# ---------
4039
#
4140
# We load some metallurgical data from a drill program, REF: A072391
41+
# Since we are not concerned about the model performance in this example, we'll convert the categorical feature
42+
# bulk_hole_no to an integer
4243

4344
df: pd.DataFrame = iron_ore_met_sample_data()
4445

4546
base_components = ['fe', 'p', 'sio2', 'al2o3', 'loi']
46-
cols_x = ['dry_weight_lump_kg'] + [f'head_{comp}' for comp in base_components]
47+
cols_x = ['dry_weight_lump_kg'] + [f'head_{comp}' for comp in base_components] + ['bulk_hole_no']
4748
cols_y = ['lump_pct'] + [f'lump_{comp}' for comp in base_components]
4849

49-
# %%
50-
df = df.loc[:, ['sample_number'] + cols_x + cols_y].query('lump_pct>0').replace('-', np.nan).astype(float).dropna(
51-
how='any')
52-
df = df.rename(columns={'dry_weight_lump_kg': 'head_mass_dry'}).set_index('sample_number')
53-
df.index = df.index.astype(int)
50+
df = df.loc[:, cols_x + cols_y].query('lump_pct>0').dropna(how='any')
51+
df = df.rename(columns={'dry_weight_lump_kg': 'head_mass_dry'})
52+
df['bulk_hole_no'] = df['bulk_hole_no'].astype('category').cat.codes
53+
5454
logger.info(df.shape)
5555
df.head()
5656

@@ -86,40 +86,65 @@
8686
# ------------------------------------
8787
# Now we will create a MassComposition object and use it to apply the model to the feed stream.
8888

89-
head: MassComposition = MassComposition(data=X[[col for col in X.columns if 'head' in col]],
89+
head: MassComposition = MassComposition(data=X_test.drop(columns=['bulk_hole_no']), name='head',
9090
mass_dry_var='head_mass_dry')
9191
lump, fines = head.split_by_estimator(estimator=pipe, name_2='fines',
92-
mass_recovery_column='lump_pct', mass_recovery_max=100)
92+
mass_recovery_column='lump_pct', mass_recovery_max=100,
93+
extra_features=X_test['bulk_hole_no'])
94+
lump.data.to_dataframe().head()
9395

94-
lump
9596
# %%
96-
fines
97+
fines.data.to_dataframe().head()
9798

9899
# %%
99100
# Define the DAG
100101
# --------------
101102
#
102-
# The DAG is defined by adding nodes to the graph. Each node is an input, output or Stream operation
103-
# (e.g. add, split, etc.). The nodes are connected by the streams they operate on.
103+
# First we define a simple DAG, where the feed stream is split into two streams, lump and fines.
104+
# The lump estimator requires the usual mass-composition variables plus an addition feature/variable
105+
# called `bulk_hole_no`. Since the `bulk_hole_no` is available in the feed stream, it is immediately accessible
106+
# to the estimator.
107+
108+
head: MassComposition = MassComposition(data=X_test, name='head',
109+
mass_dry_var='head_mass_dry')
104110

105-
dag = DAG(name='A072391', n_jobs=2)
111+
dag = DAG(name='A072391', n_jobs=1)
106112
dag.add_input(name='head')
107113
dag.add_step(name='screen', operation=Stream.split_by_estimator, streams=['head'],
108114
kwargs={'estimator': pipe, 'name_1': 'lump', 'name_2': 'fines',
109115
'mass_recovery_column': 'lump_pct', 'mass_recovery_max': 100})
110116
dag.add_output(name='lump', stream='lump')
111117
dag.add_output(name='fines', stream='fines')
118+
dag.run(input_streams={'head': head}, progress_bar=True)
119+
120+
fig = Flowsheet.from_dag(dag).plot_network()
121+
fig
112122

113123
# %%
114-
# Run the DAG
115-
# -----------
124+
# More Complex DAG
125+
# ----------------
126+
# This DAG is to test a more complex flowsheet where the estimator may have all the features
127+
# immediately available in the parent stream.
116128
#
117-
# The dag is run by providing a Stream object for the input.
129+
# .. note::
130+
# This example works, but it does so since all attribute (extra) variables are passed all the way around
131+
# the network in the current design. This is to be changed in the future to allow for more efficient processing.
132+
# Once attributes are no longer passed, changes will be needed to the DAG to marshall
133+
# features from other streams in the network (most often the input stream).
118134

119-
dag.run({'head': head}, progress_bar=True)
120-
121-
# %%
122-
# Create a Flowsheet object from the dag, enabling all the usual network plotting and analysis methods.
135+
dag = DAG(name='A072391', n_jobs=1)
136+
dag.add_input(name='head')
137+
dag.add_step(name='screen', operation=Stream.split_by_estimator, streams=['head'],
138+
kwargs={'estimator': pipe, 'name_1': 'lump', 'name_2': 'fines',
139+
'mass_recovery_column': 'lump_pct', 'mass_recovery_max': 100})
140+
dag.add_step(name='screen_2', operation=Stream.split_by_estimator, streams=['fines'],
141+
kwargs={'estimator': pipe, 'name_1': 'lump_2', 'name_2': 'fines_2',
142+
'mass_recovery_column': 'lump_pct', 'mass_recovery_max': 100,
143+
'allow_prefix_mismatch': True})
144+
dag.add_output(name='lump', stream='lump_2')
145+
dag.add_output(name='fines', stream='fines_2')
146+
dag.add_output(name='stockpile', stream='lump')
147+
dag.run(input_streams={'head': head}, progress_bar=True)
123148

124149
fs: Flowsheet = Flowsheet.from_dag(dag)
125150

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "mass-composition"
3-
version = "0.6.6"
3+
version = "0.6.7"
44
description = "For managing multi-dimensional mass-composition datasets, supporting weighted mathematical operations and visualisation."
55
authors = ["Greg <greg@elphick.com.au>"]
66
packages = [{ include = "elphick/mass_composition" }]

tests/test_names.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ def test_names(demo_data):
1111
xr_data: xr.Dataset = obj_mc._data
1212
df_export: pd.DataFrame = xr_data.mc.to_dataframe(original_column_names=True)
1313
for col in demo_data.columns:
14-
assert col in list(df_export.columns), f'{col} is not in {list(demo_data.columns)}'
14+
assert col in df_export.columns.to_list(), f'{col} is not in {demo_data.columns.to_list()}'

0 commit comments

Comments
 (0)