Fix WorkFlow.predict method for MAPIE return_std to match one standard deviation. Added arguments to specify whether to use MAPIE or min_max in the analysis and utils packages.
Harrison Martin committed Jan 23, 2024
1 parent b5f0c27 commit 9be8b9b
Showing 3 changed files with 68 additions and 32 deletions.
52 changes: 42 additions & 10 deletions LECA/analyze.py
@@ -649,7 +649,7 @@ def create_input(feature_dict: Dict[str, List[float]], steps: int=10, temp: Unio


def predict_conductivity_from_arrhenius_objectives(x_in: pd.DataFrame, wf: WorkFlow,
- model: Union[str, List[str]], beta_0: float, log:bool=False
+ model: Union[str, List[str]], beta_0: float, log:bool=False, min_max:bool=False
) -> pd.DataFrame:
"""
Predict the ionic conductivity for given electrolyte compositions at a given temperature.
@@ -678,6 +678,14 @@ def predict_conductivity_from_arrhenius_objectives(x_in: pd.DataFrame, wf: WorkF
Default value ``False``
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
Returns
-------
pd.DataFrame
@@ -699,7 +707,7 @@ def predict_conductivity_from_arrhenius_objectives(x_in: pd.DataFrame, wf: WorkF
inv_temp = x_input['inverse temperature']
temp_offset = x_input['inverse temperature'] - beta_0
x_input.drop('inverse temperature', axis=1, inplace=True)
- pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=True, return_std=True)
+ pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=min_max, return_std=True)
log_cond = (unumpy.uarray(pred['S0'], pred['S0_std'])) \
- (unumpy.uarray(pred['S1'], pred['S1_std']))*(temp_offset) \
- (unumpy.uarray(pred['S2'], pred['S2_std']))*(temp_offset)**2
@@ -713,7 +721,7 @@ def predict_conductivity_from_arrhenius_objectives(x_in: pd.DataFrame, wf: WorkF
return pred
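For context, a minimal usage sketch of the new switch (hypothetical: `wf` is an already-fitted WorkFlow, and the feature columns, model name 'rf', and beta_0 value are placeholders, not taken from this commit):

import pandas as pd

# Inputs need the composition features plus an 'inverse temperature' column.
x_in = pd.DataFrame({'x_LiSalt': [0.05, 0.10],          # placeholder feature
                     'inverse temperature': [1000 / 298.15] * 2})

# Default: MAPIE jackknife+-after-bootstrap uncertainty, rescaled to ~1 sigma.
pred = predict_conductivity_from_arrhenius_objectives(
    x_in, wf, model='rf', beta_0=1000 / 298.15, log=False, min_max=False)

# Alternative: standard deviation across the bootstrapped models' predictions.
pred = predict_conductivity_from_arrhenius_objectives(
    x_in, wf, model='rf', beta_0=1000 / 298.15, log=False, min_max=True)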

def predict_conductivity_from_log_conductivity_objective(x_in: pd.DataFrame, wf: WorkFlow,
- model: str, log:bool=False, objective:str='log conductivity'
+ model: str, log:bool=False, objective:str='log conductivity', min_max:bool=False
) -> pd.DataFrame:
"""
Predict the ionic conductivity for given electrolyte compositions at a given temperature. This function
@@ -742,6 +750,14 @@ def predict_conductivity_from_log_conductivity_objective(x_in: pd.DataFrame, wf:
Default value ``"log conductivity"``
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
Returns
-------
pd.DataFrame
@@ -756,7 +772,7 @@ def predict_conductivity_from_log_conductivity_objective(x_in: pd.DataFrame, wf:

x_input = x_in.copy()
inv_temp = x_input['inverse temperature']
- pred = wf.predict(x_input, {objective: model}, min_max=True, return_std=True)
+ pred = wf.predict(x_input, {objective: model}, min_max=min_max, return_std=True)
log_cond = unumpy.uarray(pred[objective], pred[objective+'_std'])
if log == False:
cond = unumpy.pow(10,log_cond)
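The error propagation above relies on the uncertainties package; the same pattern in isolation (toy numbers, not from the diff):

from uncertainties import unumpy

# Toy log-conductivity predictions with one-sigma uncertainties.
log_cond = unumpy.uarray([-2.1, -1.8], [0.05, 0.08])

cond = unumpy.pow(10, log_cond)     # 10**x with propagated uncertainty
print(unumpy.nominal_values(cond))  # nominal conductivities
print(unumpy.std_devs(cond))        # propagated one-sigma errors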
@@ -771,7 +787,7 @@ def plot_1D(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, List[
temperatures:Union[int, float, List[int], List[float]]=20, steps:int=50, ylim:Optional[Tuple[float,float]]=None,
multiply_by_salt:bool=False, log:bool=False,
model_labels:Optional[List[str]]=None, wf_labels:Optional[List[str]]=None,
- confidence:float = 1.0, save_loc: Union[str, bool] = False, objective:str='log conductivity',
+ min_max:bool=False, confidence:float = 1.0, save_loc: Union[str, bool] = False, objective:str='log conductivity',
indicate_max:Tuple[Optional[str],Union[int, float],Union[int,float]]=(None,0.8, -1)) -> None:
"""
1-dimensional slice along one feature for models predicted conductivity / log(conductivity).
@@ -843,6 +859,14 @@ def plot_1D(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, List[
Default value ``None``
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
confidence: float
Scalar value to multiply the estimated uncertainty. By default this value is ``1.0`` which results in the
plotted errorbars showing one standard-deviation. E.g. ``confidence=1.96`` would then reflect an
@@ -908,9 +932,9 @@ def plot_1D(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, List[
for model, model_label in zip(models, model_labels):
for wf, wf_label, beta_0 in zip(wfs, wf_labels, beta_0_list):
if beta_0 == -1:
- specific_prediction = predict_conductivity_from_log_conductivity_objective(x_input, wf, model, log, objective)
+ specific_prediction = predict_conductivity_from_log_conductivity_objective(x_input, wf, model, log, objective, min_max=min_max)
else:
- specific_prediction = predict_conductivity_from_arrhenius_objectives(x_input, wf, model, beta_0, log)
+ specific_prediction = predict_conductivity_from_arrhenius_objectives(x_input, wf, model, beta_0, log, min_max=min_max)

if multiply_by_salt == True:
if log == False:
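A hypothetical `plot_1D` call with the new pass-through (the workflow, model name, and feature range below are placeholders):

plot_1D(wfs=[wf], models=['rf'],
        feature_dict={'x_LiSalt': [0.01, 0.20]},  # placeholder 1D slice
        temperatures=[20, 40], steps=50, log=False,
        min_max=False,     # error bars from MAPIE intervals
        confidence=1.96)   # ~95% bars, assuming roughly normal errors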
@@ -991,7 +1015,7 @@ def plot_1D_Sx(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, Li
beta_0_list:List[float], steps:int=50, objectives:List[str]=['S0', 'S1', 'S2'],
ylim:Optional[Tuple[float,float]]=None, multiply_by_salt:bool=False,
model_labels:Optional[List[str]]=None, wf_labels:Optional[List[str]]=None,
- confidence:float = 1.0, save_loc: Union[str, bool] = False) -> None:
+ min_max:bool=False, confidence:float = 1.0, save_loc: Union[str, bool] = False) -> None:
"""
1-dimensional slice along one feature for models predicted arrhenius objectives S0, S1 and S2.
Three plots will be rendered which show the S0, S1 and S2 predictions for the argument defined
@@ -1049,6 +1073,14 @@ def plot_1D_Sx(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, Li
Default value ``None``
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
confidence: float
Scalar value to multiply the estimated uncertainty. By default this value is ``1.0`` which results in the
plotted errorbars showing one standard-deviation. E.g. ``confidence=1.96`` would then reflect an
@@ -1087,7 +1119,7 @@ def plot_1D_Sx(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, Li
else:
m1 = m2 = m3 = model
for wf, wf_label, beta_0 in zip(wfs, wf_labels, beta_0_list):
- pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=True, return_std=True)
+ pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=min_max, return_std=True)
if multiply_by_salt == True:
pred['S0'] = pred['S0'] + np.log10(x_input['x_LiSalt'])
specific_prediction = pd.concat([x_input, pred], axis=1)
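Note the `multiply_by_salt` adjustment works in the log domain, where multiplying the conductivity by the salt fraction becomes an addition (toy values):

import numpy as np

S0, x_LiSalt = -2.0, 0.1
assert np.isclose(10**(S0 + np.log10(x_LiSalt)), 10**S0 * x_LiSalt)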
@@ -1346,7 +1378,7 @@ def plot_2D_Sx(wf: WorkFlow, model: Union[str, List[str]], feature_dict:Dict[str
m1 = m2 = m3 = model

# Build prediction dataframe
- pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=True, return_std=True)
+ pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3})
if multiply_by_salt == True:
pred['S0'] = pred['S0'] + np.log10(x_input['x_LiSalt'])
specific_prediction = pd.concat([x_input, pred], axis=1)
38 changes: 21 additions & 17 deletions LECA/fit.py
@@ -1496,7 +1496,7 @@ def predict(self,
min_max: bool
Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
- `min_max`:``True`` takes the minimum and maximum prediction of the bootstrapped models to be the range of uncertainty.
+ `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
`min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
This parameter is moot for GPR models.
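At the API level the two modes look like this (sketch; `wf` is a fitted WorkFlow, and `X_new` plus the objective/model names are placeholders):

# MAPIE jackknife+-AB intervals, now rescaled to ~one standard deviation:
pred = wf.predict(X_new, {'S0': 'rf'}, min_max=False, return_std=True)

# Standard deviation across the bootstrapped models instead:
pred = wf.predict(X_new, {'S0': 'rf'}, min_max=True, return_std=True)

pred[['S0', 'S0_std']]  # point prediction and its one-sigma estimate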
@@ -1594,8 +1594,8 @@ def predict(self,

##otherwise use MAPIE Jackknife+-ab (i.e. include conformity scores penalty)
else:
- y, std = uncert.predict(local_X, alpha=0.32)# -> 68% confidence interval == 1*sigma ASSUMING normal distribution
- std = np.abs(std[:,1]-std[:,0]).ravel()
+ y, std = uncert.predict(local_X, alpha=0.16, ensemble=True)# -> theoretical guarantee of 68% confidence interval == +-1*sigma ASSUMING normal distribution
+ std = np.abs((std[:,1]-std[:,0])/2).ravel() # 68% interval / 2 -> ~1*sigma
prediction = pd.concat([prediction, pd.Series(y, name=obj), pd.Series(std, name=obj+'_std')], axis=1)

prediction.index = X.index # cast prediction indices to match the indices of the X input
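The corrected conversion can be reproduced standalone with MAPIE's jackknife+-after-bootstrap regressor on toy data (a sketch: the MapieRegressor/Subsample usage follows MAPIE's documented API, and return shapes may vary across MAPIE versions):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from mapie.regression import MapieRegressor
from mapie.subsample import Subsample

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, (300, 1))
y = np.sin(X[:, 0]) + rng.normal(0, 0.1, 300)

mapie = MapieRegressor(RandomForestRegressor(), method="plus",
                       cv=Subsample(n_resamplings=30))
mapie.fit(X, y)

# Jackknife+ guarantees coverage >= 1 - 2*alpha, so alpha=0.16 yields the
# ~68% interval that corresponds to +-1 sigma under a normality assumption.
y_pred, y_pis = mapie.predict(X, alpha=0.16, ensemble=True)
std = np.abs((y_pis[:, 1] - y_pis[:, 0]) / 2).ravel()  # half-width ~ 1 sigma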
@@ -1908,26 +1908,21 @@ def arrh_predict(x_input, log, deviate_by_salt):
x_input = x_input[self.features]
# By default take highest scoring models
if models == None:
- pred = self.predict(x_input, min_max=True, return_std=True)
+ pred = self.predict(x_input)
else:
- pred = self.predict(x_input, dict(zip(arrhenius_objectives, models)), min_max=True, return_std=True)
+ pred = self.predict(x_input, dict(zip(arrhenius_objectives, models)))

s0, s1, s2 = arrhenius_objectives[0], arrhenius_objectives[1], arrhenius_objectives[2]
#to compare log(conductivity/x_LiSalt)
if deviate_by_salt == True:
- cond = (unumpy.uarray(pred[s0], pred[s0+'_std'])
-         - unumpy.uarray(pred[s1], pred[s1+'_std'])*(temp_offset)
-         - unumpy.uarray(pred[s2], pred[s2+'_std'])*(temp_offset)**2)
+ cond = pred[s0] - pred[s1]*(temp_offset) - pred[s2]*(temp_offset)**2
#to compare log conductivity
else:
- cond = (unumpy.uarray(pred[s0]+np.log10(x_input['x_LiSalt']), pred[s0+'_std'])
-         - unumpy.uarray(pred[s1], pred[s1+'_std'])*(temp_offset)
-         - unumpy.uarray(pred[s2], pred[s2+'_std'])*(temp_offset)**2)
+ cond = pred[s0] + np.log10(x_input['x_LiSalt']) - pred[s1]*(temp_offset) - pred[s2]*(temp_offset)**2
#to compare conductivity
if log == False:
- cond = unumpy.pow(10,cond) #this must be changed compared to the original function
- return pd.DataFrame({original_objective:unumpy.nominal_values(cond),
-                      original_objective+'_std':unumpy.std_devs(cond)})
+ cond = np.power(10,cond)
+ return pd.DataFrame({original_objective:cond})

fig, ax = plt.subplots(1, 1, figsize=(4, 4))
if custom_label==None:
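The quantity being plotted is the quadratic Arrhenius reconstruction log10(sigma) = S0 - S1*(beta - beta_0) - S2*(beta - beta_0)**2; a worked toy example (all coefficients invented for illustration):

import numpy as np

S0, S1, S2 = -2.0, 0.8, 0.05      # toy Arrhenius objectives
beta_0 = 1000 / 298.15            # toy reference inverse temperature
beta = 1000 / 313.15              # toy query inverse temperature
temp_offset = beta - beta_0

log_cond = S0 - S1 * temp_offset - S2 * temp_offset**2
cond = np.power(10, log_cond)     # the log == False branch above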
@@ -2249,7 +2244,8 @@ def optimize(self, strategy:str = 'max',
obj_fn:Optional[Callable] = None,
fixed_values:Optional[Dict[str,float]] = None,
bounds:Optional[Dict[str,Tuple[float]]] = None,
- n_restarts_optimizer:int = 100) -> pd.DataFrame:
+ n_restarts_optimizer:int = 100,
+ min_max:bool = False) -> pd.DataFrame:
"""
Optimizer to search design space for max/min objective value, bayesian expected improvement, upper/lower confidence bound and maximum uncertainty strategies. Returns optimal input feature set to query for given strategy.
@@ -2286,6 +2282,14 @@ def optimize(self, strategy:str = 'max',
Number of random points in the design space from which the acquisition function will be optimized. Higher -> more computationally expensive, but higher chance of finding global best acquisition point.
Default value ``100``.
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
Returns
-------
@@ -2314,7 +2318,7 @@ def optimize(self, strategy:str = 'max',
# Calculate optimal sample for each objective function
for obj in objectives:
def obj_fn(x):
- return self.predict(x, obj, X_scaled=False, min_max=True,return_std=True)
+ return self.predict(x, obj, X_scaled=False, min_max=min_max,return_std=True)

obj_optimal_sample = self.optimize(strategy=strategy,
obj_fn=obj_fn,
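A hypothetical top-level call with the new pass-through (feature names and bounds are invented for illustration):

best = wf.optimize(strategy='max',
                   fixed_values={'inverse temperature': 1000 / 298.15},
                   bounds={'x_LiSalt': (0.01, 0.20)},
                   n_restarts_optimizer=100,
                   min_max=False)  # uncertainty mode forwarded to predict()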
@@ -2425,7 +2429,7 @@ def f(x):
def _optimizer(self, f, bounds, n_restarts_optimizer:int = 100):
min_list = []
# Brute force here to avoid local minima. Generate random in-bounds x0 values to attempt minimization
- x0_array = np.array([np.random.uniform(*min_max, n_restarts_optimizer) for min_max in bounds.values()]).T
+ x0_array = np.array([np.random.uniform(*min_max_bounds, n_restarts_optimizer) for min_max_bounds in bounds.values()]).T
for x0 in x0_array:
res = opt.minimize(fun=f,
x0=x0,
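The renaming in `_optimizer` stops the (low, high) bound tuples from shadowing the new `min_max` argument. The multi-start pattern itself, sketched standalone on a toy objective:

import numpy as np
import scipy.optimize as opt

def f(x):  # toy objective with minimum at (1, -2)
    return (x[0] - 1.0)**2 + (x[1] + 2.0)**2

bounds = {'a': (-5.0, 5.0), 'b': (-5.0, 5.0)}
n_restarts = 20

# Random in-bounds starting points to dodge local minima.
x0_array = np.array([np.random.uniform(*lo_hi, n_restarts)
                     for lo_hi in bounds.values()]).T
best = min((opt.minimize(f, x0=x0, bounds=list(bounds.values()))
            for x0 in x0_array), key=lambda res: res.fun)
print(best.x)  # ~ [1.0, -2.0]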
10 changes: 5 additions & 5 deletions LECA/notebook_utils.py
@@ -63,7 +63,7 @@ def data_vis(change=None):
return output
#
#
- def interactive_plot(wf, temps, prediction_fn):
+ def interactive_plot(wf, temps, prediction_fn, confidence:float=1.0):
# Initialize plot figure
fig = plt.figure(figsize=(6,3))
ax = fig.add_subplot(111)
@@ -120,7 +120,7 @@ def update(change=None):
fixed_values = fixed_values[:-2]

prediction = prediction_fn(x_input)
- ax.errorbar(feature_range,prediction['conductivity'],yerr=1.96*(prediction['conductivity_std']),xerr=None,label=str(temp) + r'$\degree$C',alpha=1,capsize=3)
+ ax.errorbar(feature_range,prediction['conductivity'],yerr=confidence*(prediction['conductivity_std']),xerr=None,label=str(temp) + r'$\degree$C',alpha=1,capsize=3)
obj=r'$\sigma$'
dynamic_plot(obj, feature_var, fixed_values)

@@ -253,7 +253,7 @@ def on_button_clicked(_):
a = widgets.VBox([out, ui])
display(a)

- def prediction_plot_2d(wf, model_name, f): # Todo: generalize this for 2d regression problems (feature/obj::X/y)
+ def prediction_plot_2d(wf, model_name, f, min_max:bool=False, confidence:float=1.0): # Todo: generalize this for 2d regression problems (feature/obj::X/y)

# Initialize plot figure
fig = plt.figure(figsize=(6,4))
@@ -263,12 +263,12 @@ def prediction_plot_2d(wf, model_name, f): # Todo: generalize this for 2d regres
n_samples = 100
X = np.linspace(-2, 10, n_samples)
X_df = pd.DataFrame(X, columns=['x'])
- pred = wf.predict(X_df, {'y':model_name}, min_max=True, return_std=True)
+ pred = wf.predict(X_df, {'y':model_name}, min_max=min_max, return_std=True)

ax.fill_between(X, pred['y']-pred['y_std'], pred['y']+pred['y_std'],
label=r'$\pm \sigma$', color='orange', alpha=0.5, linestyle='-')

- ax.errorbar(wf.X_unscaled['x'],wf.y['y'],yerr=1.96*wf.std['y_std'],xerr=None,label='Training Data', fmt='.', c='black', alpha=0.5,capsize=1)
+ ax.errorbar(wf.X_unscaled['x'],wf.y['y'],yerr=confidence*wf.std['y_std'],xerr=None,label='Training Data', fmt='.', c='black', alpha=0.5,capsize=1)

ax.errorbar(X,f(X),label='f(x)', fmt='--', alpha=1)
ax.errorbar(X,pred['y'],label='Prediction', color='r', alpha=1)
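And the shaded band drawn above is a plain fill_between around the mean prediction; the same idea standalone (toy curve):

import numpy as np
import matplotlib.pyplot as plt

X = np.linspace(-2, 10, 100)
y, y_std = np.sin(X), np.full(100, 0.2)

fig, ax = plt.subplots()
ax.fill_between(X, y - y_std, y + y_std, color='orange', alpha=0.5,
                label=r'$\pm \sigma$')
ax.plot(X, y, color='r', label='Prediction')
ax.legend()
plt.show()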
