Fix WorkFlow.predict method for MAPIE return_std to match one standard deviation. Added arguments to specify whether to use MAPIE or min_max in the analysis and utils packages.
Harrison Martin committed Jan 23, 2024
1 parent b5f0c27 commit 9be8b9b
Showing 3 changed files with 68 additions and 32 deletions.
52 changes: 42 additions & 10 deletions LECA/analyze.py
@@ -649,7 +649,7 @@ def create_input(feature_dict: Dict[str, List[float]], steps: int=10, temp: Unio


def predict_conductivity_from_arrhenius_objectives(x_in: pd.DataFrame, wf: WorkFlow,
- model: Union[str, List[str]], beta_0: float, log:bool=False
+ model: Union[str, List[str]], beta_0: float, log:bool=False, min_max:bool=False
) -> pd.DataFrame:
"""
Predict the ionic conductivity for given electrolyte compositions at a given temperature.
@@ -678,6 +678,14 @@ def predict_conductivity_from_arrhenius_objectives(x_in: pd.DataFrame, wf: WorkF
Default value ``False``
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
Returns
-------
pd.DataFrame
@@ -699,7 +707,7 @@ def predict_conductivity_from_arrhenius_objectives(x_in: pd.DataFrame, wf: WorkF
inv_temp = x_input['inverse temperature']
temp_offset = x_input['inverse temperature'] - beta_0
x_input.drop('inverse temperature', axis=1, inplace=True)
- pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=True, return_std=True)
+ pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=min_max, return_std=True)
log_cond = (unumpy.uarray(pred['S0'], pred['S0_std'])) \
- (unumpy.uarray(pred['S1'], pred['S1_std']))*(temp_offset) \
- (unumpy.uarray(pred['S2'], pred['S2_std']))*(temp_offset)**2
@@ -713,7 +721,7 @@ def predict_conductivity_from_arrhenius_objectives(x_in: pd.DataFrame, wf: WorkF
return pred
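For context, a minimal usage sketch of the new switch (hypothetical: `wf` is an already-fitted WorkFlow, and the feature columns, model name 'rf', and beta_0 value are placeholders, not taken from this commit):

import pandas as pd

# Inputs need the composition features plus an 'inverse temperature' column.
x_in = pd.DataFrame({'x_LiSalt': [0.05, 0.10],          # placeholder feature
                     'inverse temperature': [1000 / 298.15] * 2})

# Default: MAPIE jackknife+-after-bootstrap uncertainty, rescaled to ~1 sigma.
pred = predict_conductivity_from_arrhenius_objectives(
    x_in, wf, model='rf', beta_0=1000 / 298.15, log=False, min_max=False)

# Alternative: standard deviation across the bootstrapped models' predictions.
pred = predict_conductivity_from_arrhenius_objectives(
    x_in, wf, model='rf', beta_0=1000 / 298.15, log=False, min_max=True)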

def predict_conductivity_from_log_conductivity_objective(x_in: pd.DataFrame, wf: WorkFlow,
- model: str, log:bool=False, objective:str='log conductivity'
+ model: str, log:bool=False, objective:str='log conductivity', min_max:bool=False
) -> pd.DataFrame:
"""
Predict the ionic conductivity for given electrolyte compositions at a given temperature. This function
@@ -742,6 +750,14 @@ def predict_conductivity_from_log_conductivity_objective(x_in: pd.DataFrame, wf:
Default value ``"log conductivity"``
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
Returns
-------
pd.DataFrame
@@ -756,7 +772,7 @@ def predict_conductivity_from_log_conductivity_objective(x_in: pd.DataFrame, wf:

x_input = x_in.copy()
inv_temp = x_input['inverse temperature']
- pred = wf.predict(x_input, {objective: model}, min_max=True, return_std=True)
+ pred = wf.predict(x_input, {objective: model}, min_max=min_max, return_std=True)
log_cond = unumpy.uarray(pred[objective], pred[objective+'_std'])
if log == False:
cond = unumpy.pow(10,log_cond)
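The error propagation above relies on the uncertainties package; the same pattern in isolation (toy numbers, not from the diff):

from uncertainties import unumpy

# Toy log-conductivity predictions with one-sigma uncertainties.
log_cond = unumpy.uarray([-2.1, -1.8], [0.05, 0.08])

cond = unumpy.pow(10, log_cond)     # 10**x with propagated uncertainty
print(unumpy.nominal_values(cond))  # nominal conductivities
print(unumpy.std_devs(cond))        # propagated one-sigma errors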
@@ -771,7 +787,7 @@ def plot_1D(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, List[
temperatures:Union[int, float, List[int], List[float]]=20, steps:int=50, ylim:Optional[Tuple[float,float]]=None,
multiply_by_salt:bool=False, log:bool=False,
model_labels:Optional[List[str]]=None, wf_labels:Optional[List[str]]=None,
- confidence:float = 1.0, save_loc: Union[str, bool] = False, objective:str='log conductivity',
+ min_max:bool=False, confidence:float = 1.0, save_loc: Union[str, bool] = False, objective:str='log conductivity',
indicate_max:Tuple[Optional[str],Union[int, float],Union[int,float]]=(None,0.8, -1)) -> None:
"""
1-dimensional slice along one feature for models predicted conductivity / log(conductivity).
@@ -843,6 +859,14 @@ def plot_1D(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, List[
Default value ``None``
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
confidence: float
Scalar value to multiply the estimated uncertainty. By default this value is ``1.0`` which results in the
plotted errorbars showing one standard-deviation. E.g. ``confidence=1.96`` would then reflect an
@@ -908,9 +932,9 @@ def plot_1D(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, List[
for model, model_label in zip(models, model_labels):
for wf, wf_label, beta_0 in zip(wfs, wf_labels, beta_0_list):
if beta_0 == -1:
- specific_prediction = predict_conductivity_from_log_conductivity_objective(x_input, wf, model, log, objective)
+ specific_prediction = predict_conductivity_from_log_conductivity_objective(x_input, wf, model, log, objective, min_max=min_max)
else:
- specific_prediction = predict_conductivity_from_arrhenius_objectives(x_input, wf, model, beta_0, log)
+ specific_prediction = predict_conductivity_from_arrhenius_objectives(x_input, wf, model, beta_0, log, min_max=min_max)

if multiply_by_salt == True:
if log == False:
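A hypothetical `plot_1D` call with the new pass-through (the workflow, model name, and feature range below are placeholders):

plot_1D(wfs=[wf], models=['rf'],
        feature_dict={'x_LiSalt': [0.01, 0.20]},  # placeholder 1D slice
        temperatures=[20, 40], steps=50, log=False,
        min_max=False,     # error bars from MAPIE intervals
        confidence=1.96)   # ~95% bars, assuming roughly normal errors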
@@ -991,7 +1015,7 @@ def plot_1D_Sx(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, Li
beta_0_list:List[float], steps:int=50, objectives:List[str]=['S0', 'S1', 'S2'],
ylim:Optional[Tuple[float,float]]=None, multiply_by_salt:bool=False,
model_labels:Optional[List[str]]=None, wf_labels:Optional[List[str]]=None,
- confidence:float = 1.0, save_loc: Union[str, bool] = False) -> None:
+ min_max:bool=False, confidence:float = 1.0, save_loc: Union[str, bool] = False) -> None:
"""
1-dimensional slice along one feature for models predicted arrhenius objectives S0, S1 and S2.
Three plots will be rendered which show the S0, S1 and S2 predictions for the argument defined
@@ -1049,6 +1073,14 @@ def plot_1D_Sx(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, Li
Default value ``None``
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
confidence: float
Scalar value to multiply the estimated uncertainty. By default this value is ``1.0`` which results in the
plotted errorbars showing one standard-deviation. E.g. ``confidence=1.96`` would then reflect an
@@ -1087,7 +1119,7 @@ def plot_1D_Sx(wfs: List[WorkFlow], models: List[str], feature_dict:Dict[str, Li
else:
m1 = m2 = m3 = model
for wf, wf_label, beta_0 in zip(wfs, wf_labels, beta_0_list):
- pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=True, return_std=True)
+ pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=min_max, return_std=True)
if multiply_by_salt == True:
pred['S0'] = pred['S0'] + np.log10(x_input['x_LiSalt'])
specific_prediction = pd.concat([x_input, pred], axis=1)
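Note the `multiply_by_salt` adjustment works in the log domain, where multiplying the conductivity by the salt fraction becomes an addition (toy values):

import numpy as np

S0, x_LiSalt = -2.0, 0.1
assert np.isclose(10**(S0 + np.log10(x_LiSalt)), 10**S0 * x_LiSalt)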
@@ -1346,7 +1378,7 @@ def plot_2D_Sx(wf: WorkFlow, model: Union[str, List[str]], feature_dict:Dict[str
m1 = m2 = m3 = model

# Build prediction dataframe
- pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3}, min_max=True, return_std=True)
+ pred = wf.predict(x_input, {'S0': m1, 'S1': m2, 'S2': m3})
if multiply_by_salt == True:
pred['S0'] = pred['S0'] + np.log10(x_input['x_LiSalt'])
specific_prediction = pd.concat([x_input, pred], axis=1)
38 changes: 21 additions & 17 deletions LECA/fit.py
@@ -1496,7 +1496,7 @@ def predict(self,
min_max: bool
Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
- `min_max`:``True`` takes the minimum and maximum prediction of the bootstrapped models to be the range of uncertainty.
+ `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
`min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
This parameter is moot for GPR models.
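At the API level the two modes look like this (sketch; `wf` is a fitted WorkFlow, and `X_new` plus the objective/model names are placeholders):

# MAPIE jackknife+-AB intervals, now rescaled to ~one standard deviation:
pred = wf.predict(X_new, {'S0': 'rf'}, min_max=False, return_std=True)

# Standard deviation across the bootstrapped models instead:
pred = wf.predict(X_new, {'S0': 'rf'}, min_max=True, return_std=True)

pred[['S0', 'S0_std']]  # point prediction and its one-sigma estimate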
@@ -1594,8 +1594,8 @@ def predict(self,

##otherwise use MAPIE Jackknife+-ab (i.e. include conformity scores penalty)
else:
- y, std = uncert.predict(local_X, alpha=0.32)# -> 68% confidence interval == 1*sigma ASSUMING normal distribution
- std = np.abs(std[:,1]-std[:,0]).ravel()
+ y, std = uncert.predict(local_X, alpha=0.16, ensemble=True)# -> theoretical guarantee of 68% confidence interval == +-1*sigma ASSUMING normal distribution
+ std = np.abs((std[:,1]-std[:,0])/2).ravel() # 68% interval / 2 -> ~1*sigma
prediction = pd.concat([prediction, pd.Series(y, name=obj), pd.Series(std, name=obj+'_std')], axis=1)

prediction.index = X.index # cast prediction indices to match the indices of the X input
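The corrected conversion can be reproduced standalone with MAPIE's jackknife+-after-bootstrap regressor on toy data (a sketch: the MapieRegressor/Subsample usage follows MAPIE's documented API, and return shapes may vary across MAPIE versions):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from mapie.regression import MapieRegressor
from mapie.subsample import Subsample

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, (300, 1))
y = np.sin(X[:, 0]) + rng.normal(0, 0.1, 300)

mapie = MapieRegressor(RandomForestRegressor(), method="plus",
                       cv=Subsample(n_resamplings=30))
mapie.fit(X, y)

# Jackknife+ guarantees coverage >= 1 - 2*alpha, so alpha=0.16 yields the
# ~68% interval that corresponds to +-1 sigma under a normality assumption.
y_pred, y_pis = mapie.predict(X, alpha=0.16, ensemble=True)
std = np.abs((y_pis[:, 1] - y_pis[:, 0]) / 2).ravel()  # half-width ~ 1 sigma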
@@ -1908,26 +1908,21 @@ def arrh_predict(x_input, log, deviate_by_salt):
x_input = x_input[self.features]
# By default take highest scoring models
if models == None:
- pred = self.predict(x_input, min_max=True, return_std=True)
+ pred = self.predict(x_input)
else:
- pred = self.predict(x_input, dict(zip(arrhenius_objectives, models)), min_max=True, return_std=True)
+ pred = self.predict(x_input, dict(zip(arrhenius_objectives, models)))

s0, s1, s2 = arrhenius_objectives[0], arrhenius_objectives[1], arrhenius_objectives[2]
#to compare log(conductivity/x_LiSalt)
if deviate_by_salt == True:
- cond = (unumpy.uarray(pred[s0], pred[s0+'_std'])
-         - unumpy.uarray(pred[s1], pred[s1+'_std'])*(temp_offset)
-         - unumpy.uarray(pred[s2], pred[s2+'_std'])*(temp_offset)**2)
+ cond = pred[s0] - pred[s1]*(temp_offset) - pred[s2]*(temp_offset)**2
#to compare log conductivity
else:
- cond = (unumpy.uarray(pred[s0]+np.log10(x_input['x_LiSalt']), pred[s0+'_std'])
-         - unumpy.uarray(pred[s1], pred[s1+'_std'])*(temp_offset)
-         - unumpy.uarray(pred[s2], pred[s2+'_std'])*(temp_offset)**2)
+ cond = pred[s0] + np.log10(x_input['x_LiSalt']) - pred[s1]*(temp_offset) - pred[s2]*(temp_offset)**2
#to compare conductivity
if log == False:
- cond = unumpy.pow(10,cond) #this must be changed compared to the original function
- return pd.DataFrame({original_objective:unumpy.nominal_values(cond),
-                      original_objective+'_std':unumpy.std_devs(cond)})
+ cond = np.power(10,cond)
+ return pd.DataFrame({original_objective:cond})

fig, ax = plt.subplots(1, 1, figsize=(4, 4))
if custom_label==None:
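The quantity being plotted is the quadratic Arrhenius reconstruction log10(sigma) = S0 - S1*(beta - beta_0) - S2*(beta - beta_0)**2; a worked toy example (all coefficients invented for illustration):

import numpy as np

S0, S1, S2 = -2.0, 0.8, 0.05      # toy Arrhenius objectives
beta_0 = 1000 / 298.15            # toy reference inverse temperature
beta = 1000 / 313.15              # toy query inverse temperature
temp_offset = beta - beta_0

log_cond = S0 - S1 * temp_offset - S2 * temp_offset**2
cond = np.power(10, log_cond)     # the log == False branch above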
@@ -2249,7 +2244,8 @@ def optimize(self, strategy:str = 'max',
obj_fn:Optional[Callable] = None,
fixed_values:Optional[Dict[str,float]] = None,
bounds:Optional[Dict[str,Tuple[float]]] = None,
- n_restarts_optimizer:int = 100) -> pd.DataFrame:
+ n_restarts_optimizer:int = 100,
+ min_max:bool = False) -> pd.DataFrame:
"""
Optimizer to search design space for max/min objective value, bayesian expected improvement, upper/lower confidence bound and maximum uncertainty strategies. Returns optimal input feature set to query for given strategy.
@@ -2286,6 +2282,14 @@ def optimize(self, strategy:str = 'max',
Number of random points in the design space from which the acquisition function will be optimized. Higher -> more computationally expensive, but higher chance of finding global best acquisition point.
Default value ``100``.
+ min_max: bool
+     Whether to use the `min_max` method to estimate uncertainty, or MAPIE with conformity scores.
+     `min_max`:``True`` returns the standard deviations of the predictions from all the bootstrapped models for each point.
+     `min_max`:``False`` uses the MAPIE uncertainty estimation outlined in: `Mapie jackknife+-AB <https://mapie.readthedocs.io/en/latest/theoretical_description_regression.html#the-jackknife-after-bootstrap-method>`_
+     This parameter is moot for GPR models.
+     Default value ``False``.
Returns
-------
@@ -2314,7 +2318,7 @@ def optimize(self, strategy:str = 'max',
# Calculate optimal sample for each objective function
for obj in objectives:
def obj_fn(x):
- return self.predict(x, obj, X_scaled=False, min_max=True,return_std=True)
+ return self.predict(x, obj, X_scaled=False, min_max=min_max,return_std=True)

obj_optimal_sample = self.optimize(strategy=strategy,
obj_fn=obj_fn,
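A hypothetical top-level call with the new pass-through (feature names and bounds are invented for illustration):

best = wf.optimize(strategy='max',
                   fixed_values={'inverse temperature': 1000 / 298.15},
                   bounds={'x_LiSalt': (0.01, 0.20)},
                   n_restarts_optimizer=100,
                   min_max=False)  # uncertainty mode forwarded to predict()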
@@ -2425,7 +2429,7 @@ def f(x):
def _optimizer(self, f, bounds, n_restarts_optimizer:int = 100):
min_list = []
# Brute force here to avoid local minima. Generate random in-bounds x0 values to attempt minimization
- x0_array = np.array([np.random.uniform(*min_max, n_restarts_optimizer) for min_max in bounds.values()]).T
+ x0_array = np.array([np.random.uniform(*min_max_bounds, n_restarts_optimizer) for min_max_bounds in bounds.values()]).T
for x0 in x0_array:
res = opt.minimize(fun=f,
x0=x0,
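The renaming in `_optimizer` stops the (low, high) bound tuples from shadowing the new `min_max` argument. The multi-start pattern itself, sketched standalone on a toy objective:

import numpy as np
import scipy.optimize as opt

def f(x):  # toy objective with minimum at (1, -2)
    return (x[0] - 1.0)**2 + (x[1] + 2.0)**2

bounds = {'a': (-5.0, 5.0), 'b': (-5.0, 5.0)}
n_restarts = 20

# Random in-bounds starting points to dodge local minima.
x0_array = np.array([np.random.uniform(*lo_hi, n_restarts)
                     for lo_hi in bounds.values()]).T
best = min((opt.minimize(f, x0=x0, bounds=list(bounds.values()))
            for x0 in x0_array), key=lambda res: res.fun)
print(best.x)  # ~ [1.0, -2.0]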
10 changes: 5 additions & 5 deletions LECA/notebook_utils.py
@@ -63,7 +63,7 @@ def data_vis(change=None):
return output
#
#
- def interactive_plot(wf, temps, prediction_fn):
+ def interactive_plot(wf, temps, prediction_fn, confidence:float=1.0):
# Initialize plot figure
fig = plt.figure(figsize=(6,3))
ax = fig.add_subplot(111)
@@ -120,7 +120,7 @@ def update(change=None):
fixed_values = fixed_values[:-2]

prediction = prediction_fn(x_input)
- ax.errorbar(feature_range,prediction['conductivity'],yerr=1.96*(prediction['conductivity_std']),xerr=None,label=str(temp) + r'$\degree$C',alpha=1,capsize=3)
+ ax.errorbar(feature_range,prediction['conductivity'],yerr=confidence*(prediction['conductivity_std']),xerr=None,label=str(temp) + r'$\degree$C',alpha=1,capsize=3)
obj=r'$\sigma$'
dynamic_plot(obj, feature_var, fixed_values)

@@ -253,7 +253,7 @@ def on_button_clicked(_):
a = widgets.VBox([out, ui])
display(a)

- def prediction_plot_2d(wf, model_name, f): # Todo: generalize this for 2d regression problems (feature/obj::X/y)
+ def prediction_plot_2d(wf, model_name, f, min_max:bool=False, confidence:float=1.0): # Todo: generalize this for 2d regression problems (feature/obj::X/y)

# Initialize plot figure
fig = plt.figure(figsize=(6,4))
@@ -263,12 +263,12 @@ def prediction_plot_2d(wf, model_name, f): # Todo: generalize this for 2d regres
n_samples = 100
X = np.linspace(-2, 10, n_samples)
X_df = pd.DataFrame(X, columns=['x'])
- pred = wf.predict(X_df, {'y':model_name}, min_max=True, return_std=True)
+ pred = wf.predict(X_df, {'y':model_name}, min_max=min_max, return_std=True)

ax.fill_between(X, pred['y']-pred['y_std'], pred['y']+pred['y_std'],
label=r'$\pm \sigma$', color='orange', alpha=0.5, linestyle='-')

- ax.errorbar(wf.X_unscaled['x'],wf.y['y'],yerr=1.96*wf.std['y_std'],xerr=None,label='Training Data', fmt='.', c='black', alpha=0.5,capsize=1)
+ ax.errorbar(wf.X_unscaled['x'],wf.y['y'],yerr=confidence*wf.std['y_std'],xerr=None,label='Training Data', fmt='.', c='black', alpha=0.5,capsize=1)

ax.errorbar(X,f(X),label='f(x)', fmt='--', alpha=1)
ax.errorbar(X,pred['y'],label='Prediction', color='r', alpha=1)
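And the shaded band drawn above is a plain fill_between around the mean prediction; the same idea standalone (toy curve):

import numpy as np
import matplotlib.pyplot as plt

X = np.linspace(-2, 10, 100)
y, y_std = np.sin(X), np.full(100, 0.2)

fig, ax = plt.subplots()
ax.fill_between(X, y - y_std, y + y_std, color='orange', alpha=0.5,
                label=r'$\pm \sigma$')
ax.plot(X, y, color='r', label='Prediction')
ax.legend()
plt.show()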
