Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restart hyperopt #1824

Merged
merged 40 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
d9992ae
Added pkl_file attribute and extra FileTrials methods that save/load …
Cmurilochem Oct 24, 2023
062b030
Added --continue option to N3FitApp
Cmurilochem Oct 24, 2023
7dd614b
Added restart_hyperopt attribute to HyperScanner
Cmurilochem Oct 24, 2023
3050923
Added rstate attribute to FileTrials needed to store the last random.…
Cmurilochem Oct 24, 2023
632ff19
Added restart option in hyper_scan_wrapper
Cmurilochem Oct 24, 2023
4f1bdfa
Added docs to hyper_scan_wrapper
Cmurilochem Oct 24, 2023
6db6c30
Added provisory test for hyperopt restart
Cmurilochem Oct 24, 2023
46ccc57
Change in the way seeds are generated for each k-fold
Cmurilochem Oct 26, 2023
6d00ade
Fix in test_hyperopt.py
Cmurilochem Oct 26, 2023
e80895d
Fix in test_hyperopt.py
Cmurilochem Oct 30, 2023
4992ba8
Removed tmp_path argument from test
Cmurilochem Oct 30, 2023
04fd585
Updated test to make it work in CI/CD
Cmurilochem Oct 30, 2023
1490757
Updated tries.json and tries.pkl paths definition without format
Cmurilochem Oct 30, 2023
01842c8
Run isort in test_hyperopt.py
Cmurilochem Oct 30, 2023
3ea6fea
Changed keyword from continue to restart
Cmurilochem Oct 30, 2023
693fde1
Changed step_size to 10 in stopping method
Cmurilochem Oct 30, 2023
53237df
Fix test_hyperopt to use runcard in regressions folder
Cmurilochem Oct 30, 2023
209a1d2
Changed step_size to 1 in stopping method
Cmurilochem Oct 31, 2023
8280524
Removed hyperopt folder from tests; using regression folder
Cmurilochem Oct 31, 2023
d45dfea
Update doc string in hyper_scan_wrapper
Cmurilochem Oct 31, 2023
1c70229
Update doc string n3fit/src/n3fit/hyper_optimization/hyper_scan.py
Cmurilochem Oct 31, 2023
fa2a6dd
Updated np.random.default_rng to be constant
Cmurilochem Oct 31, 2023
5ef7787
Added HYPEROPT_SEED as constant at the top of the hyper_scan.py module
Cmurilochem Oct 31, 2023
3ce533e
Provisory suggestion to generate reproducible seeds for each fold in …
Cmurilochem Oct 31, 2023
aaafc80
Provisory suggestion to generate reproducible seeds for each fold in …
Cmurilochem Oct 31, 2023
c872bcc
Added new integer generator
Cmurilochem Oct 31, 2023
d528391
Added new integer generator
Cmurilochem Oct 31, 2023
5f2f6aa
different nnseed per fold
RoyStegeman Nov 1, 2023
08af2dd
Updated filetrials.py; fix in rstate docstring
Cmurilochem Nov 6, 2023
1f2445c
Updated filetrials.py; fix in rstate docstring
Cmurilochem Nov 6, 2023
9dd531c
Update n3fit/src/n3fit/hyper_optimization/filetrials.py
Cmurilochem Nov 6, 2023
5ce4c02
Updated filetrials.py; fix in from_pkl docstring
Cmurilochem Nov 6, 2023
2e2b9f9
Updated hyper_scan.py; fix in HyperScanner
Cmurilochem Nov 6, 2023
7d3c219
Fix in test_hyperopt; added output_restart and output_direct as sub-d…
Cmurilochem Nov 6, 2023
200c707
Fix in from_pkl exception
Cmurilochem Nov 6, 2023
ba84878
Removed static typing and added more descriptive docstring to rstate a…
Cmurilochem Nov 6, 2023
d720280
Added documentation on hyperopt restarts
Cmurilochem Nov 6, 2023
260716f
Fix in reference docs
Cmurilochem Nov 6, 2023
a00cc9f
Fix in reference to pickle
Cmurilochem Nov 6, 2023
ec4d507
Fix in hyperopt.rst docs
Cmurilochem Nov 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 61 additions & 6 deletions n3fit/src/n3fit/hyper_optimization/filetrials.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
"""
Custom hyperopt trial object for persistent file storage
in the form of a json file within the nnfit folder
in the form of json and pickle files within the nnfit folder
"""
import json
import logging
from validphys.hyperoptplot import HyperoptTrial
import pickle

from hyperopt import Trials, space_eval

from validphys.hyperoptplot import HyperoptTrial

log = logging.getLogger(__name__)

# Note: the plan would be to do a PR in hyperopt's main repository
Expand Down Expand Up @@ -60,10 +63,42 @@ class FileTrials(Trials):

def __init__(self, replica_path, parameters=None, **kwargs):
    """Create a ``FileTrials`` object tied to a replica output folder.

    Parameters
    ----------
    replica_path: pathlib.Path
        folder in which the ``tries.json`` and ``tries.pkl`` files are written
        (must support the ``/`` path-join operator)
    parameters: dict
        hyperparameter-space dictionary, used when space-evaluating trials
    **kwargs:
        forwarded verbatim to the parent ``Trials`` constructor

    Note: the previous duplicate assignment of ``_json_file`` via
    ``str.format`` was dead code (immediately overwritten) and was removed.
    """
    # Flag consumed when flushing trials; set to True once a new trial doc
    # is created — presumably checked by refresh(), TODO confirm.
    self._store_trial = False
    self._json_file = replica_path / "tries.json"
    # Pickle file used to persist the full FileTrials state for restarts
    self.pkl_file = replica_path / "tries.pkl"
    self._parameters = parameters
    # numpy random Generator, set externally via the `rstate` property
    self._rstate = None
    super().__init__(**kwargs)

@property
def rstate(self):
    """The stored ``numpy.random.Generator`` (or ``None`` if never set).

    Keeping the generator on the trials object makes hyperopt restarts
    reproducible in hyperparameter space: the same generator object can
    later be handed to ``hyperopt.fmin`` as its ``rstate`` argument.
    """
    return self._rstate

@rstate.setter
def rstate(self, generator):
    """Attach a ``numpy.random.Generator`` to this trials object.

    Parameters
    ----------
    generator: numpy.random.Generator
        the random generator whose state should drive the hyperopt sampling

    Example
    -------
    >>> import numpy as np
    >>> from n3fit.hyper_optimization.filetrials import FileTrials
    >>>
    >>> trials = FileTrials(replica_path_set, parameters=parameters)
    >>> trials.rstate = np.random.default_rng(42)
    """
    self._rstate = generator

def refresh(self):
"""
This is the "flushing" method which is called at the end of every trial to
Expand All @@ -78,9 +113,7 @@ def refresh(self):
local_trials = []
for idx, t in enumerate(self._dynamic_trials):
local_trials.append(t)
local_trials[idx]["misc"]["space_vals"] = space_eval_trial(
self._parameters, t
)
local_trials[idx]["misc"]["space_vals"] = space_eval_trial(self._parameters, t)

all_to_str = json.dumps(local_trials, default=str)
with open(self._json_file, "w") as f:
Expand All @@ -95,3 +128,25 @@ def new_trial_ids(self, n):
def new_trial_docs(self, tids, specs, results, miscs):
    # A genuinely new trial is being created: raise the flag so the next
    # flush knows there is fresh state to persist — presumably consumed by
    # refresh(), TODO confirm against the full class body.
    self._store_trial = True
    return super().new_trial_docs(tids, specs, results, miscs)

def to_pkl(self):
    """Persist this ``FileTrials`` instance to its pickle file.

    The object is serialized in full to ``self.pkl_file`` so that a later
    run can resume from the saved state via :meth:`from_pkl`.
    """
    with open(self.pkl_file, "wb") as handle:
        pickle.dump(self, handle)

@classmethod
def from_pkl(cls, pickle_filepath):
    """Reconstruct a ``FileTrials`` instance from a pickle file.

    When a ``tries.pkl`` file from a previous run exists, this alternate
    constructor loads it so that a hyperopt scan can be restarted from the
    saved state.

    Raises
    ------
    FileNotFoundError
        if ``pickle_filepath`` does not exist, with a message pointing at
        the expected location.
    """
    # NOTE: pickle deserialization can execute arbitrary code — only load
    # tries.pkl files produced by a trusted previous run.
    try:
        with open(pickle_filepath, "rb") as stream:
            return pickle.load(stream)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            "Failed to open 'tries.pkl' pickle file for restarting. "
            f"Please ensure it is located in: {pickle_filepath}"
        ) from err
46 changes: 30 additions & 16 deletions n3fit/src/n3fit/hyper_optimization/hyper_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,22 @@
- a function
- a dictionary of spaces of parameters
you can do so by simply modifying the wrappers to point somewhere else
(and, of course the function in the fitting action that calls the miniimization).
(and, of course the function in the fitting action that calls the minimization).
"""
import copy
import logging

import hyperopt
import numpy as np
from n3fit.backends import MetaModel, MetaLayer
import n3fit.hyper_optimization.filetrials as filetrials
import logging

from n3fit.backends import MetaLayer, MetaModel
from n3fit.hyper_optimization.filetrials import FileTrials

log = logging.getLogger(__name__)

HYPEROPT_SEED = 42


# These are just wrapper around some hyperopt's sampling expresions defined in here
# https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
# with a bit of extra documentation for the ones that are not obvious
Expand Down Expand Up @@ -88,12 +93,13 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
and performs ``max_evals`` evaluations of the hyperparametrizable function of ``model_trainer``.

A ``tries.json`` file will be saved in the ``replica_path_set`` folder with the information
of all trials.
of all trials. An additional ``tries.pkl`` file will also be generated in the same folder
that stores the previous states of `FileTrials`, this file can be used for restarting purposes.

Parameters
-----------
replica_path_set: path
folder where to create the json ``tries.json`` file
folder where to create the ``tries.json`` and ``tries.pkl`` files
model_trainer: :py:class:`n3fit.ModelTrainer.ModelTrainer`
a ``ModelTrainer`` object with the ``hyperparametrizable`` method
hyperscanner: :py:class:`n3fit.hyper_optimization.hyper_scan.HyperScanner`
Expand All @@ -109,7 +115,15 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
# Tell the trainer we are doing hpyeropt
model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys, status_ok=hyperopt.STATUS_OK)
# Generate the trials object
trials = filetrials.FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
# Initialize seed for hyperopt
trials.rstate = np.random.default_rng(HYPEROPT_SEED)

# For restarts, reset the state of `FileTrials` saved in the pickle file
if hyperscanner.restart_hyperopt:
pickle_file_to_load = f"{replica_path_set}/tries.pkl"
log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
trials = FileTrials.from_pkl(pickle_file_to_load)

# Perform the scan
best = hyperopt.fmin(
Expand All @@ -119,6 +133,8 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
max_evals=max_evals,
show_progressbar=False,
trials=trials,
rstate=trials.rstate,
trials_save_file=trials.pkl_file,
)
return hyperscanner.space_eval(best)

Expand Down Expand Up @@ -174,6 +190,10 @@ def __init__(self, parameters, sampling_dict, steps=5):
self.parameters = copy.deepcopy(parameters)
self.steps = steps

# adding extra options for restarting
restart_config = sampling_dict.get("restart")
self.restart_hyperopt = True if restart_config else False

self.hyper_keys = set([])

if "parameters" in sampling_dict:
Expand Down Expand Up @@ -256,8 +276,7 @@ def stopping(self, min_epochs=None, max_epochs=None, min_patience=None, max_pati
stopping_key = "stopping_patience"

if min_epochs is not None and max_epochs is not None:
epochs = hp_quniform(epochs_key, min_epochs, max_epochs,
step_size=1000)
epochs = hp_quniform(epochs_key, min_epochs, max_epochs, step_size=1)
self._update_param(epochs_key, epochs)

if min_patience is not None or max_patience is not None:
Expand Down Expand Up @@ -333,11 +352,7 @@ def optimizer(self, optimizers):
self._update_param(opt_key, opt_val)

def positivity(
self,
min_multiplier=None,
max_multiplier=None,
min_initial=None,
max_initial=None,
self, min_multiplier=None, max_multiplier=None, min_initial=None, max_initial=None
):
"""
Modifies the following entries of the `parameters` dictionary:
Expand Down Expand Up @@ -414,8 +429,7 @@ def architecture(
units = []
for i in range(n):
units_label = "nl{0}:-{1}/{0}".format(n, i)
units_sampler = hp_quniform(units_label, min_units, max_units,
step_size=1)
units_sampler = hp_quniform(units_label, min_units, max_units, step_size=1)
units.append(units_sampler)
# The number of nodes in the last layer are read from the runcard
units.append(output_size)
Expand Down
24 changes: 9 additions & 15 deletions n3fit/src/n3fit/model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,7 @@ def __init__(
"folds": [],
"posdatasets": [],
}
self.experimental = {
"output": [],
"expdata": [],
"ndata": 0,
"model": None,
"folds": [],
}
self.experimental = {"output": [], "expdata": [], "ndata": 0, "model": None, "folds": []}

self._fill_the_dictionaries()

Expand Down Expand Up @@ -483,11 +477,7 @@ def _model_generation(self, xinput, pdf_models, partition, partition_idx):
except ValueError:
pass

models = {
"training": training,
"validation": validation,
"experimental": experimental,
}
models = {"training": training, "validation": validation, "experimental": experimental}

return models

Expand Down Expand Up @@ -850,7 +840,7 @@ def hyperparametrizable(self, params):
# Initialize all photon classes for the different replicas:
if self.lux_params:
photons = Photon(
theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replicas,
theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replicas
)
else:
photons = None
Expand All @@ -860,7 +850,11 @@ def hyperparametrizable(self, params):
# and the seed needs to be updated accordingly
seeds = self._nn_seeds
if k > 0:
seeds = [np.random.randint(0, pow(2, 31)) for _ in seeds]
# generate random integers for each k-fold from the input `nnseeds`
# we generate new seeds to avoid the integer overflow that may
# occur when doing k*nnseeds
rngs = [np.random.default_rng(seed=seed) for seed in seeds]
seeds = [generator.integers(1, pow(2, 30)) * k for generator in rngs]

# Generate the pdf model
pdf_models = self._generate_pdf(
Expand Down Expand Up @@ -922,7 +916,7 @@ def hyperparametrizable(self, params):
for model in models.values():
model.compile(**params["optimizer"])

passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs,)
passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs)

if self.mode_hyperopt:
# If doing a hyperparameter scan we need to keep track of the loss function
Expand Down
4 changes: 4 additions & 0 deletions n3fit/src/n3fit/scripts/n3fit_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ def produce_hyperscanner(self, parameters, hyperscan_config=None, hyperopt=None)

if hyperscan_config is None or hyperopt is None:
return None
if hyperopt and self.environment.restart:
hyperscan_config.update({'restart': 'true'})
return HyperScanner(parameters, hyperscan_config)


Expand All @@ -258,6 +260,7 @@ def check_positive(value):
return ivalue

parser.add_argument("--hyperopt", help="Enable hyperopt scan", default=None, type=int)
parser.add_argument("--restart", help="Enable hyperopt restarts", action="store_true")
parser.add_argument("replica", help="MC replica number", type=check_positive)
parser.add_argument(
"-r",
Expand All @@ -283,6 +286,7 @@ def run(self):
replicas = [replica]
self.environment.replicas = NSList(replicas, nskey="replica")
self.environment.hyperopt = self.args["hyperopt"]
self.environment.restart = self.args["restart"]
super().run()
except N3FitError as e:
log.error(f"Error in n3fit:\n{e}")
Expand Down
70 changes: 69 additions & 1 deletion n3fit/src/n3fit/tests/test_hyperopt.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,81 @@
"""
Test hyperoptimization features
"""
import json
import pathlib
import shutil
import subprocess as sp

from numpy.testing import assert_approx_equal

from n3fit.hyper_optimization import rewards


def test_rewards():
    """Check each hyperopt reward function against its reference value."""
    losses = [0.0, 1.0, 2.0]
    # (reward function, expected value) reference table
    reference = [
        (rewards.average, 1.0),
        (rewards.best_worst, 2.0),
        (rewards.std, 0.816496580927726),
    ]
    for reward_fn, expected in reference:
        assert_approx_equal(reward_fn(losses), expected)


# Folder (next to this test file) holding the regression runcards
REGRESSION_FOLDER = pathlib.Path(__file__).with_name("regressions")
# Base name of the runcard; the hyperopt test uses hyper-{QUICKNAME}.yml
QUICKNAME = "quickcard"
# Executable under test
EXE = "n3fit"
# MC replica number passed on the command line
REPLICA = "1"


def load_data(info_file):
    """Parse a JSON file produced by a fit and return its contents.

    Parameters
    ----------
    info_file: str or os.PathLike
        path to the JSON file (e.g. the ``tries.json`` of a hyperopt run)
    """
    with open(info_file, "r", encoding='utf-8') as stream:
        return json.load(stream)


def test_restart_from_pickle(tmp_path):
    """Ensure that our hyperopt restart works as expected.

    Runs the same hyperopt scan twice: once split into two legs (stop after
    ``n_trials_stop`` trials, then restart from the pickle up to
    ``n_trials_total``) and once in a single uninterrupted leg, then checks
    that both produce the same trial history in ``tries.json``.
    """
    # Prepare the run
    quickcard = f"hyper-{QUICKNAME}.yml"
    quickpath = REGRESSION_FOLDER / quickcard
    # Set up some options
    n_trials_stop = 2
    n_trials_total = 4
    output_restart = tmp_path / f"run_{n_trials_stop}_trials_and_then_{n_trials_total}_trials"
    output_direct = tmp_path / f"run_{n_trials_total}_trials"

    # cp runcard to tmp folder
    shutil.copy(quickpath, tmp_path)
    # run some trials for the first time (first leg of the restarted run)
    sp.run(
        f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_stop} " f"-o {output_restart}".split(),
        cwd=tmp_path,
        check=True,
    )
    # restart and calculate more trials (second leg, resumes from tries.pkl)
    sp.run(
        f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_total} "
        f"-o {output_restart} --restart".split(),
        cwd=tmp_path,
        check=True,
    )
    # start again and calculate all trials at once (reference run)
    sp.run(
        f"{EXE} {quickpath} {REPLICA} --hyperopt {n_trials_total} " f"-o {output_direct}".split(),
        cwd=tmp_path,
        check=True,
    )

    # read up generated json files
    restart_json_path = f"{output_restart}/nnfit/replica_{REPLICA}/tries.json"
    restart_json = load_data(restart_json_path)
    direct_json_path = f"{output_direct}/nnfit/replica_{REPLICA}/tries.json"
    direct_json = load_data(direct_json_path)

    # minimum check: the generated lists of nested dictionaries have the same length
    assert len(restart_json) == len(direct_json)

    for i in range(n_trials_total):
        # check that the files share exactly the same hyperopt history
        assert restart_json[i]['misc'] == direct_json[i]['misc']
        assert restart_json[i]['state'] == direct_json[i]['state']
        assert restart_json[i]['tid'] == direct_json[i]['tid']
        assert restart_json[i]['result'] == direct_json[i]['result']