
Restart hyperopt #1824

Merged 40 commits on Nov 7, 2023

Commits
d9992ae
Added pkl_file attribute and extra FileTrials methods that save/load …
Cmurilochem Oct 24, 2023
062b030
Added --continue option to N3FitApp
Cmurilochem Oct 24, 2023
7dd614b
Added restart_hyperopt attribute to HyperScanner
Cmurilochem Oct 24, 2023
3050923
Added rstate attribute to FileTrials needed to store the last random.…
Cmurilochem Oct 24, 2023
632ff19
Added restart option in hyper_scan_wrapper
Cmurilochem Oct 24, 2023
4f1bdfa
Added docs to hyper_scan_wrapper
Cmurilochem Oct 24, 2023
6db6c30
Added provisory test for hyperopt restart
Cmurilochem Oct 24, 2023
46ccc57
Change in the way seeds are generated for each k-fold
Cmurilochem Oct 26, 2023
6d00ade
Fix in test_hyperopt.py
Cmurilochem Oct 26, 2023
e80895d
Fix in test_hyperopt.py
Cmurilochem Oct 30, 2023
4992ba8
Removed tmp_path argument from test
Cmurilochem Oct 30, 2023
04fd585
Updated test to make it work in CI/CD
Cmurilochem Oct 30, 2023
1490757
Updated tries.json and tries.pkl paths definition without format
Cmurilochem Oct 30, 2023
01842c8
Run isort in test_hyperopt.py
Cmurilochem Oct 30, 2023
3ea6fea
Changed keyword from continue to restart
Cmurilochem Oct 30, 2023
693fde1
Changed step_size to 10 in stopping method
Cmurilochem Oct 30, 2023
53237df
Fix test_hyperopt to use runcard in regressions folder
Cmurilochem Oct 30, 2023
209a1d2
Changed step_size to 1 in stopping method
Cmurilochem Oct 31, 2023
8280524
Removed hyperopt folder from tests; using regression folder
Cmurilochem Oct 31, 2023
d45dfea
Update doc string in hyper_scan_wrapper
Cmurilochem Oct 31, 2023
1c70229
Update doc string n3fit/src/n3fit/hyper_optimization/hyper_scan.py
Cmurilochem Oct 31, 2023
fa2a6dd
Updated np.random.default_rng to be constant
Cmurilochem Oct 31, 2023
5ef7787
Added HYPEROPT_SEED as constant at the top of the hyper_scan.py module
Cmurilochem Oct 31, 2023
3ce533e
Provisory suggestion to generate reproducible seeds for each fold in …
Cmurilochem Oct 31, 2023
aaafc80
Provisory suggestion to generate reproducible seeds for each fold in …
Cmurilochem Oct 31, 2023
c872bcc
Added new integer generator
Cmurilochem Oct 31, 2023
d528391
Added new integer generator
Cmurilochem Oct 31, 2023
5f2f6aa
different nnseed per fold
RoyStegeman Nov 1, 2023
08af2dd
Updated filetrials.py; fix in rstate docstring
Cmurilochem Nov 6, 2023
1f2445c
Updated filetrials.py; fix in rstate docstring
Cmurilochem Nov 6, 2023
9dd531c
Update n3fit/src/n3fit/hyper_optimization/filetrials.py
Cmurilochem Nov 6, 2023
5ce4c02
Updated filetrials.py; fix in from_pkl docstring
Cmurilochem Nov 6, 2023
2e2b9f9
Updated hyper_scan.py; fix in HyperScanner
Cmurilochem Nov 6, 2023
7d3c219
Fix in test_hyperopt; added output_restart and output_direct as sub-d…
Cmurilochem Nov 6, 2023
200c707
Fix in from_pkl exception
Cmurilochem Nov 6, 2023
ba84878
Removed static typing and added more descritive docstring to rstate a…
Cmurilochem Nov 6, 2023
d720280
Added documentation on hyperopt restarts
Cmurilochem Nov 6, 2023
260716f
Fix in reference docs
Cmurilochem Nov 6, 2023
a00cc9f
Fix in reference to pickle
Cmurilochem Nov 6, 2023
ec4d507
Fix in hyperopt.rst docs
Cmurilochem Nov 6, 2023
22 changes: 19 additions & 3 deletions doc/sphinx/source/n3fit/hyperopt.rst
@@ -320,10 +320,10 @@ Changing the hyperoptimization target
-----------------------------------

Beyond the usual :math:`\chi2`-based optimization figures above, it is possible to utilize other measures as the target for hyperoptimization.
One possibility is to use a :ref:`future test<futuretests>`-based metric for which the goal is not to get the minimum :math:`\chi2` but to get the same :math:`\chi2` (with PDF errors considered) for different datasets. The idea is that in this way we select models whose predictions are stable under variations in the dataset.
In order to obtain the PDF errors used in the figure of merit it is necessary to run multiple replicas; luckily, ``n3fit`` also provides this possibility during hyperoptimization.

Take the following modifications to a normal hyperopt runcard
(note that for convenience we take the trials directly from a previous run, so we don't have to create a new
hyperopt configuration dictionary).

@@ -345,7 +345,7 @@

kfold:
target: fit_future_tests
partitions:
- datasets:
- HERACOMBCCEP
- HERACOMBCCEM
@@ -370,3 +370,19 @@ The figure of merit will be the difference between the :math:`\chi2` of the seco

.. math::
L_{\rm hyperopt} = \chi^{2}_{(1) \rm pdferr} - \chi^{2}_{(2)}


Restarting hyperoptimization runs
---------------------------------

In addition to the ``tries.json`` files, hyperparameter scans also produce ``tries.pkl`` `pickle <https://docs.python.org/3/library/pickle.html>`_ files,
which are located in the same directory as the corresponding ``tries.json`` file.
The generated ``tries.pkl`` file stores the complete history of a previous hyperoptimization run, making it possible to resume the process using the ``hyperopt`` framework.
To do so, pass the ``--restart`` option to the ``n3fit`` command, for example:

.. code-block:: bash

n3fit runcard.yml 1 -r 10 --hyperopt 20 --restart

The command above resumes the run provided that the number of trials saved in ``test_run/nnfit/replica_1/tries.pkl`` is
less than ``20``. If ``20`` or more trials are already stored, ``n3fit`` simply terminates and displays the best results.
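
For instance, before choosing the ``--hyperopt`` budget for a restart, the number of already-saved trials can be
inspected directly. A minimal sketch (assuming the ``test_run`` output folder from the example above):

.. code-block:: python

    from n3fit.hyper_optimization.filetrials import FileTrials

    # load the trial history written by the previous hyperopt run
    trials = FileTrials.from_pkl("test_run/nnfit/replica_1/tries.pkl")

    # the restart proceeds only if this count is below the --hyperopt budget
    print(f"Trials saved so far: {len(trials.trials)}")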
67 changes: 61 additions & 6 deletions n3fit/src/n3fit/hyper_optimization/filetrials.py
@@ -1,12 +1,15 @@
"""
Custom hyperopt trial object for persistent file storage
in the form of json and pickle files within the nnfit folder
"""
import json
import logging
import pickle

from hyperopt import Trials, space_eval

from validphys.hyperoptplot import HyperoptTrial

log = logging.getLogger(__name__)

# Note: the plan would be to do a PR in hyperopt's main repository
@@ -60,10 +63,42 @@ class FileTrials(Trials):

def __init__(self, replica_path, parameters=None, **kwargs):
self._store_trial = False
self._json_file = "{0}/tries.json".format(replica_path)
self._json_file = replica_path / "tries.json"
self.pkl_file = replica_path / "tries.pkl"
self._parameters = parameters
self._rstate = None
super().__init__(**kwargs)

@property
def rstate(self):
"""
Returns the rstate attribute.

Notes:
:func:`rstate` stores a `numpy.random.Generator` which is important to make
hyperopt restarts reproducible in the hyperparameter space. It can
be passed later as the `rstate` parameter of `hyperopt.fmin`.
"""
return self._rstate

@rstate.setter
def rstate(self, random_generator):
"""
Sets the rstate attribute.

# Arguments:
- `random_generator`: `numpy.random.Generator`

Example
--------
>>> import numpy as np
>>> from n3fit.hyper_optimization.filetrials import FileTrials
>>>
>>> trials = FileTrials(replica_path_set, parameters=parameters)
>>> trials.rstate = np.random.default_rng(42)
"""
self._rstate = random_generator

def refresh(self):
"""
This is the "flushing" method which is called at the end of every trial to
@@ -78,9 +113,7 @@ def refresh(self):
local_trials = []
for idx, t in enumerate(self._dynamic_trials):
local_trials.append(t)
local_trials[idx]["misc"]["space_vals"] = space_eval_trial(
self._parameters, t
)
local_trials[idx]["misc"]["space_vals"] = space_eval_trial(self._parameters, t)

all_to_str = json.dumps(local_trials, default=str)
with open(self._json_file, "w") as f:
@@ -95,3 +128,25 @@ def new_trial_ids(self, n):
def new_trial_docs(self, tids, specs, results, miscs):
self._store_trial = True
return super().new_trial_docs(tids, specs, results, miscs)

def to_pkl(self):
"""Dump `FileTrials` object into a pickle file."""
with open(self.pkl_file, "wb") as file:
pickle.dump(self, file)

@classmethod
def from_pkl(cls, pickle_filepath):
"""
Load and return an instance of `FileTrials` from a pickle file.

If a pickle file from a previous run is present, this method can be used
to instantiate an initial `FileTrials` object from which to restart.
"""
try:
with open(pickle_filepath, "rb") as file:
return pickle.load(file)
except FileNotFoundError as err:
raise FileNotFoundError(
"Failed to open 'tries.pkl' pickle file for restarting. "
f"Please ensure it is located in: {pickle_filepath}"
) from err
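
Taken together, ``to_pkl`` and ``from_pkl`` give a simple round trip that also preserves the random generator, which is
what makes restarts reproducible. A minimal sketch under assumed inputs (``replica_path`` is a hypothetical output folder):

.. code-block:: python

    from pathlib import Path

    import numpy as np

    from n3fit.hyper_optimization.filetrials import FileTrials

    replica_path = Path("output/nnfit/replica_1")  # hypothetical folder
    trials = FileTrials(replica_path, parameters=None)
    trials.rstate = np.random.default_rng(42)

    # checkpoint: writes replica_path / "tries.pkl"
    trials.to_pkl()

    # a later run restores both the trial history and the generator state
    restored = FileTrials.from_pkl(replica_path / "tries.pkl")
    assert isinstance(restored.rstate, np.random.Generator)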
46 changes: 30 additions & 16 deletions n3fit/src/n3fit/hyper_optimization/hyper_scan.py
@@ -10,17 +10,22 @@
- a function
- a dictionary of spaces of parameters
you can do so by simply modifying the wrappers to point somewhere else
(and, of course, the function in the fitting action that calls the minimization).
"""
import copy
import logging

import hyperopt
import numpy as np

from n3fit.backends import MetaLayer, MetaModel
from n3fit.hyper_optimization.filetrials import FileTrials

log = logging.getLogger(__name__)

HYPEROPT_SEED = 42


# These are just wrappers around some of hyperopt's sampling expressions defined here
# https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
# with a bit of extra documentation for the ones that are not obvious
@@ -88,12 +93,13 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
and performs ``max_evals`` evaluations of the hyperparametrizable function of ``model_trainer``.

A ``tries.json`` file will be saved in the ``replica_path_set`` folder with the information
of all trials. An additional ``tries.pkl`` file, which stores the previous states of `FileTrials`, will also be
generated in the same folder; it can be used to restart a run.

Parameters
-----------
replica_path_set: path
folder where to create the ``tries.json`` and ``tries.pkl`` files
model_trainer: :py:class:`n3fit.ModelTrainer.ModelTrainer`
a ``ModelTrainer`` object with the ``hyperparametrizable`` method
hyperscanner: :py:class:`n3fit.hyper_optimization.hyper_scan.HyperScanner`
@@ -109,7 +115,15 @@
# Tell the trainer we are doing hyperopt
model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys, status_ok=hyperopt.STATUS_OK)
# Generate the trials object
trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
# Initialize seed for hyperopt
trials.rstate = np.random.default_rng(HYPEROPT_SEED)

# For restarts, reset the state of `FileTrials` saved in the pickle file
if hyperscanner.restart_hyperopt:
pickle_file_to_load = f"{replica_path_set}/tries.pkl"
log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
trials = FileTrials.from_pkl(pickle_file_to_load)

# Perform the scan
best = hyperopt.fmin(
@@ -119,6 +133,8 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
max_evals=max_evals,
show_progressbar=False,
trials=trials,
rstate=trials.rstate,
trials_save_file=trials.pkl_file,
)
return hyperscanner.space_eval(best)

@@ -174,6 +190,10 @@ def __init__(self, parameters, sampling_dict, steps=5):
self.parameters = copy.deepcopy(parameters)
self.steps = steps

# adding extra options for restarting
restart_config = sampling_dict.get("restart")
self.restart_hyperopt = True if restart_config else False

self.hyper_keys = set([])

if "parameters" in sampling_dict:
Expand Down Expand Up @@ -256,8 +276,7 @@ def stopping(self, min_epochs=None, max_epochs=None, min_patience=None, max_pati
stopping_key = "stopping_patience"

if min_epochs is not None and max_epochs is not None:
epochs = hp_quniform(epochs_key, min_epochs, max_epochs, step_size=1)
self._update_param(epochs_key, epochs)

if min_patience is not None or max_patience is not None:
@@ -333,11 +352,7 @@ def optimizer(self, optimizers):
self._update_param(opt_key, opt_val)

def positivity(
self, min_multiplier=None, max_multiplier=None, min_initial=None, max_initial=None
):
"""
Modifies the following entries of the `parameters` dictionary:
@@ -414,8 +429,7 @@ def architecture(
units = []
for i in range(n):
units_label = "nl{0}:-{1}/{0}".format(n, i)
units_sampler = hp_quniform(units_label, min_units, max_units, step_size=1)
units.append(units_sampler)
# The number of nodes in the last layer are read from the runcard
units.append(output_size)
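For reference, the two arguments added to the ``hyperopt.fmin`` call above are standard ``hyperopt`` features:
``rstate`` fixes how the search space is sampled and ``trials_save_file`` checkpoints the trials after every
evaluation (and reloads them if the file already exists). A self-contained toy sketch, independent of ``n3fit``:

.. code-block:: python

    import numpy as np
    from hyperopt import Trials, fmin, hp, tpe

    best = fmin(
        fn=lambda x: (x - 0.3) ** 2,        # toy objective to minimize
        space=hp.uniform("x", -1.0, 1.0),
        algo=tpe.suggest,
        max_evals=20,
        trials=Trials(),
        rstate=np.random.default_rng(42),   # reproducible sampling
        trials_save_file="tries.pkl",       # checkpoint after each trial
        show_progressbar=False,
    )
    # rerunning with max_evals=30 resumes from the saved 20 trials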
24 changes: 9 additions & 15 deletions n3fit/src/n3fit/model_trainer.py
@@ -222,13 +222,7 @@ def __init__(
"folds": [],
"posdatasets": [],
}
self.experimental = {"output": [], "expdata": [], "ndata": 0, "model": None, "folds": []}

self._fill_the_dictionaries()

@@ -483,11 +477,7 @@ def _model_generation(self, xinput, pdf_models, partition, partition_idx):
except ValueError:
pass

models = {"training": training, "validation": validation, "experimental": experimental}

return models

@@ -850,7 +840,7 @@ def hyperparametrizable(self, params):
# Initialize all photon classes for the different replicas:
if self.lux_params:
photons = Photon(
theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replicas
)
else:
photons = None
@@ -860,7 +850,11 @@
# and the seed needs to be updated accordingly
seeds = self._nn_seeds
if k > 0:
# generate random integers for each k-fold from the input `nnseeds`
# we generate new seeds to avoid the integer overflow that may
# occur when doing k*nnseeds
rngs = [np.random.default_rng(seed=seed) for seed in seeds]
seeds = [generator.integers(1, pow(2, 30)) * k for generator in rngs]

# Generate the pdf model
pdf_models = self._generate_pdf(
@@ -922,7 +916,7 @@ def hyperparametrizable(self, params):
for model in models.values():
model.compile(**params["optimizer"])

passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs,)
passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs)

if self.mode_hyperopt:
# If doing a hyperparameter scan we need to keep track of the loss function
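The per-fold seeding above is deterministic: each call rebuilds the same generators from the replica seeds, so fold
``k`` always receives the same values, including across restarts. A small sketch of the idea (the ``nnseeds`` values
are made up):

.. code-block:: python

    import numpy as np

    nnseeds = [7, 11]  # hypothetical replica seeds from the runcard
    for k in range(1, 4):
        # same construction as in hyperparametrizable(): fresh generators per fold
        rngs = [np.random.default_rng(seed=seed) for seed in nnseeds]
        seeds = [generator.integers(1, pow(2, 30)) * k for generator in rngs]
        print(k, seeds)  # identical output on every run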
4 changes: 4 additions & 0 deletions n3fit/src/n3fit/scripts/n3fit_exec.py
@@ -232,6 +232,8 @@ def produce_hyperscanner(self, parameters, hyperscan_config=None, hyperopt=None)

if hyperscan_config is None or hyperopt is None:
return None
if hyperopt and self.environment.restart:
hyperscan_config.update({'restart': 'true'})
return HyperScanner(parameters, hyperscan_config)


@@ -258,6 +260,7 @@ def check_positive(value):
return ivalue

parser.add_argument("--hyperopt", help="Enable hyperopt scan", default=None, type=int)
parser.add_argument("--restart", help="Enable hyperopt restarts", action="store_true")
parser.add_argument("replica", help="MC replica number", type=check_positive)
parser.add_argument(
"-r",
@@ -283,6 +286,7 @@ def run(self):
replicas = [replica]
self.environment.replicas = NSList(replicas, nskey="replica")
self.environment.hyperopt = self.args["hyperopt"]
self.environment.restart = self.args["restart"]
super().run()
except N3FitError as e:
log.error(f"Error in n3fit:\n{e}")