
Restart hyperopt #1824

Merged: 40 commits, Nov 7, 2023

Commits
d9992ae
Added pkl_file attribute and extra FileTrials methods that save/load …
Cmurilochem Oct 24, 2023
062b030
Added --continue option to N3FitApp
Cmurilochem Oct 24, 2023
7dd614b
Added restart_hyperopt attribute to HyperScanner
Cmurilochem Oct 24, 2023
3050923
Added rstate attribute to FileTrials needed to store the last random.…
Cmurilochem Oct 24, 2023
632ff19
Added restart option in hyper_scan_wrapper
Cmurilochem Oct 24, 2023
4f1bdfa
Added docs to hyper_scan_wrapper
Cmurilochem Oct 24, 2023
6db6c30
Added provisory test for hyperopt restart
Cmurilochem Oct 24, 2023
46ccc57
Change in the way seeds are generated for each k-fold
Cmurilochem Oct 26, 2023
6d00ade
Fix in test_hyperopt.py
Cmurilochem Oct 26, 2023
e80895d
Fix in test_hyperopt.py
Cmurilochem Oct 30, 2023
4992ba8
Removed tmp_path argument from test
Cmurilochem Oct 30, 2023
04fd585
Updated test to make it work in CI/CD
Cmurilochem Oct 30, 2023
1490757
Updated tries.json and tries.pkl paths definition without format
Cmurilochem Oct 30, 2023
01842c8
Run isort in test_hyperopt.py
Cmurilochem Oct 30, 2023
3ea6fea
Changed keyword from continue to restart
Cmurilochem Oct 30, 2023
693fde1
Changed step_size to 10 in stopping method
Cmurilochem Oct 30, 2023
53237df
Fix test_hyperopt to use runcard in regressions folder
Cmurilochem Oct 30, 2023
209a1d2
Changed step_size to 1 in stopping method
Cmurilochem Oct 31, 2023
8280524
Removed hyperopt folder from tests; using regression folder
Cmurilochem Oct 31, 2023
d45dfea
Update doc string in hyper_scan_wrapper
Cmurilochem Oct 31, 2023
1c70229
Update doc string n3fit/src/n3fit/hyper_optimization/hyper_scan.py
Cmurilochem Oct 31, 2023
fa2a6dd
Updated np.random.default_rng to be constant
Cmurilochem Oct 31, 2023
5ef7787
Added HYPEROPT_SEED as constant at the top of the hyper_scan.py module
Cmurilochem Oct 31, 2023
3ce533e
Provisory suggestion to generate reproducible seeds for each fold in …
Cmurilochem Oct 31, 2023
aaafc80
Provisory suggestion to generate reproducible seeds for each fold in …
Cmurilochem Oct 31, 2023
c872bcc
Added new integer generator
Cmurilochem Oct 31, 2023
d528391
Added new integer generator
Cmurilochem Oct 31, 2023
5f2f6aa
different nnseed per fold
RoyStegeman Nov 1, 2023
08af2dd
Updated filetrials.py; fix in rstate docstring
Cmurilochem Nov 6, 2023
1f2445c
Updated filetrials.py; fix in rstate docstring
Cmurilochem Nov 6, 2023
9dd531c
Update n3fit/src/n3fit/hyper_optimization/filetrials.py
Cmurilochem Nov 6, 2023
5ce4c02
Updated filetrials.py; fix in from_pkl docstring
Cmurilochem Nov 6, 2023
2e2b9f9
Updated hyper_scan.py; fix in HyperScanner
Cmurilochem Nov 6, 2023
7d3c219
Fix in test_hyperopt; added output_restart and output_direct as sub-d…
Cmurilochem Nov 6, 2023
200c707
Fix in from_pkl exception
Cmurilochem Nov 6, 2023
ba84878
Removed static typing and added more descriptive docstring to rstate a…
Cmurilochem Nov 6, 2023
d720280
Added documentation on hyperopt restarts
Cmurilochem Nov 6, 2023
260716f
Fix in reference docs
Cmurilochem Nov 6, 2023
a00cc9f
Fix in reference to pickle
Cmurilochem Nov 6, 2023
ec4d507
Fix in hyperopt.rst docs
Cmurilochem Nov 6, 2023
51 changes: 47 additions & 4 deletions n3fit/src/n3fit/hyper_optimization/filetrials.py
@@ -1,9 +1,11 @@
"""
Custom hyperopt trial object for persistent file storage
in the form of a json file within the nnfit folder
in the form of json and pickle files within the nnfit folder
"""
import pickle
import json
import logging
from numpy.random._generator import Generator
from validphys.hyperoptplot import HyperoptTrial
from hyperopt import Trials, space_eval

@@ -61,9 +63,34 @@ class FileTrials(Trials):
def __init__(self, replica_path, parameters=None, **kwargs):
self._store_trial = False
self._json_file = "{0}/tries.json".format(replica_path)
self.pkl_file = "{0}/tries.pkl".format(replica_path)
self._parameters = parameters
self._rstate = None
super().__init__(**kwargs)

@property
def rstate(self) -> Generator:
"""
Returns the rstate attribute.

Notes:
Rstate stores a `numpy.random.Generator` which is important to make
hyperopt restarts reproducible in the hyperparameter space. It can
be passed later as the `rstate` parameter of `hyperopt.fmin`.
"""
return self._rstate

@rstate.setter
def rstate(self, random_generator: Generator) -> None:
"""
Sets the rstate attribute.

Example:
>>> trials = FileTrials(replica_path_set, parameters=parameters)
>>> trials.rstate = np.random.default_rng(42)
"""
self._rstate = random_generator

def refresh(self):
"""
This is the "flushing" method which is called at the end of every trial to
@@ -78,9 +105,7 @@ def refresh(self):
local_trials = []
for idx, t in enumerate(self._dynamic_trials):
local_trials.append(t)
local_trials[idx]["misc"]["space_vals"] = space_eval_trial(
self._parameters, t
)
local_trials[idx]["misc"]["space_vals"] = space_eval_trial(self._parameters, t)

all_to_str = json.dumps(local_trials, default=str)
with open(self._json_file, "w") as f:
@@ -95,3 +120,21 @@ def new_trial_ids(self, n):
def new_trial_docs(self, tids, specs, results, miscs):
self._store_trial = True
return super().new_trial_docs(tids, specs, results, miscs)

def to_pkl(self):
"""Dump `FileTrials` object into a pickle file."""
with open(self.pkl_file, "wb") as file:
pickle.dump(self, file)

@classmethod
def from_pkl(cls, pickle_filepath):
"""Load and return an instance of `FileTrials` from a pickle file.

If a pickle file from a previous run is present, this method can be used
to instantiate an initial `FileTrials` object to restart.
"""
try:
with open(pickle_filepath, "rb") as file:
return pickle.load(file)
except FileNotFoundError as err:
log.error("Failed to open pickle file: %s", err)
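Taken together, to_pkl and from_pkl give FileTrials a simple persistence round-trip. A minimal sketch of how they compose, assuming an installed n3fit and a hypothetical replica folder:

import numpy as np
from n3fit.hyper_optimization.filetrials import FileTrials

replica_path_set = "output/nnfit/replica_1"  # hypothetical folder

# fresh run: create the trials object and fix its random state
trials = FileTrials(replica_path_set, parameters={})
trials.rstate = np.random.default_rng(42)
trials.to_pkl()  # writes <replica_path_set>/tries.pkl

# restart: rebuild the object, stored rstate included, from the pickle
restored = FileTrials.from_pkl(f"{replica_path_set}/tries.pkl")
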
35 changes: 26 additions & 9 deletions n3fit/src/n3fit/hyper_optimization/hyper_scan.py
@@ -10,17 +10,18 @@
- a function
- a dictionary of spaces of parameters
you can do so by simply modifying the wrappers to point somewhere else
(and, of course the function in the fitting action that calls the miniimization).
(and, of course the function in the fitting action that calls the minimization).
"""
import copy
import hyperopt
import numpy as np
from n3fit.backends import MetaModel, MetaLayer
import n3fit.hyper_optimization.filetrials as filetrials
from n3fit.hyper_optimization.filetrials import FileTrials
import logging

log = logging.getLogger(__name__)


# These are just wrappers around some of hyperopt's sampling expressions defined here
# https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions
# with a bit of extra documentation for the ones that are not obvious
@@ -88,12 +89,13 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
and performs ``max_evals`` evaluations of the hyperparametrizable function of ``model_trainer``.

A ``tries.json`` file will be saved in the ``replica_path_set`` folder with the information
of all trials.
of all trials. An additional ``tries.pkl`` file will also be generated in the same folder
that stores the previous states of `FileTrials`. This file can be used for restarting purposes.

Parameters
-----------
replica_path_set: path
folder where to create the json ``tries.json`` file
folder where to create the json ``tries.json`` and pickle ``tries.pkl`` files
model_trainer: :py:class:`n3fit.ModelTrainer.ModelTrainer`
a ``ModelTrainer`` object with the ``hyperparametrizable`` method
hyperscanner: :py:class:`n3fit.hyper_optimization.hyper_scan.HyperScanner`
@@ -109,7 +111,15 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
# Tell the trainer we are doing hyperopt
model_trainer.set_hyperopt(True, keys=hyperscanner.hyper_keys, status_ok=hyperopt.STATUS_OK)
# Generate the trials object
trials = filetrials.FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
trials = FileTrials(replica_path_set, parameters=hyperscanner.as_dict())
# Initialize seed for hyperopt
trials.rstate = np.random.default_rng(42)

# For restarts, reset the state of `FileTrials` saved in the pickle file
if hyperscanner.restart_hyperopt:
pickle_file_to_load = f"{replica_path_set}/tries.pkl"
log.info("Restarting hyperopt run using the pickle file %s", pickle_file_to_load)
trials = FileTrials.from_pkl(pickle_file_to_load)

# Perform the scan
best = hyperopt.fmin(
@@ -119,6 +129,8 @@ def hyper_scan_wrapper(replica_path_set, model_trainer, hyperscanner, max_evals=
max_evals=max_evals,
show_progressbar=False,
trials=trials,
rstate=trials.rstate,
trials_save_file=trials.pkl_file,
)
return hyperscanner.space_eval(best)
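
The trio of trials, rstate and trials_save_file passed to hyperopt.fmin above is what makes the restart work: the trials object carries the history, the fixed Generator keeps the sampler reproducible, and hyperopt re-pickles the state after every evaluation. A self-contained sketch of the same mechanics with plain hyperopt (objective, file name and counts are illustrative; a hyperopt version whose rstate accepts a numpy Generator is assumed):

import os
import pickle

import numpy as np
from hyperopt import Trials, fmin, hp, tpe

save_file = "tries.pkl"  # illustrative path

# resume from a previously saved state if one exists, else start fresh
if os.path.exists(save_file):
    with open(save_file, "rb") as f:
        trials = pickle.load(f)
else:
    trials = Trials()

best = fmin(
    fn=lambda x: (x - 0.3) ** 2,
    space=hp.uniform("x", -1, 1),
    algo=tpe.suggest,
    max_evals=len(trials.trials) + 10,  # max_evals counts the reloaded trials too
    trials=trials,
    rstate=np.random.default_rng(42),
    trials_save_file=save_file,  # state is re-pickled after each evaluation
    show_progressbar=False,
)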

@@ -174,6 +186,13 @@ def __init__(self, parameters, sampling_dict, steps=5):
self.parameters = copy.deepcopy(parameters)
self.steps = steps

# adding extra options for restarting
restart_config = sampling_dict.get("restart")
if restart_config:
self.restart_hyperopt = True
else:
self.restart_hyperopt = False
Show resolved Hide resolved

self.hyper_keys = set([])

if "parameters" in sampling_dict:
@@ -256,8 +275,7 @@ def stopping(self, min_epochs=None, max_epochs=None, min_patience=None, max_pati
stopping_key = "stopping_patience"

if min_epochs is not None and max_epochs is not None:
epochs = hp_quniform(epochs_key, min_epochs, max_epochs,
step_size=1000)
epochs = hp_quniform(epochs_key, min_epochs, max_epochs, step_size=1000)
self._update_param(epochs_key, epochs)

if min_patience is not None or max_patience is not None:
@@ -414,8 +432,7 @@ def architecture(
units = []
for i in range(n):
units_label = "nl{0}:-{1}/{0}".format(n, i)
units_sampler = hp_quniform(units_label, min_units, max_units,
step_size=1)
units_sampler = hp_quniform(units_label, min_units, max_units, step_size=1)
units.append(units_sampler)
# The number of nodes in the last layer are read from the runcard
units.append(output_size)
14 changes: 11 additions & 3 deletions n3fit/src/n3fit/model_trainer.py
@@ -850,7 +850,9 @@ def hyperparametrizable(self, params):
# Initialize all photon classes for the different replicas:
if self.lux_params:
photons = Photon(
theoryid=self.theoryid, lux_params=self.lux_params, replicas=self.replicas,
theoryid=self.theoryid,
lux_params=self.lux_params,
replicas=self.replicas,
)
else:
photons = None
@@ -860,7 +862,9 @@
# and the seed needs to be updated accordingly
seeds = self._nn_seeds
if k > 0:
seeds = [np.random.randint(0, pow(2, 31)) for _ in seeds]
# seeds = [np.random.randint(0, pow(2, 31)) for _ in seeds]
# generate seeds for each k-fold from the input `nnseeds`
seeds = [seed * k for seed in seeds]
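
The commented-out line drew fresh integers from the global numpy state, which a restarted run could not reproduce; the replacement derives every fold's seeds deterministically from the runcard nnseed. A tiny sketch of the rule, with illustrative values:

nnseeds = [4044040809]  # per-replica seeds from the runcard

for k in range(4):  # one iteration per k-fold
    # fold 0 keeps the input seeds; later folds scale them by the fold index
    seeds = nnseeds if k == 0 else [seed * k for seed in nnseeds]
    print(k, seeds)
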

# Generate the pdf model
pdf_models = self._generate_pdf(
@@ -922,7 +926,11 @@ def hyperparametrizable(self, params):
for model in models.values():
model.compile(**params["optimizer"])

passed = self._train_and_fit(models["training"], stopping_object, epochs=epochs,)
passed = self._train_and_fit(
models["training"],
stopping_object,
epochs=epochs,
)

if self.mode_hyperopt:
# If doing a hyperparameter scan we need to keep track of the loss function
4 changes: 4 additions & 0 deletions n3fit/src/n3fit/scripts/n3fit_exec.py
@@ -232,6 +232,8 @@ def produce_hyperscanner(self, parameters, hyperscan_config=None, hyperopt=None)

if hyperscan_config is None or hyperopt is None:
return None
if hyperopt and self.environment.restart:
hyperscan_config.update({'restart': 'true'})
return HyperScanner(parameters, hyperscan_config)


@@ -258,6 +260,7 @@ def check_positive(value):
return ivalue

parser.add_argument("--hyperopt", help="Enable hyperopt scan", default=None, type=int)
parser.add_argument("--continue", help="Enable hyperopt restarts", action="store_true")
parser.add_argument("replica", help="MC replica number", type=check_positive)
parser.add_argument(
"-r",
@@ -283,6 +286,7 @@ def run(self):
replicas = [replica]
self.environment.replicas = NSList(replicas, nskey="replica")
self.environment.hyperopt = self.args["hyperopt"]
self.environment.restart = self.args["continue"]
super().run()
except N3FitError as e:
log.error(f"Error in n3fit:\n{e}")
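With the flag wired into the environment and picked up by produce_hyperscanner, a scan can be resumed from the command line. A hypothetical pair of invocations (runcard name and trial counts are illustrative):

# initial scan: run hyperopt trials for replica 1
n3fit hyper-quickcard.yml 1 --hyperopt 100

# resume from the saved tries.pkl, reusing the stored sampler state
n3fit hyper-quickcard.yml 1 --hyperopt 200 --continue
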
136 changes: 136 additions & 0 deletions n3fit/src/n3fit/tests/hyperopt/hyper-quickcard.yml
@@ -0,0 +1,136 @@
#
# Configuration file for n3fit hyperopt tests
#

############################################################
description: n3fit hyperopt test

############################################################
# frac: training fraction
# ewk: apply ewk k-factors
# sys: systematics treatment (see systypes)
dataset_inputs:
- { dataset: NMC, frac: 0.5 }
- { dataset: SLACP, frac: 0.5}
- { dataset: CMSZDIFF12, frac: 0.5, cfac: ['QCD'], sys: 10 }


############################################################
datacuts:
t0pdfset: NNPDF31_nnlo_as_0118 # PDF set to generate t0 covmat
q2min: 3.49 # Q2 minimum
w2min: 12.5 # W2 minimum
combocuts: NNPDF31 # NNPDF3.0 final kin. cuts
jetptcut_tev: 0 # jet pt cut for tevatron
jetptcut_lhc: 0 # jet pt cut for lhc
wptcut_lhc: 30.0 # Minimum pT for W pT diff distributions
jetycut_tev: 1e30 # jet rap. cut for tevatron
jetycut_lhc: 1e30 # jet rap. cut for lhc
dymasscut_min: 0 # dy inv.mass. min cut
dymasscut_max: 1e30 # dy inv.mass. max cut
jetcfactcut: 1e30 # jet cfact. cut

############################################################
theory:
theoryid: 399 # database id

hyperscan_config:
architecture:
n_layers: [2]
min_units: 10
max_units: 45
optimizer:
- optimizer_name: 'Nadam'
learning_rate:
sampling: log
min: 1e-4
max: 1e-2
clipnorm:
sampling: log
min: 1e-7
max: 1e-4
- optimizer_name: 'Adam'
learning_rate:
sampling: log
min: 1e-4
max: 1e-2
clipnorm:
sampling: log
min: 1e-7
max: 1e-4

kfold:
target: average
penalties:
- saturation
- patience
- integrability
threshold: 500.0
partitions:
- datasets:
- NMC
- datasets:
- SLACP
- CMSZDIFF12

############################################################
trvlseed: 2182363835
nnseed: 4044040809
mcseed: 1977428487
genrep: false # true = generate MC replicas, false = use real data

# The baseline parameters, best ones as taken from table 3.3 in the 2021 paper
# or equivalently the NNPDF40_nnlo_as_01180_1000 runcard
# These are used for parameters that are not hyperoptimized over
parameters: # This defines the parameter dictionary that is passed to the Model Trainer
nodes_per_layer: [25, 20, 8]
activation_per_layer: [tanh, tanh, linear]
initializer: glorot_normal
optimizer:
clipnorm: 6.073e-6
learning_rate: 2.621e-3
optimizer_name: Nadam
epochs: 10
positivity:
initial: 184.8
multiplier:
integrability:
initial: 10
multiplier:
stopping_patience: 0.1
layer_type: dense
dropout: 0.0
threshold_chi2: 3.5

fitting:
savepseudodata: false
# NN23(QED) = sng=0,g=1,v=2,t3=3,ds=4,sp=5,sm=6,(pht=7)
# EVOL(QED) = sng=0,g=1,v=2,v3=3,v8=4,t3=5,t8=6,(pht=7)
# EVOLS(QED)= sng=0,g=1,v=2,v8=4,t3=4,t8=5,ds=6,(pht=7)
# FLVR(QED) = g=0, u=1, ubar=2, d=3, dbar=4, s=5, sbar=6, (pht=7)
fitbasis: EVOL # EVOL (7), EVOLQED (8), etc.
basis:
- {fl: sng, trainable: false, smallx: [1.086, 1.121], largex: [1.459, 3.165]}
- {fl: g, trainable: false, smallx: [0.7832, 1.059], largex: [2.734, 8.173]}
- {fl: v, trainable: false, smallx: [0.5596, 0.7524], largex: [1.534, 3.82]}
- {fl: v3, trainable: false, smallx: [-0.0291, 0.5957], largex: [1.708, 3.706]}
- {fl: v8, trainable: false, smallx: [0.6072, 0.819], largex: [1.519, 3.691]}
- {fl: t3, trainable: false, smallx: [-0.4322, 1.087], largex: [1.718, 3.742]}
- {fl: t8, trainable: false, smallx: [0.6132, 0.958], largex: [1.531, 3.506]}
- {fl: t15, trainable: false, smallx: [1.057, 1.142], largex: [1.469, 3.23]}

############################################################
positivity:
posdatasets:
- { dataset: POSF2U, maxlambda: 1e6 } # Positivity Lagrange Multiplier
- { dataset: POSDYS , maxlambda: 1e5 }

integrability:
integdatasets:
- {dataset: INTEGXT8, maxlambda: 1e2}

############################################################
debug: true
maxcores: 28
parallel_models: true
same_trvl_per_replica: true