Merge pull request #1259 from cta-observatory/rf_event_weighting

Computation of pointing-dependent weights for MC events
cta-observatory · Jun 21, 2024 · 609a977 · 609a977
2 parents 6ea82fe + 5974bb8
commit 609a977
Show file tree

Hide file tree

Showing 4 changed files with 120 additions and 7 deletions.
diff --git a/lstchain/data/lstchain_standard_config.json b/lstchain/data/lstchain_standard_config.json
@@ -65,6 +65,10 @@
     "fraction_cleaning_intensity": 0.03
   },
 
+  "random_forest_weight_settings": {
+    "pointing_wise_weights": true
+  },
+
   "random_forest_energy_regressor_args": {
     "max_depth": 30,
     "min_samples_leaf": 10,

diff --git a/lstchain/reco/dl1_to_dl2.py b/lstchain/reco/dl1_to_dl2.py
@@ -79,7 +79,8 @@ def train_energy(train, custom_config=None):
 
     reg = model(**energy_regression_args)
     reg.fit(train[features],
-            train['log_mc_energy'])
+            train['log_mc_energy'],
+            sample_weight=train['weight'])
 
     logger.info("Model {} trained!".format(model))
     return reg
@@ -118,7 +119,7 @@ def train_disp_vector(train, custom_config=None, predict_features=None):
     reg = model(**disp_regression_args)
     x = train[features]
     y = np.transpose([train[f] for f in predict_features])
-    reg.fit(x, y)
+    reg.fit(x, y, sample_weight=train['weight'])
 
     logger.info("Model {} trained!".format(model))
 
@@ -152,7 +153,7 @@ def train_disp_norm(train, custom_config=None, predict_feature='disp_norm'):
     reg = model(**disp_regression_args)
     x = train[features]
     y = np.transpose(train[predict_feature])
-    reg.fit(x, y)
+    reg.fit(x, y, sample_weight=train['weight'])
 
     logger.info("Model {} trained!".format(model))
 
@@ -186,7 +187,7 @@ def train_disp_sign(train, custom_config=None, predict_feature='disp_sign'):
     clf = model(**classification_args)
     x = train[features]
     y = np.transpose(train[predict_feature])
-    clf.fit(x, y)
+    clf.fit(x, y, sample_weight=train['weight'])
 
     logger.info("Model {} trained!".format(model))
 
@@ -223,15 +224,17 @@ def train_reco(train, custom_config=None):
 
     reg_energy = model(**energy_regression_args)
     reg_energy.fit(train[energy_features],
-                   train['log_mc_energy'])
+                   train['log_mc_energy'], sample_weight = train['weight']
+    )
 
     logger.info("Random Forest trained!")
     logger.info("Given disp_features: ", disp_features)
     logger.info("Training Random Forest Regressor for disp_norm Reconstruction...")
 
     reg_disp = RandomForestRegressor(**disp_regression_args)
     reg_disp.fit(train[disp_features],
-                 train['disp_norm'])
+                 train['disp_norm'],
+                 sample_weight=train['weight'])
 
     logger.info("Random Forest trained!")
     logger.info("Done!")
@@ -267,7 +270,8 @@ def train_sep(train, custom_config=None):
     clf = model(**classification_args)
 
     clf.fit(train[features],
-            train['mc_type'])
+            train['mc_type'],
+            sample_weight=train['weight'])
     logger.info("Random Forest trained!")
     return clf
 
@@ -345,6 +349,12 @@ def build_models(filegammas, fileprotons,
     # Adding a filter on mc_type just for training
     events_filters['mc_type'] = [-9000, np.inf]
 
+    pointing_wise_weights = False
+    if 'random_forest_weight_settings' in config:
+        if config['random_forest_weight_settings']['pointing_wise_weights']:
+            logger.info("Pointing-wise event weighting activated")
+            pointing_wise_weights = True
+
     df_gamma = pd.read_hdf(filegammas, key=dl1_params_lstcam_key)
     df_proton = pd.read_hdf(fileprotons, key=dl1_params_lstcam_key)
 
@@ -431,6 +441,14 @@ def build_models(filegammas, fileprotons,
     src_r_max = config['train_gamma_src_r_deg'][1]
     df_gamma = utils.apply_src_r_cut(df_gamma, src_r_min, src_r_max)
 
+    if pointing_wise_weights:
+        # Give same total weight to all events in every pointing node, by
+        # applying event-wise weights which depend on the statistics per node
+        # The weight is written to a new column of df_gamma, called 'weight'
+        _, _ = utils.compute_rf_event_weights(df_gamma)
+    else:
+        df_gamma['weight'] = np.ones(len(df_gamma))
+
     # Train regressors for energy and disp_norm reconstruction, only with gammas
     n_gamma_regressors = config["n_training_events"]["gamma_regressors"]
     if n_gamma_regressors not in [1.0, None]:
@@ -491,6 +509,14 @@ def build_models(filegammas, fileprotons,
                 "The requested number of protons for the classifier training is not valid."
             ) from e
 
+    if pointing_wise_weights:
+        # Give same total weight to all events in every pointing node, by
+        # applying event-wise weights which depend on the statistics per node
+        # The weight is written to a new column of df_gamma, called 'weight'
+        _, _ = utils.compute_rf_event_weights(df_proton)
+    else:
+        df_proton['weight'] = np.ones(len(df_proton))
+
     test = pd.concat([testg, df_proton], ignore_index=True)
 
     temp_reg_energy = train_energy(train, custom_config=config)

diff --git a/lstchain/reco/tests/test_utils.py b/lstchain/reco/tests/test_utils.py
@@ -237,3 +237,14 @@ def test_apply_src_r_cut(simulated_dl1_file):
     src_r_max = srcdep_config['train_gamma_src_r_deg'][1]
     params = apply_src_r_cut(params, src_r_min, src_r_max)
     assert (params.event_id.values == np.arange(100, 110, 1)).all()
+
+def test_compute_rf_event_weights():
+    from lstchain.reco.utils import compute_rf_event_weights
+
+    alt_tel = np.append(3*[0.41109], 6*[0.53508])
+    az_tel = np.append(3*[1.32615], 6*[1.38195])
+
+    df = pd.DataFrame({"alt_tel": alt_tel, "az_tel": az_tel})
+    _, _ = compute_rf_event_weights(df)
+    np.testing.assert_array_equal(df['weight'],
+                                  np.append(3*[4.5/3], 6*[4.5/6]))
diff --git a/lstchain/reco/utils.py b/lstchain/reco/utils.py
@@ -32,6 +32,7 @@
     "cartesian_to_polar",
     "clip_alt",
     "compute_alpha",
+    "compute_rf_event_weights",
     "compute_theta2",
     "expand_tel_list",
     "extract_source_position",
@@ -862,3 +863,74 @@ def get_events_in_GTI(events, CatB_cal_table):
     gti_mask = gti[events['calibration_id']]
 
     return events[gti_mask]
+
+def compute_rf_event_weights(events):
+    """
+    Compute event-wise weights. Can be used for correcting for the different
+    statistics present in each pointing node of the MC training sample,
+    to avoid "jumps" in the performance of the random forests
+
+    Parameters
+    ----------
+    events : `~pd.DataFrame`
+        DL1 parameters dataframe. The table is modified in place by the
+    addition of a column called 'weight' (unless it exists already). The
+    column contains an event-wise weight to be used in the Random Forest
+    training, to give each of the telescope pointings in the training sample
+    the same overall weight in the training.
+
+    Returns
+    -------
+
+    pointings: ndarray of shape (number_of_pointings, 2) Alt Az (in radians)
+        for each of the identified telescope pointings in the input MC sample
+
+    weight_per_pointing: ndarray [number_of_pointings] weight for each of the
+    identified pointings. The weight is equal to the mean number of training
+    events per node divided by the number of training events in the specific
+    node. If used as sample_weight in scikit-learn, each node will have the
+    same total weight in the training of the Random Forests
+
+    """
+
+    if 'weight' in events.columns:
+        log.warning("compute_rf_event_weights: DL2 table already contains")
+        log.warning("a column called weight. It will NOT be overwritten!")
+        return None, None
+
+    # Add a 'weight' column to the input table
+    weights = np.array(np.ones(len(events)))
+
+    # First identify existing telescope pointings in the sample:
+    # Convert to degrees and round to avoid potential
+    # rounding issues:
+    alt = np.round(events['alt_tel'], decimals=5)
+    az = np.round(events['az_tel'], decimals=5)
+    pointings = np.unique([alt, az], axis=1).T
+
+    # Find the total statistics in each of the pointings:
+    stats = []
+    for tel_alt_az in pointings:
+        mask = (np.isclose(tel_alt_az[0], events['alt_tel'],
+                           atol=1e-5, rtol=0) &
+                np.isclose(tel_alt_az[1], events['az_tel'],
+                           atol=1e-5, rtol=0))
+        stats.append(mask.sum())
+
+    stats = np.array(stats)
+    weight_per_pointing = stats.mean() / stats
+
+    # Now set the weights.
+    # Weight in a given node will be mean_events_per_node / n_events_in_node
+
+    for ipointing, tel_alt_az in enumerate(pointings):
+        mask = (np.isclose(tel_alt_az[0], events['alt_tel'],
+                           atol=1e-5, rtol=0) &
+                np.isclose(tel_alt_az[1], events['az_tel'],
+                           atol=1e-5, rtol=0))
+        weights[mask] = weight_per_pointing[ipointing]
+
+    events['weight'] = weights
+
+    # return the identified pointings and weights set (for checks)
+    return pointings, weight_per_pointing