diff --git a/lstchain/data/lstchain_standard_config.json b/lstchain/data/lstchain_standard_config.json index 2a5bcb4ef9..d63c9e0f8e 100644 --- a/lstchain/data/lstchain_standard_config.json +++ b/lstchain/data/lstchain_standard_config.json @@ -65,6 +65,10 @@ "fraction_cleaning_intensity": 0.03 }, + "random_forest_weight_settings": { + "pointing_wise_weights": true + }, + "random_forest_energy_regressor_args": { "max_depth": 30, "min_samples_leaf": 10, diff --git a/lstchain/reco/dl1_to_dl2.py b/lstchain/reco/dl1_to_dl2.py index 464ed8698e..dd4c76195d 100644 --- a/lstchain/reco/dl1_to_dl2.py +++ b/lstchain/reco/dl1_to_dl2.py @@ -79,7 +79,8 @@ def train_energy(train, custom_config=None): reg = model(**energy_regression_args) reg.fit(train[features], - train['log_mc_energy']) + train['log_mc_energy'], + sample_weight=train['weight']) logger.info("Model {} trained!".format(model)) return reg @@ -118,7 +119,7 @@ def train_disp_vector(train, custom_config=None, predict_features=None): reg = model(**disp_regression_args) x = train[features] y = np.transpose([train[f] for f in predict_features]) - reg.fit(x, y) + reg.fit(x, y, sample_weight=train['weight']) logger.info("Model {} trained!".format(model)) @@ -152,7 +153,7 @@ def train_disp_norm(train, custom_config=None, predict_feature='disp_norm'): reg = model(**disp_regression_args) x = train[features] y = np.transpose(train[predict_feature]) - reg.fit(x, y) + reg.fit(x, y, sample_weight=train['weight']) logger.info("Model {} trained!".format(model)) @@ -186,7 +187,7 @@ def train_disp_sign(train, custom_config=None, predict_feature='disp_sign'): clf = model(**classification_args) x = train[features] y = np.transpose(train[predict_feature]) - clf.fit(x, y) + clf.fit(x, y, sample_weight=train['weight']) logger.info("Model {} trained!".format(model)) @@ -223,7 +224,8 @@ def train_reco(train, custom_config=None): reg_energy = model(**energy_regression_args) reg_energy.fit(train[energy_features], - train['log_mc_energy']) + train['log_mc_energy'], sample_weight = train['weight'] + ) logger.info("Random Forest trained!") logger.info("Given disp_features: ", disp_features) @@ -231,7 +233,8 @@ def train_reco(train, custom_config=None): reg_disp = RandomForestRegressor(**disp_regression_args) reg_disp.fit(train[disp_features], - train['disp_norm']) + train['disp_norm'], + sample_weight=train['weight']) logger.info("Random Forest trained!") logger.info("Done!") @@ -267,7 +270,8 @@ def train_sep(train, custom_config=None): clf = model(**classification_args) clf.fit(train[features], - train['mc_type']) + train['mc_type'], + sample_weight=train['weight']) logger.info("Random Forest trained!") return clf @@ -345,6 +349,12 @@ def build_models(filegammas, fileprotons, # Adding a filter on mc_type just for training events_filters['mc_type'] = [-9000, np.inf] + pointing_wise_weights = False + if 'random_forest_weight_settings' in config: + if config['random_forest_weight_settings']['pointing_wise_weights']: + logger.info("Pointing-wise event weighting activated") + pointing_wise_weights = True + df_gamma = pd.read_hdf(filegammas, key=dl1_params_lstcam_key) df_proton = pd.read_hdf(fileprotons, key=dl1_params_lstcam_key) @@ -431,6 +441,14 @@ def build_models(filegammas, fileprotons, src_r_max = config['train_gamma_src_r_deg'][1] df_gamma = utils.apply_src_r_cut(df_gamma, src_r_min, src_r_max) + if pointing_wise_weights: + # Give same total weight to all events in every pointing node, by + # applying event-wise weights which depend on the statistics per node + # The weight is written to a new column of df_gamma, called 'weight' + _, _ = utils.compute_rf_event_weights(df_gamma) + else: + df_gamma['weight'] = np.ones(len(df_gamma)) + # Train regressors for energy and disp_norm reconstruction, only with gammas n_gamma_regressors = config["n_training_events"]["gamma_regressors"] if n_gamma_regressors not in [1.0, None]: @@ -491,6 +509,14 @@ def build_models(filegammas, fileprotons, "The requested number of protons for the classifier training is not valid." ) from e + if pointing_wise_weights: + # Give same total weight to all events in every pointing node, by + # applying event-wise weights which depend on the statistics per node + # The weight is written to a new column of df_gamma, called 'weight' + _, _ = utils.compute_rf_event_weights(df_proton) + else: + df_proton['weight'] = np.ones(len(df_proton)) + test = pd.concat([testg, df_proton], ignore_index=True) temp_reg_energy = train_energy(train, custom_config=config) diff --git a/lstchain/reco/tests/test_utils.py b/lstchain/reco/tests/test_utils.py index 5981186fdb..aaf1b66a59 100644 --- a/lstchain/reco/tests/test_utils.py +++ b/lstchain/reco/tests/test_utils.py @@ -237,3 +237,14 @@ def test_apply_src_r_cut(simulated_dl1_file): src_r_max = srcdep_config['train_gamma_src_r_deg'][1] params = apply_src_r_cut(params, src_r_min, src_r_max) assert (params.event_id.values == np.arange(100, 110, 1)).all() + +def test_compute_rf_event_weights(): + from lstchain.reco.utils import compute_rf_event_weights + + alt_tel = np.append(3*[0.41109], 6*[0.53508]) + az_tel = np.append(3*[1.32615], 6*[1.38195]) + + df = pd.DataFrame({"alt_tel": alt_tel, "az_tel": az_tel}) + _, _ = compute_rf_event_weights(df) + np.testing.assert_array_equal(df['weight'], + np.append(3*[4.5/3], 6*[4.5/6])) diff --git a/lstchain/reco/utils.py b/lstchain/reco/utils.py index 262976b243..b39b82cfe1 100644 --- a/lstchain/reco/utils.py +++ b/lstchain/reco/utils.py @@ -32,6 +32,7 @@ "cartesian_to_polar", "clip_alt", "compute_alpha", + "compute_rf_event_weights", "compute_theta2", "expand_tel_list", "extract_source_position", @@ -862,3 +863,74 @@ def get_events_in_GTI(events, CatB_cal_table): gti_mask = gti[events['calibration_id']] return events[gti_mask] + +def compute_rf_event_weights(events): + """ + Compute event-wise weights. Can be used for correcting for the different + statistics present in each pointing node of the MC training sample, + to avoid "jumps" in the performance of the random forests + + Parameters + ---------- + events : `~pd.DataFrame` + DL1 parameters dataframe. The table is modified in place by the + addition of a column called 'weight' (unless it exists already). The + column contains an event-wise weight to be used in the Random Forest + training, to give each of the telescope pointings in the training sample + the same overall weight in the training. + + Returns + ------- + + pointings: ndarray of shape (number_of_pointings, 2) Alt Az (in radians) + for each of the identified telescope pointings in the input MC sample + + weight_per_pointing: ndarray [number_of_pointings] weight for each of the + identified pointings. The weight is equal to the mean number of training + events per node divided by the number of training events in the specific + node. If used as sample_weight in scikit-learn, each node will have the + same total weight in the training of the Random Forests + + """ + + if 'weight' in events.columns: + log.warning("compute_rf_event_weights: DL2 table already contains") + log.warning("a column called weight. It will NOT be overwritten!") + return None, None + + # Add a 'weight' column to the input table + weights = np.array(np.ones(len(events))) + + # First identify existing telescope pointings in the sample: + # Convert to degrees and round to avoid potential + # rounding issues: + alt = np.round(events['alt_tel'], decimals=5) + az = np.round(events['az_tel'], decimals=5) + pointings = np.unique([alt, az], axis=1).T + + # Find the total statistics in each of the pointings: + stats = [] + for tel_alt_az in pointings: + mask = (np.isclose(tel_alt_az[0], events['alt_tel'], + atol=1e-5, rtol=0) & + np.isclose(tel_alt_az[1], events['az_tel'], + atol=1e-5, rtol=0)) + stats.append(mask.sum()) + + stats = np.array(stats) + weight_per_pointing = stats.mean() / stats + + # Now set the weights. + # Weight in a given node will be mean_events_per_node / n_events_in_node + + for ipointing, tel_alt_az in enumerate(pointings): + mask = (np.isclose(tel_alt_az[0], events['alt_tel'], + atol=1e-5, rtol=0) & + np.isclose(tel_alt_az[1], events['az_tel'], + atol=1e-5, rtol=0)) + weights[mask] = weight_per_pointing[ipointing] + + events['weight'] = weights + + # return the identified pointings and weights set (for checks) + return pointings, weight_per_pointing