Start re-implementing CMS datasets
Radonirinaunimi committed Nov 28, 2023
1 parent e64d092 commit 5a2cdc0
Showing 15 changed files with 2,275 additions and 1 deletion.
12 changes: 12 additions & 0 deletions buildmaster/CMS_WP_7TEV_ELECTRON/data.yaml
@@ -0,0 +1,12 @@
data_central:
- 0.10200000000000001
- 0.111
- 0.116
- 0.123
- 0.133
- 0.136
- 0.156
- 0.166
- 0.197
- 0.224
- 0.21
334 changes: 334 additions & 0 deletions buildmaster/CMS_WP_7TEV_ELECTRON/filter.py
@@ -0,0 +1,334 @@
import pathlib

import numpy as np
import pandas as pd
import yaml

from validphys.commondata_utils import covmat_to_artunc


MZ_VALUE = 91.1876 # GeV
MW_VALUE = 80.398 # GeV
SQRT_S = 7_000.0 # GeV
NORM_FACTOR = 1e-3  # overall factor correcting the units

# Tables to read and the indices of the values to extract [[W+/W-]]
TABLES = {1: [0]}  # {table_id: [indexes]}
NB_POINTS = [11]  # number of datapoints per table [N_W+/-]

# Map boson string to the corresponding mass value
MAP_BOSON = {"Z": MZ_VALUE, "W": MW_VALUE}
MAP_TABLE = {1: "W"}
MAP_TAB_UNC = {1: 2}  # map data table to the corresponding uncertainty table


def load_yaml(table_id: int, version: int = 1) -> dict:
"""Load the HEP data table in yaml format.
Parameters
----------
    table_id: int
        table ID number
    version: int
        version of the hepdata table, default=1
Returns
-------
dict:
        dictionary containing the table contents
"""
filename = f"HEPData-ins1118047-v{version}-Table_{table_id}"
table = pathlib.Path(f"./rawdata/{filename}.yaml")

return yaml.safe_load(table.read_text())
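
# For illustration: with the defaults above, `load_yaml(table_id=1)` is
# expected to read `./rawdata/HEPData-ins1118047-v1-Table_1.yaml`.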


def read_metadata() -> tuple[int, int, list]:
"""Read the version and list of tables from metadata.
Returns
-------
tuple(int, list):
data version and list of hepdata tables
"""
metadata = pathlib.Path("./metadata.yaml")
content = yaml.safe_load(metadata.read_text())

version = content["hepdata"]["version"]
nb_datapoints = sum(content["implemented_observables"][0]["npoints"])
tables = content["implemented_observables"][0]["tables"]

return version, nb_datapoints, tables


def get_kinematics(hepdata: dict, bin_index: list, boson: str = "W") -> list:
"""Read the version and list of tables from metadata.
Parameters
----------
hepdata: dict
dictionary containing all data info
bin_index: list
list of Non-empty bin index
Returns
-------
tuple(int, list):
data version and list of hepdata tables
"""
rapbins = hepdata["independent_variables"][0]["values"]

kinematics = []
for bins in bin_index:
ymin = float(rapbins[bins]["low"])
ymax = float(rapbins[bins]["high"])
kin_value = {
"eta": {"min": ymin, "mid": 0.5 * (ymin + ymax), "max": ymax},
"M2": {"min": None, "mid": MAP_BOSON[boson] ** 2, "max": None},
"sqrt_s": {"min": None, "mid": SQRT_S, "max": None},
}
kinematics.append(kin_value)

return kinematics


def get_data_values(hepdata: dict, bin_index: list, indx: int = 0) -> list:
"""Extract the central values from the HepData yaml file.
Parameters
----------
hepdata: dict
dictionary containing all data info
bin_index: list
        list of non-empty bin indices
    indx: int
index from which to read the central value, default=0
Returns
-------
list:
        list of the central values for the non-empty bins
"""
central = hepdata["dependent_variables"][indx]["values"]

return [NORM_FACTOR * central[i]["value"] for i in bin_index]


def get_errors(hepdata: dict, bin_index: list, indx: int = 0) -> dict:
"""Extract the error values from the HepData yaml file.
Parameters
----------
hepdata: dict
dictionary containing all data info
bin_index: list
        list of non-empty bin indices
indx: int
index from which the errors will be read
Returns
-------
    dict:
        dictionary containing the lists of statistical and
        correlated systematic uncertainties
"""
errors = hepdata["dependent_variables"][indx]["values"]

stat, sys_corr = [], []
for idx in bin_index:
stat.append(NORM_FACTOR * errors[idx]["errors"][0]["symerror"])
sys_corr.append(NORM_FACTOR * errors[idx]["errors"][1]["symerror"])

return {"stat": stat, "sys_corr": sys_corr}


def read_corrmatrix(nb_datapoints: int, unc_hepdata: int) -> list:
"""Read the covariance matrix from a given HepData table.
Parameters
----------
nb_datapoints: int
        total number of datapoints (currently unused)
unc_hepdata: int
HepData table ID corresponding to the covmat
Returns
-------
    list: (nb_datapoints**2,)
        entries of the corr/cov-mat as a flattened list
"""
del nb_datapoints
unc_loaded = load_yaml(table_id=unc_hepdata)
df = pd.DataFrame(unc_loaded['dependent_variables'][0]['values'])

return df['value'].values.tolist()


def generate_artificial_unc(**kwargs) -> np.ndarray:
"""A wrapper around `covmat_to_artunc` to return an array.
Returns
-------
np.ndarray:
        a square matrix of artificial uncertainties
"""
artunc = covmat_to_artunc(**kwargs)
return np.array(artunc)
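
# A minimal sanity-check sketch (assuming `covmat_to_artunc` performs a
# standard decomposition of the covariance matrix): the artificial
# uncertainties are expected to reproduce the covariance matrix, e.g.
#
#     art = generate_artificial_unc(
#         ndata=2, covmat_list=[1.0, 0.5, 0.5, 2.0], no_of_norm_mat=0
#     )
#     assert np.allclose(art @ art.T, [[1.0, 0.5], [0.5, 2.0]])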


def concatenate_dicts(multidict: list[dict]) -> dict:
"""Given a list of dictionaries combined the values.
Parameters
----------
multidict: list[dict]
list of dictionaries with the same keys
Returns
-------
    dict:
        dictionary in which the values are concatenated key by key
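    Example
    -------
    A minimal illustration of how values are merged:
    >>> concatenate_dicts([{"stat": [0.1]}, {"stat": [0.2]}])
    {'stat': [0.1, 0.2]}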
"""
new_dict = {}
for key in multidict[0].keys():
new_dict[key] = []
for element in multidict:
new_dict[key] += element[key]

return new_dict


def format_uncertainties(uncs: dict, artunc: np.ndarray) -> list:
"""Format the uncertainties to be dumped into the yaml file.
Parameters
----------
uncs: dict
        dictionary containing the various sources of uncertainties
    artunc: np.ndarray
array of the artificial systematic uncertainties
Returns
-------
list:
        list of dictionaries whose elements are the various errors
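        For one datapoint the resulting entry has the form (sketch):
        {"sys_corr_1": ..., ..., "sys_corr_N": ..., "stat": ...}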
"""
combined_errors = []

artunc = artunc.tolist()
for idat in range(len(artunc)):
error_value = {}
for jdat, value in enumerate(artunc[idat]):
error_value[f"sys_corr_{jdat + 1}"] = value * NORM_FACTOR

error_value["stat"] = uncs["stat"][idat]
combined_errors.append(error_value)

return combined_errors


def dump_commondata(
kinematics: list, data: list, errors: list, nb_syscorr: int
) -> None:
"""Function that generates and writes the commondata files.
Parameters
----------
kinematics: list
list containing the kinematic values
data: list
list containing the central values
errors: list
list containing the different errors
nb_syscorr: int
        number of correlated systematic sources
"""
error_definition = {
f"sys_corr_{i + 1}": {
"description": "Correlated systematic uncertainties",
"treatment": "ADD",
"type": "CORR",
}
for i in range(nb_syscorr)
}

error_definition["stat"] = {
"description": "Uncorrelated statistical uncertainties",
"treatment": "ADD",
"type": "UNCORR",
}

with open("data.yaml", "w") as file:
yaml.dump({"data_central": data}, file, sort_keys=False)

with open("kinematics.yaml", "w") as file:
yaml.dump({"bins": kinematics}, file, sort_keys=False)

with open("uncertainties.yaml", "w") as file:
yaml.dump(
{"definitions": error_definition, "bins": errors},
file,
sort_keys=False,
)


def main_filter() -> None:
"""Main driver of the filter that produces commmondata.
There are two main different sources of uncertainties.
1. Statistical uncertainties: ADD, UNCORR
2. Correlated Systematic uncertainties: ADD, CORR:
constructed from the covariance matrix provided with the
measurements
"""
version, _, _ = read_metadata()
nbpoints = sum(NB_POINTS)

nbp_idx = 0
comb_kins, comb_data = [], []
comb_covmat, combined_errors = [], []
for tabid in TABLES.keys(): # Loop over tables
for idx in TABLES[tabid]: # Loop over Bosons [W+/-]
            bin_index = list(range(NB_POINTS[nbp_idx]))
yaml_content = load_yaml(table_id=tabid, version=version)

# Extract the kinematic, data, and uncertainties
kinematics = get_kinematics(
yaml_content, bin_index, MAP_TABLE[tabid]
)
data_central = get_data_values(yaml_content, bin_index, indx=idx)
uncertainties = get_errors(yaml_content, bin_index, indx=idx)

# Collect all the results from different tables
comb_kins += kinematics
comb_data += data_central
comb_covmat += read_corrmatrix(
NB_POINTS[nbp_idx], MAP_TAB_UNC[tabid]
)
combined_errors.append(uncertainties)

nbp_idx += 1

errors_combined = concatenate_dicts(combined_errors)
    # Compute the artificial systematics from the covariance matrix
artunc = generate_artificial_unc(
ndata=nbpoints, covmat_list=comb_covmat, no_of_norm_mat=0
)
errors = format_uncertainties(errors_combined, artunc)

# Generate all the necessary files
dump_commondata(comb_kins, comb_data, errors, artunc.shape[-1])

return


if __name__ == "__main__":
main_filter()