-
Couldn't load subscription status.
- Fork 133
feat: Make preprocess rigorous with IOFactory and pydantic dataclasses #1398
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
28ca07e
08b6215
4fd87d7
f975f09
f147ac1
a927854
c073b51
60db77b
922602a
f519f0b
6485b57
bc93fb3
e599197
2fc17d3
e253bab
5fae595
8043c2f
48274b8
e22120c
41624e6
84d8c4e
41f435c
8951e59
fb801fa
7b36503
2fda9c6
2e08862
fbacdbb
be5d7e1
6e4f686
895049a
a69591f
4628c9f
7a38300
7ef4792
d095991
dc4e94e
96956a0
166f7bb
9750ea9
1ce5cd2
0711253
18abc3d
4358e44
364b9db
d43760a
123bcc9
efc8bfb
b8ba107
fa98172
6336ab9
3315090
164b06d
e97f987
69c236d
2e7fe6f
6893e06
1fabc7f
d05d888
08ad634
6d8a76d
7956a4e
bc7567f
456862c
eff15fc
23ee888
0ec2939
d622338
b34f4c7
ad2daa5
03b8d18
b0a2bee
ab0f265
80ffea4
71ba0ff
76ed528
c4393d2
7c21cff
c52e944
d3b2576
9c0162d
8466b0f
2f2beaf
3f09b66
ebeddfd
05d4e1b
5ed4532
0bad523
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,19 +4,15 @@ | |
| from collections.abc import Hashable | ||
| from typing import Any, Callable, Union | ||
|
|
||
| import awkward | ||
| import dask.base | ||
| import dask_awkward | ||
|
|
||
| from coffea.dataset_tools.preprocess import ( | ||
| from coffea.dataset_tools.filespec import ( | ||
| DatasetSpec, | ||
| DatasetSpecOptional, | ||
| FilesetSpec, | ||
| FilesetSpecOptional, | ||
| ) | ||
| from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory | ||
| from coffea.processor import ProcessorABC | ||
| from coffea.util import decompress_form | ||
|
|
||
| DaskOutputBaseType = Union[ | ||
| dask.base.DaskMethodsMixin, | ||
|
|
@@ -34,7 +30,7 @@ | |
|
|
||
| def apply_to_dataset( | ||
| data_manipulation: ProcessorABC | GenericHEPAnalysis, | ||
| dataset: DatasetSpec | DatasetSpecOptional, | ||
| dataset: DatasetSpec | dict, | ||
| schemaclass: BaseSchema = NanoAODSchema, | ||
| metadata: dict[Hashable, Any] = {}, | ||
| uproot_options: dict[str, Any] = {}, | ||
|
|
@@ -46,7 +42,7 @@ def apply_to_dataset( | |
| ---------- | ||
| data_manipulation : ProcessorABC or GenericHEPAnalysis | ||
| The user analysis code to run on the input dataset | ||
| dataset: DatasetSpec | DatasetSpecOptional | ||
| dataset: DatasetSpec | dict | ||
| The data to be acted upon by the data manipulation passed in. | ||
| schemaclass: BaseSchema, default NanoAODSchema | ||
| The nanoevents schema to interpret the input dataset with. | ||
|
|
@@ -62,12 +58,12 @@ def apply_to_dataset( | |
| report : dask_awkward.Array, optional | ||
| The file access report for running the analysis on the input dataset. Needs to be computed in simultaneously with the analysis to be accurate. | ||
| """ | ||
| maybe_base_form = dataset.get("form", None) | ||
| if maybe_base_form is not None: | ||
| maybe_base_form = awkward.forms.from_json(decompress_form(maybe_base_form)) | ||
| files = dataset["files"] | ||
| if isinstance(dataset, dict): | ||
| dataset = DatasetSpec.model_validate(dataset) | ||
| maybe_base_form = dataset.form | ||
| files = dataset.files | ||
| events = NanoEventsFactory.from_root( | ||
| files, | ||
| files.model_dump(), | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| metadata=metadata, | ||
| schemaclass=schemaclass, | ||
| known_base_form=maybe_base_form, | ||
|
|
@@ -94,7 +90,7 @@ def apply_to_dataset( | |
|
|
||
| def apply_to_fileset( | ||
| data_manipulation: ProcessorABC | GenericHEPAnalysis, | ||
| fileset: FilesetSpec | FilesetSpecOptional, | ||
| fileset: FilesetSpec | dict, | ||
| schemaclass: BaseSchema = NanoAODSchema, | ||
| uproot_options: dict[str, Any] = {}, | ||
| ) -> dict[str, DaskOutputType] | tuple[dict[str, DaskOutputType], dask_awkward.Array]: | ||
|
|
@@ -105,7 +101,7 @@ def apply_to_fileset( | |
| ---------- | ||
| data_manipulation : ProcessorABC or GenericHEPAnalysis | ||
| The user analysis code to run on the input dataset | ||
| fileset: FilesetSpec | FilesetSpecOptional | ||
| fileset: FilesetSpec | ||
| The data to be acted upon by the data manipulation passed in. Metadata within the fileset should be dask-serializable. | ||
| schemaclass: BaseSchema, default NanoAODSchema | ||
| The nanoevents schema to interpret the input dataset with. | ||
|
|
@@ -119,10 +115,12 @@ def apply_to_fileset( | |
| report : dask_awkward.Array, optional | ||
| The file access report for running the analysis on the input dataset. Needs to be computed in simultaneously with the analysis to be accurate. | ||
| """ | ||
| if isinstance(fileset, dict): | ||
| fileset = FilesetSpec.model_validate(fileset) | ||
| out = {} | ||
| report = {} | ||
| for name, dataset in fileset.items(): | ||
| metadata = copy.deepcopy(dataset.get("metadata", {})) | ||
| metadata = copy.deepcopy(dataset.metadata) | ||
| if metadata is None: | ||
| metadata = {} | ||
| metadata.setdefault("dataset", name) | ||
NJManganelli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.