88 commits
28ca07e
Expand explicit dataclasses as precursor for re-enabling parquet and …
Aug 14, 2025
08b6215
Expose dataclasses
Aug 14, 2025
4fd87d7
Add IOFactory for dataclass to dict conversions, DatasetJoinableSpec …
Aug 14, 2025
f975f09
Expose IOFactory and DatasetJoinableSpec
Aug 14, 2025
f147ac1
Thread DatasetSpecs through preprocess function for handling, so user…
Aug 14, 2025
a927854
Update src/coffea/dataset_tools/__init__.py
NJManganelli Aug 14, 2025
c073b51
Update src/coffea/dataset_tools/__init__.py
NJManganelli Aug 14, 2025
60db77b
Update src/coffea/dataset_tools/preprocess.py
NJManganelli Aug 14, 2025
922602a
Error when filespec_to_dict receives an incompatible UprootFileSpec o…
Aug 14, 2025
f519f0b
Rearrange dataclass inheritance and expand iofactory method for conve…
Aug 15, 2025
6485b57
Add tests for dataclasses and iofactory, modified from suggested set …
Aug 15, 2025
bc93fb3
precommit fixes
Aug 15, 2025
e599197
Ensure simple filename:object_path dictionaries are converted to Coff…
Aug 16, 2025
2fc17d3
Add preprocess test to iofactory suite
Aug 16, 2025
e253bab
appease pre-commit
Aug 16, 2025
5fae595
Rewrite dataclasses as pydantic models with mostly self-validation an…
Aug 18, 2025
8043c2f
Partial tests of pydantic dataclasses, will need streamlining of IOFa…
Aug 18, 2025
48274b8
Remove dataclass-based types, import from coffea/dataset_tools/filesp…
Aug 18, 2025
e22120c
Updae imports to filespec.py for pydantic classes
Aug 18, 2025
41624e6
Appease pre-commit
Aug 18, 2025
84d8c4e
fixup filespec import
Aug 18, 2025
41f435c
fix pylance errors and remove unnecessary Optional models
Aug 18, 2025
8951e59
Remove iofactory.py
Aug 18, 2025
fb801fa
Remove DatasetSpecOptional from apply_processor.py
Aug 18, 2025
7b36503
Start cleaning up filespec.py: remove DatasetSpecOptional, tests embe…
Aug 18, 2025
2fda9c6
Let copilot convert some comprehensive tests from the original draft …
Aug 18, 2025
2e08862
Add json serialization roundtripping, and clean up more copilot cruft
Aug 18, 2025
fbacdbb
Add more assertions for the non-trivial DatasetSpec with mixed concre…
Aug 18, 2025
be5d7e1
Separate identify_file_format from IOFactory, more cleanup, add 'join…
Aug 18, 2025
6e4f686
appease pre-commit
Aug 18, 2025
895049a
pre-commit and parametrizing some copilot tests to cleanup, add a few…
Aug 18, 2025
a69591f
Remove Joinable spec
Aug 18, 2025
4628c9f
is None fix
Aug 18, 2025
7a38300
Fix promotion logic
Aug 18, 2025
7ef4792
promotion tests for CoffeaFileDict, DatasetSpec, FilesetSpec
Aug 18, 2025
d095991
Need a deepcopy on some of these filespec.py classes
Aug 19, 2025
dc4e94e
Make slice_files and filter_files compatible with pydantic models
Aug 19, 2025
96956a0
Add pydantic models to filter_files and slice_files tests
Aug 19, 2025
166f7bb
remove debug statement
Aug 19, 2025
9750ea9
Handle FileSpec in chunk slicing logic
Aug 19, 2025
1ce5cd2
Tests for pydantic filespec in slice_files, slice_chunks methods
Aug 19, 2025
0711253
failed_files support for filespec
Aug 19, 2025
18abc3d
test for failed_files with filespec
Aug 19, 2025
4358e44
simplest support for apply_to_fileset|dataset for pydantic models
Aug 19, 2025
364b9db
Test apply_to_fileset on FilesetSpec
Aug 19, 2025
d43760a
preprocess pydantic model
Aug 19, 2025
123bcc9
parametrize dict and FilesetSpec inputs for test
Aug 19, 2025
efc8bfb
appease pre-commit overlords
Aug 19, 2025
b8ba107
Add pydantic as hard dependency
Aug 19, 2025
fa98172
Fixup computed format field on CoffeaFileDict
Aug 19, 2025
6336ab9
Remove debug print
Aug 19, 2025
3315090
Remove unnecessary identify_format from IOFactory (standalone functio…
Aug 19, 2025
164b06d
Fixup identify_format tests
Aug 19, 2025
e97f987
Add filespec.ipynb examples after cleaning them up from copilot
Aug 19, 2025
69c236d
Add filespec.ipynb to the index for docs
Aug 19, 2025
2e7fe6f
pre-commit updates
Aug 19, 2025
6893e06
Fallback to import Self from typing_extensions prior to python 3.11
Aug 19, 2025
1fabc7f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 19, 2025
d05d888
Update to explicit Union for pydantic Models for python 3.9
Aug 20, 2025
08ad634
Until python 3.9 EOL (October) need this small package to make pydant…
Aug 20, 2025
6d8a76d
it comes in any color you like, so long as it's black
Aug 20, 2025
7956a4e
Protect against mixed (non-canonical) FilesetSpec in preprocess
Aug 20, 2025
bc7567f
Add tests for empty and mixed FilesetSpec in preprocess function
Aug 20, 2025
456862c
Add .pq identification and permit searching for indicator in middle o…
Aug 29, 2025
eff15fc
Factorize the form and format functions to share code between validat…
Aug 29, 2025
23ee888
Document IOFactory, showing the still-useful componenets and making a…
Aug 29, 2025
0ec2939
Merge branch 'master' into parquet-precursor-pydantic-datafactory
lgray Sep 1, 2025
d622338
Merge branch 'master' into parquet-precursor-pydantic-datafactory
ikrommyd Sep 8, 2025
b34f4c7
Enforce pydantic class over dictionary
Sep 24, 2025
ad2daa5
UprootFileSpec -> ROOTFileSpec
Sep 25, 2025
03b8d18
Factorize GenericFileSpec, switch to raise exception on format, pre-c…
Sep 29, 2025
b0a2bee
Split CoffeaFileDict into InputFiles and PreprocessedFiles, the latte…
Sep 30, 2025
ab0f265
Update test from CoffeaFileDict to InputFiles and PreprocessedFiles
Sep 30, 2025
80ffea4
precommit updates
Sep 30, 2025
71ba0ff
Update filespec.ipynb to InputFiles and PreprocessedFiles
Sep 30, 2025
76ed528
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 30, 2025
c4393d2
Merge branch 'master' into parquet-precursor-pydantic-datafactory
lgray Sep 30, 2025
7c21cff
Merge branch 'master' into parquet-precursor-pydantic-datafactory
lgray Oct 2, 2025
c52e944
Merge branch 'master' into parquet-precursor-pydantic-datafactory
lgray Oct 10, 2025
d3b2576
Merge branch 'master' into parquet-precursor-pydantic-datafactory
ikrommyd Oct 15, 2025
9c0162d
Merge branch 'master' into parquet-precursor-pydantic-datafactory
NJManganelli Oct 24, 2025
8466b0f
Make form a computed field, and compressed_form the original compress…
Oct 28, 2025
2f2beaf
propagate form + compressed_form through preprocess
Oct 28, 2025
3f09b66
propagate form + compressed_form through apply_processor
Oct 28, 2025
ebeddfd
propagate the form and compressed_form changes through tests
Oct 28, 2025
05d4e1b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 28, 2025
5ed4532
Remove single StepPair as option
Oct 28, 2025
0bad523
propagate single StepPair removal to tests
Oct 28, 2025
2 changes: 1 addition & 1 deletion docs/source/examples.rst
@@ -5,7 +5,7 @@ The following pages are rendered jupyter notebooks that provide an overview and
Each notebook builds on the previous one so it is recommended to go through them in order.

.. toctree::

notebooks/filespec.ipynb
notebooks/nanoevents.ipynb
notebooks/applying_corrections.ipynb
notebooks/packedselection.ipynb
3,619 changes: 3,619 additions & 0 deletions docs/source/notebooks/filespec.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
@@ -58,6 +58,8 @@ dependencies = [
"mplhep>=0.1.18",
"packaging",
"pandas",
"pydantic",
"eval_type_backport; python_version < '3.10'", #TODO: remove after python 3.9 EOL (filespec.py type Unions)
"hist>=2",
"cachetools",
"requests",
24 changes: 24 additions & 0 deletions src/coffea/dataset_tools/__init__.py
@@ -1,4 +1,17 @@
from coffea.dataset_tools.apply_processor import apply_to_dataset, apply_to_fileset
from coffea.dataset_tools.filespec import (
CoffeaParquetFileSpec,
CoffeaParquetFileSpecOptional,
CoffeaROOTFileSpec,
CoffeaROOTFileSpecOptional,
DatasetSpec,
FilesetSpec,
InputFiles,
IOFactory,
ParquetFileSpec,
PreprocessedFiles,
ROOTFileSpec,
)
from coffea.dataset_tools.manipulations import (
filter_files,
get_failed_steps_for_dataset,
@@ -23,4 +36,15 @@
"slice_files",
"get_failed_steps_for_dataset",
"get_failed_steps_for_fileset",
"ROOTFileSpec",
"ParquetFileSpec",
"CoffeaROOTFileSpec",
"CoffeaROOTFileSpecOptional",
"CoffeaParquetFileSpec",
"CoffeaParquetFileSpecOptional",
"InputFiles",
"PreprocessedFiles",
"DatasetSpec",
"FilesetSpec",
"IOFactory",
]
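The classes exported above are pydantic models, whose central feature in this PR is round-tripping between plain dicts and validated objects via `model_validate`/`model_dump`. A minimal stdlib-only sketch of that pattern (`FileEntry`/`DatasetFiles` are illustrative stand-ins, not coffea's actual classes, which inherit these methods from pydantic):

```python
from dataclasses import dataclass
from typing import Optional

# Illustrative stand-ins for coffea's pydantic filespec models; the real
# classes get model_validate/model_dump from pydantic.BaseModel.

@dataclass
class FileEntry:
    object_path: str
    steps: Optional[list] = None

@dataclass
class DatasetFiles:
    files: dict  # filename -> FileEntry

    @classmethod
    def model_validate(cls, data):
        # Accept the plain {"files": {filename: {"object_path": ...}}} shape,
        # or pass through entries that are already FileEntry instances.
        files = {
            fname: spec if isinstance(spec, FileEntry) else FileEntry(**spec)
            for fname, spec in data["files"].items()
        }
        return cls(files=files)

    def model_dump(self):
        # Convert back to the plain-dict form consumed by downstream tools.
        return {
            "files": {
                fname: {"object_path": e.object_path, "steps": e.steps}
                for fname, e in self.files.items()
            }
        }

spec = DatasetFiles.model_validate(
    {"files": {"nano.root": {"object_path": "Events"}}}
)
print(spec.files["nano.root"].object_path)  # Events
```

The round trip `model_validate(spec.model_dump())` is lossless here, which is what the PR's JSON-serialization round-trip tests exercise for the real models.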
28 changes: 13 additions & 15 deletions src/coffea/dataset_tools/apply_processor.py
@@ -4,19 +4,15 @@
from collections.abc import Hashable
from typing import Any, Callable, Union

import awkward
import dask.base
import dask_awkward

from coffea.dataset_tools.preprocess import (
from coffea.dataset_tools.filespec import (
DatasetSpec,
DatasetSpecOptional,
FilesetSpec,
FilesetSpecOptional,
)
from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory
from coffea.processor import ProcessorABC
from coffea.util import decompress_form

DaskOutputBaseType = Union[
dask.base.DaskMethodsMixin,
@@ -34,7 +30,7 @@

def apply_to_dataset(
data_manipulation: ProcessorABC | GenericHEPAnalysis,
dataset: DatasetSpec | DatasetSpecOptional,
dataset: DatasetSpec | dict,
schemaclass: BaseSchema = NanoAODSchema,
metadata: dict[Hashable, Any] = {},
uproot_options: dict[str, Any] = {},
@@ -46,7 +42,7 @@ def apply_to_dataset(
----------
data_manipulation : ProcessorABC or GenericHEPAnalysis
The user analysis code to run on the input dataset
dataset: DatasetSpec | DatasetSpecOptional
dataset: DatasetSpec | dict
The data to be acted upon by the data manipulation passed in.
schemaclass: BaseSchema, default NanoAODSchema
The nanoevents schema to interpret the input dataset with.
@@ -62,12 +58,12 @@
report : dask_awkward.Array, optional
The file access report for running the analysis on the input dataset. Needs to be computed in simultaneously with the analysis to be accurate.
"""
maybe_base_form = dataset.get("form", None)
if maybe_base_form is not None:
maybe_base_form = awkward.forms.from_json(decompress_form(maybe_base_form))
files = dataset["files"]
if isinstance(dataset, dict):
dataset = DatasetSpec.model_validate(dataset)
maybe_base_form = dataset.form
files = dataset.files
events = NanoEventsFactory.from_root(
files,
files.model_dump(),
[Review comment, Member]: NanoEventsFactory.from_root could accept type InputFiles. This could be for a future PR since this changeset focuses on dataset_tools only. If so, let's make an issue
metadata=metadata,
schemaclass=schemaclass,
known_base_form=maybe_base_form,
@@ -94,7 +90,7 @@

def apply_to_fileset(
data_manipulation: ProcessorABC | GenericHEPAnalysis,
fileset: FilesetSpec | FilesetSpecOptional,
fileset: FilesetSpec | dict,
schemaclass: BaseSchema = NanoAODSchema,
uproot_options: dict[str, Any] = {},
) -> dict[str, DaskOutputType] | tuple[dict[str, DaskOutputType], dask_awkward.Array]:
@@ -105,7 +101,7 @@ def apply_to_fileset(
----------
data_manipulation : ProcessorABC or GenericHEPAnalysis
The user analysis code to run on the input dataset
fileset: FilesetSpec | FilesetSpecOptional
fileset: FilesetSpec
The data to be acted upon by the data manipulation passed in. Metadata within the fileset should be dask-serializable.
schemaclass: BaseSchema, default NanoAODSchema
The nanoevents schema to interpret the input dataset with.
@@ -119,10 +115,12 @@
report : dask_awkward.Array, optional
The file access report for running the analysis on the input dataset. Needs to be computed in simultaneously with the analysis to be accurate.
"""
if isinstance(fileset, dict):
fileset = FilesetSpec.model_validate(fileset)
out = {}
report = {}
for name, dataset in fileset.items():
metadata = copy.deepcopy(dataset.get("metadata", {}))
metadata = copy.deepcopy(dataset.metadata)
if metadata is None:
metadata = {}
metadata.setdefault("dataset", name)
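The diff applies the same normalization idiom in both entry points: accept either a plain dict or the validated model, and convert at the boundary so the function body only ever sees the model type. A hedged stdlib-only sketch of that idiom (these names are stand-ins, not coffea's API):

```python
from dataclasses import dataclass
from typing import Any

@dataclass
class FilesetSpec:  # stand-in for coffea's pydantic FilesetSpec
    datasets: dict

    @classmethod
    def model_validate(cls, data: dict) -> "FilesetSpec":
        return cls(datasets=dict(data))

    def items(self):
        return self.datasets.items()

def apply_to_fileset(fileset) -> dict:
    # Normalize once at the boundary, as the PR does with
    # isinstance(fileset, dict) -> FilesetSpec.model_validate(fileset),
    # so the loop below only handles the validated model.
    if isinstance(fileset, dict):
        fileset = FilesetSpec.model_validate(fileset)
    out = {}
    for name, dataset in fileset.items():
        out[name] = len(dataset.get("files", {}))
    return out

# Both call styles yield the same result:
raw = {"ttbar": {"files": {"a.root": "Events", "b.root": "Events"}}}
assert apply_to_fileset(raw) == apply_to_fileset(FilesetSpec.model_validate(raw))
```

Normalizing at the entry point keeps the dict-based call signature working for existing users while letting the rest of the code rely on model attributes (`dataset.metadata`, `dataset.form`) instead of `dict.get` lookups.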