Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 23 additions & 8 deletions src/graphnet/datasets/snowstorm_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,20 @@


class SnowStormDataset(IceCubeHostedDataset):
"""IceCube SnowStorm simulation dataset.
"""IceCube SnowStorm Monte Carlo simulation dataset.

More information can be found at
https://wiki.icecube.wisc.edu/index.php/SnowStorm_MC#File_Locations
This is an IceCube Collaboration simulation dataset.
It requires a username and password.
This module provides access to a subset of the SnowStorm simulation data (the run IDs listed in AVAILABLE_RUN_IDS) and prepares it
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not provide access to ALL SnowStorm simulations; it only provides access to a few run_ids (see the global variable AVAILABLE_RUN_IDS). Maybe clarify this in the comment :)

for the training and evaluation of deep learning models in GraphNeT by parsing
the data into the GraphNeT-compatible CuratedDataset format.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The data is already parsed in GraphNeT-compatible format (SQLite). The CuratedDataset format is just a convenience wrapper that makes creating Datasets and Dataloaders straightforward. Very minor detail, but to avoid confusion, you could clarify this :)


The data is organized by SnowStorm RunIDs containing pulsemaps input features
along with event-level truth information.

Access to the data requires an IceCube Collaboration account.

References:
SnowStorm documentation: https://wiki.icecube.wisc.edu/index.php/SnowStorm_MC#File_Locations
SnowStorm paper: arXiv:1909.01530
"""

_experiment = "IceCube SnowStorm dataset"
Expand Down Expand Up @@ -91,7 +99,15 @@ def __init__(
def _prepare_args(
self, backend: str, features: List[str], truth: List[str]
) -> Tuple[Dict[str, Any], Union[List[int], None], Union[List[int], None]]:
"""Prepare arguments for dataset."""
"""Prepare arguments for dataset.

Args:
backend: backend of dataset. Only "sqlite" is supported.
features: List of features from user to use as input.
truth: List of event-level truth from user.

Returns: Dataset arguments, train/val selection, test selection
"""
assert backend == "sqlite"
dataset_paths = []
for rid in self._run_ids:
Expand All @@ -106,7 +122,6 @@ def _prepare_args(
# get RunID
pattern = rf"{re.escape(self.dataset_dir)}/(\d+)/.*"
event_counts: Dict[str, int] = {}
event_counts = {}
for path in dataset_paths:

# Extract the ID
Expand Down Expand Up @@ -175,7 +190,7 @@ def _create_comment(cls, event_counts: Dict[str, int] = {}) -> None:
runid_string += f"RunID {k} contains {v:10d} events\n"
tot += v
cls._comments = (
f"Contains ~{tot/1e6:.1f} million events:\n"
f"Contains ~{tot / 1e6:.1f} million events:\n"
+ runid_string
+ fixed_string
)
Expand Down
Loading