From 3b6a28acd1f0501cab678c989d7ac392658194d1 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:26:02 -0500 Subject: [PATCH 01/12] feat(bids): add pybids optional dependency and config check --- setup.py | 3 +++ src/datasets/config.py | 1 + 2 files changed, 4 insertions(+) diff --git a/setup.py b/setup.py index 30d66fc54db..70042fd5a57 100644 --- a/setup.py +++ b/setup.py @@ -210,6 +210,8 @@ NIBABEL_REQUIRE = ["nibabel>=5.3.2", "ipyniivue==2.4.2"] +PYBIDS_REQUIRE = ["pybids>=0.21.0"] + EXTRAS_REQUIRE = { "audio": AUDIO_REQUIRE, "vision": VISION_REQUIRE, @@ -228,6 +230,7 @@ "docs": DOCS_REQUIRE, "pdfs": PDFS_REQUIRE, "nibabel": NIBABEL_REQUIRE, + "bids": PYBIDS_REQUIRE, } setup( diff --git a/src/datasets/config.py b/src/datasets/config.py index b6412682727..2df571e4b8f 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -140,6 +140,7 @@ TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None +PYBIDS_AVAILABLE = importlib.util.find_spec("bids") is not None # Optional compression tools RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None From 53671b16d0b7906cd94e1f552b9612f2b987f303 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:26:02 -0500 Subject: [PATCH 02/12] test(bids): add synthetic BIDS dataset fixtures --- tests/packaged_modules/test_bids.py | 100 ++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 tests/packaged_modules/test_bids.py diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py new file mode 100644 index 00000000000..90f72d7500f --- /dev/null +++ b/tests/packaged_modules/test_bids.py @@ -0,0 +1,100 @@ +import json +import pytest +import numpy as np +import datasets.config + +@pytest.fixture +def minimal_bids_dataset(tmp_path): + 
"""Minimal valid BIDS dataset with one subject, one T1w scan.""" + # dataset_description.json (required) + (tmp_path / "dataset_description.json").write_text(json.dumps({ + "Name": "Test BIDS Dataset", + "BIDSVersion": "1.10.1" + })) + + # Create subject/anat folder + anat_dir = tmp_path / "sub-01" / "anat" + anat_dir.mkdir(parents=True) + + # Create dummy NIfTI + if datasets.config.NIBABEL_AVAILABLE: + import nibabel as nib + data = np.zeros((4, 4, 4), dtype=np.float32) + img = nib.Nifti1Image(data, np.eye(4)) + nib.save(img, str(anat_dir / "sub-01_T1w.nii.gz")) + else: + # Fallback if nibabel not available (shouldn't happen in test env ideally) + (anat_dir / "sub-01_T1w.nii.gz").write_bytes(b"DUMMY NIFTI CONTENT") + + # JSON sidecar + (anat_dir / "sub-01_T1w.json").write_text(json.dumps({"RepetitionTime": 2.0})) + + return str(tmp_path) + + +@pytest.fixture +def multi_subject_bids(tmp_path): + """BIDS dataset with multiple subjects and sessions.""" + (tmp_path / "dataset_description.json").write_text(json.dumps({ + "Name": "Multi-Subject Test", + "BIDSVersion": "1.10.1" + })) + + data = np.zeros((4, 4, 4), dtype=np.float32) + + if datasets.config.NIBABEL_AVAILABLE: + import nibabel as nib + else: + nib = None + + for sub in ["01", "02"]: + for ses in ["baseline", "followup"]: + anat_dir = tmp_path / f"sub-{sub}" / f"ses-{ses}" / "anat" + anat_dir.mkdir(parents=True) + + file_path = anat_dir / f"sub-{sub}_ses-{ses}_T1w.nii.gz" + if nib: + img = nib.Nifti1Image(data, np.eye(4)) + nib.save(img, str(file_path)) + else: + file_path.write_bytes(b"DUMMY NIFTI CONTENT") + + (anat_dir / f"sub-{sub}_ses-{ses}_T1w.json").write_text( + return str(tmp_path) + + +def test_bids_module_imports(): + from datasets.packaged_modules.bids import Bids, BidsConfig + assert Bids is not None + assert BidsConfig is not None + + +def test_bids_requires_pybids(monkeypatch): + """Test helpful error when pybids not installed.""" + from datasets.packaged_modules.bids.bids import Bids + import 
datasets.config + + monkeypatch.setattr(datasets.config, "PYBIDS_AVAILABLE", False) + + builder = Bids() + with pytest.raises(ImportError, match="pybids"): + builder._info() + + +@pytest.mark.skipif( + not datasets.config.PYBIDS_AVAILABLE, + reason="pybids not installed" +) +def test_bids_loads_single_subject(minimal_bids_dataset): + from datasets import load_dataset + + ds = load_dataset("bids", data_dir=minimal_bids_dataset, trust_remote_code=True) + + assert "train" in ds + assert len(ds["train"]) == 1 + + sample = ds["train"][0] + assert sample["subject"] == "01" + assert sample["suffix"] == "T1w" + assert sample["datatype"] == "anat" + assert sample["session"] is None From fdef1fb852e74c8d244ef828e5e8371f6c50b9bf Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:26:02 -0500 Subject: [PATCH 03/12] feat(bids): implement basic BIDS loader module --- src/datasets/packaged_modules/__init__.py | 2 + .../packaged_modules/bids/__init__.py | 1 + src/datasets/packaged_modules/bids/bids.py | 84 +++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 src/datasets/packaged_modules/bids/__init__.py create mode 100644 src/datasets/packaged_modules/bids/bids.py diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index c9a32ff71f0..9655dffdc10 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -6,6 +6,7 @@ from .arrow import arrow from .audiofolder import audiofolder +from .bids import bids from .cache import cache from .csv import csv from .eval import eval @@ -49,6 +50,7 @@ def _hash_python_lines(lines: list[str]) -> str: "videofolder": (videofolder.__name__, _hash_python_lines(inspect.getsource(videofolder).splitlines())), "pdffolder": (pdffolder.__name__, _hash_python_lines(inspect.getsource(pdffolder).splitlines())), "niftifolder": (niftifolder.__name__, _hash_python_lines(inspect.getsource(niftifolder).splitlines())), + 
"bids": (bids.__name__, _hash_python_lines(inspect.getsource(bids).splitlines())), "webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())), "xml": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())), "hdf5": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())), diff --git a/src/datasets/packaged_modules/bids/__init__.py b/src/datasets/packaged_modules/bids/__init__.py new file mode 100644 index 00000000000..1d167b51030 --- /dev/null +++ b/src/datasets/packaged_modules/bids/__init__.py @@ -0,0 +1 @@ +from .bids import Bids, BidsConfig diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py new file mode 100644 index 00000000000..6d1bcaf277c --- /dev/null +++ b/src/datasets/packaged_modules/bids/bids.py @@ -0,0 +1,84 @@ +import json +from dataclasses import dataclass +from typing import Optional + +import datasets +from datasets import config + + +logger = datasets.utils.logging.get_logger(__name__) + + +@dataclass +class BidsConfig(datasets.BuilderConfig): + """BuilderConfig for BIDS datasets.""" + data_dir: Optional[str] = None + database_path: Optional[str] = None # For pybids caching + + +class Bids(datasets.GeneratorBasedBuilder): + """BIDS dataset loader using pybids.""" + + BUILDER_CONFIG_CLASS = BidsConfig + + def _info(self): + if not config.PYBIDS_AVAILABLE: + raise ImportError( + "To load BIDS datasets, please install pybids: pip install pybids" + ) + + return datasets.DatasetInfo( + features=datasets.Features({ + "subject": datasets.Value("string"), + "session": datasets.Value("string"), + "datatype": datasets.Value("string"), + "suffix": datasets.Value("string"), + "task": datasets.Value("string"), + "run": datasets.Value("string"), + "path": datasets.Value("string"), + "nifti": datasets.Nifti(), + "metadata": datasets.Value("string"), + }) + ) + + def _split_generators(self, dl_manager): + from bids import BIDSLayout + + if 
not self.config.data_dir: + raise ValueError("data_dir is required for BIDS datasets") + + layout = BIDSLayout( + self.config.data_dir, + database_path=self.config.database_path, + validate=False, # Don't fail on minor validation issues + ) + + # Get all NIfTI files + nifti_files = layout.get(extension=[".nii", ".nii.gz"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"layout": layout, "files": nifti_files}, + ) + ] + + def _generate_examples(self, layout, files): + for idx, bids_file in enumerate(files): + entities = bids_file.get_entities() + + # Get JSON sidecar metadata + metadata = layout.get_metadata(bids_file.path) + metadata_str = json.dumps(metadata) if metadata else "{}" + + yield idx, { + "subject": entities.get("subject"), + "session": entities.get("session"), + "datatype": entities.get("datatype"), + "suffix": entities.get("suffix"), + "task": entities.get("task"), + "run": str(entities.get("run")) if entities.get("run") else None, + "path": bids_file.path, + "nifti": bids_file.path, + "metadata": metadata_str, + } \ No newline at end of file From 0c52a9ec3f2862f69aff8cb49574bd4c4d80dca9 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:26:31 -0500 Subject: [PATCH 04/12] fix(test): repair syntax in BIDS test fixture --- tests/packaged_modules/test_bids.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py index 90f72d7500f..17b7112c94f 100644 --- a/tests/packaged_modules/test_bids.py +++ b/tests/packaged_modules/test_bids.py @@ -60,6 +60,9 @@ def multi_subject_bids(tmp_path): file_path.write_bytes(b"DUMMY NIFTI CONTENT") (anat_dir / f"sub-{sub}_ses-{ses}_T1w.json").write_text( + json.dumps({"RepetitionTime": 2.0}) + ) + return str(tmp_path) From e2e38067681a23fd16cb62f803eae96849815be1 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:27:17 -0500 Subject: [PATCH 05/12] fix(test): 
handle Bids init exception --- tests/packaged_modules/test_bids.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py index 17b7112c94f..42e60bbf35b 100644 --- a/tests/packaged_modules/test_bids.py +++ b/tests/packaged_modules/test_bids.py @@ -79,9 +79,8 @@ def test_bids_requires_pybids(monkeypatch): monkeypatch.setattr(datasets.config, "PYBIDS_AVAILABLE", False) - builder = Bids() with pytest.raises(ImportError, match="pybids"): - builder._info() + Bids() @pytest.mark.skipif( From b3b850641746c5517d1daec17aa9ecb26ff46b09 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:27:49 -0500 Subject: [PATCH 06/12] feat(bids): add subject/session/datatype filtering --- src/datasets/packaged_modules/bids/bids.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py index 6d1bcaf277c..56489fb136d 100644 --- a/src/datasets/packaged_modules/bids/bids.py +++ b/src/datasets/packaged_modules/bids/bids.py @@ -14,6 +14,9 @@ class BidsConfig(datasets.BuilderConfig): """BuilderConfig for BIDS datasets.""" data_dir: Optional[str] = None database_path: Optional[str] = None # For pybids caching + subjects: Optional[list[str]] = None # Filter by subject + sessions: Optional[list[str]] = None # Filter by session + datatypes: Optional[list[str]] = None # Filter by datatype class Bids(datasets.GeneratorBasedBuilder): @@ -53,8 +56,17 @@ def _split_generators(self, dl_manager): validate=False, # Don't fail on minor validation issues ) + # Build query kwargs + query = {"extension": [".nii", ".nii.gz"]} + if self.config.subjects: + query["subject"] = self.config.subjects + if self.config.sessions: + query["session"] = self.config.sessions + if self.config.datatypes: + query["datatype"] = self.config.datatypes + # Get all NIfTI files - nifti_files = 
layout.get(extension=[".nii", ".nii.gz"]) + nifti_files = layout.get(**query) return [ datasets.SplitGenerator( From fbf307bb1af453ebd90d3a54c915522e363d7b5a Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:27:49 -0500 Subject: [PATCH 07/12] test(bids): add multi-subject filtering test --- tests/packaged_modules/test_bids.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py index 42e60bbf35b..ee2a4608672 100644 --- a/tests/packaged_modules/test_bids.py +++ b/tests/packaged_modules/test_bids.py @@ -100,3 +100,21 @@ def test_bids_loads_single_subject(minimal_bids_dataset): assert sample["suffix"] == "T1w" assert sample["datatype"] == "anat" assert sample["session"] is None + + +@pytest.mark.skipif( + not datasets.config.PYBIDS_AVAILABLE, + reason="pybids not installed" +) +def test_bids_multi_subject(multi_subject_bids): + from datasets import load_dataset + + ds = load_dataset("bids", data_dir=multi_subject_bids, trust_remote_code=True) + + assert len(ds["train"]) == 4 # 2 subjects × 2 sessions + + subjects = set(sample["subject"] for sample in ds["train"]) + assert subjects == {"01", "02"} + + sessions = set(sample["session"] for sample in ds["train"]) + assert sessions == {"baseline", "followup"} From 7320ba336335baf29e70b26ae0b4e4901c1b2769 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:28:16 -0500 Subject: [PATCH 08/12] feat(bids): add validation and error handling --- src/datasets/packaged_modules/bids/bids.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py index 56489fb136d..d3918c9774f 100644 --- a/src/datasets/packaged_modules/bids/bids.py +++ b/src/datasets/packaged_modules/bids/bids.py @@ -45,11 +45,21 @@ def _info(self): ) def _split_generators(self, dl_manager): + import os from bids import 
BIDSLayout if not self.config.data_dir: raise ValueError("data_dir is required for BIDS datasets") + if not os.path.isdir(self.config.data_dir): + raise ValueError(f"data_dir does not exist: {self.config.data_dir}") + + desc_file = os.path.join(self.config.data_dir, "dataset_description.json") + if not os.path.exists(desc_file): + raise ValueError( + f"Not a valid BIDS dataset: missing dataset_description.json in {self.config.data_dir}" + ) + layout = BIDSLayout( self.config.data_dir, database_path=self.config.database_path, @@ -68,6 +78,12 @@ def _split_generators(self, dl_manager): # Get all NIfTI files nifti_files = layout.get(**query) + if not nifti_files: + logger.warning( + f"No NIfTI files found in {self.config.data_dir} with filters: {query}. " + "Check that the dataset is valid BIDS and filters match existing data." + ) + return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, From f3fe6f996d5d69738ac34ce5d6072620ec397ff3 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:29:48 -0500 Subject: [PATCH 09/12] docs(bids): add BIDS loading guide --- docs/source/_toctree.yml | 2 ++ docs/source/bids_dataset.mdx | 63 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 docs/source/bids_dataset.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index cc6b7195fe2..58189f1fd29 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -90,6 +90,8 @@ title: Create a document dataset - local: nifti_dataset title: Create a medical imaging dataset + - local: bids_dataset + title: Load a BIDS dataset title: "Vision" - sections: - local: nlp_load diff --git a/docs/source/bids_dataset.mdx b/docs/source/bids_dataset.mdx new file mode 100644 index 00000000000..62ca79ae770 --- /dev/null +++ b/docs/source/bids_dataset.mdx @@ -0,0 +1,63 @@ +# BIDS Dataset + +[BIDS (Brain Imaging Data Structure)](https://bids.neuroimaging.io/) is a standard for organizing and describing 
neuroimaging and behavioral data. The `datasets` library supports loading BIDS datasets directly, leveraging `pybids` for parsing and `nibabel` for handling NIfTI files. + + + +To use the BIDS loader, you need to install the `bids` extra: + +```bash +pip install datasets[bids] +``` + + + +## Loading a BIDS Dataset + +You can load a BIDS dataset by pointing to its root directory (containing `dataset_description.json`): + +```python +from datasets import load_dataset + +# Load a local BIDS dataset +ds = load_dataset("bids", data_dir="/path/to/bids/dataset") + +# Access the first example +print(ds["train"][0]) +# { +# 'subject': '01', +# 'session': 'baseline', +# 'datatype': 'anat', +# 'suffix': 'T1w', +# 'nifti': <nibabel.nifti1.Nifti1Image>, +# ... +# } +``` + +The `nifti` column contains `nibabel` image objects, which can be visualized interactively in Jupyter notebooks. + +## Filtering + +You can filter the dataset by BIDS entities like `subject`, `session`, and `datatype` when loading: + +```python +# Load only specific subjects, sessions, and datatypes +ds = load_dataset( + "bids", + data_dir="/path/to/bids/dataset", + subjects=["01", "05", "10"], + sessions=["pre", "post"], + datatypes=["func"], +) +``` + +## Metadata + +BIDS datasets often include JSON sidecar files with metadata (e.g., scanner parameters). This metadata is loaded into the `metadata` column as a JSON string. 
+ +```python +import json + +metadata = json.loads(ds["train"][0]["metadata"]) +print(metadata["RepetitionTime"]) +``` From d888f8a326363f504904404737a0ff5575b4c7dd Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:38:26 -0500 Subject: [PATCH 10/12] fix(bids): lint and format fixes, remove deprecated trust_remote_code - Remove deprecated `trust_remote_code=True` from tests (not needed for packaged modules) - Fix ruff linting errors (import sorting, trailing newlines) - Apply ruff formatter for consistent code style - Convert set() generators to set comprehensions (C401) --- src/datasets/packaged_modules/bids/bids.py | 59 ++++++++++++---------- tests/packaged_modules/test_bids.py | 51 +++++++++---------- 2 files changed, 54 insertions(+), 56 deletions(-) diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py index d3918c9774f..278828c5e58 100644 --- a/src/datasets/packaged_modules/bids/bids.py +++ b/src/datasets/packaged_modules/bids/bids.py @@ -12,6 +12,7 @@ @dataclass class BidsConfig(datasets.BuilderConfig): """BuilderConfig for BIDS datasets.""" + data_dir: Optional[str] = None database_path: Optional[str] = None # For pybids caching subjects: Optional[list[str]] = None # Filter by subject @@ -26,26 +27,27 @@ class Bids(datasets.GeneratorBasedBuilder): def _info(self): if not config.PYBIDS_AVAILABLE: - raise ImportError( - "To load BIDS datasets, please install pybids: pip install pybids" - ) + raise ImportError("To load BIDS datasets, please install pybids: pip install pybids") return datasets.DatasetInfo( - features=datasets.Features({ - "subject": datasets.Value("string"), - "session": datasets.Value("string"), - "datatype": datasets.Value("string"), - "suffix": datasets.Value("string"), - "task": datasets.Value("string"), - "run": datasets.Value("string"), - "path": datasets.Value("string"), - "nifti": datasets.Nifti(), - "metadata": datasets.Value("string"), - }) + 
features=datasets.Features( + { + "subject": datasets.Value("string"), + "session": datasets.Value("string"), + "datatype": datasets.Value("string"), + "suffix": datasets.Value("string"), + "task": datasets.Value("string"), + "run": datasets.Value("string"), + "path": datasets.Value("string"), + "nifti": datasets.Nifti(), + "metadata": datasets.Value("string"), + } + ) ) def _split_generators(self, dl_manager): import os + from bids import BIDSLayout if not self.config.data_dir: @@ -56,9 +58,7 @@ def _split_generators(self, dl_manager): desc_file = os.path.join(self.config.data_dir, "dataset_description.json") if not os.path.exists(desc_file): - raise ValueError( - f"Not a valid BIDS dataset: missing dataset_description.json in {self.config.data_dir}" - ) + raise ValueError(f"Not a valid BIDS dataset: missing dataset_description.json in {self.config.data_dir}") layout = BIDSLayout( self.config.data_dir, @@ -99,14 +99,17 @@ def _generate_examples(self, layout, files): metadata = layout.get_metadata(bids_file.path) metadata_str = json.dumps(metadata) if metadata else "{}" - yield idx, { - "subject": entities.get("subject"), - "session": entities.get("session"), - "datatype": entities.get("datatype"), - "suffix": entities.get("suffix"), - "task": entities.get("task"), - "run": str(entities.get("run")) if entities.get("run") else None, - "path": bids_file.path, - "nifti": bids_file.path, - "metadata": metadata_str, - } \ No newline at end of file + yield ( + idx, + { + "subject": entities.get("subject"), + "session": entities.get("session"), + "datatype": entities.get("datatype"), + "suffix": entities.get("suffix"), + "task": entities.get("task"), + "run": str(entities.get("run")) if entities.get("run") else None, + "path": bids_file.path, + "nifti": bids_file.path, + "metadata": metadata_str, + }, + ) diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py index ee2a4608672..03ac2043e6c 100644 --- a/tests/packaged_modules/test_bids.py 
+++ b/tests/packaged_modules/test_bids.py @@ -1,16 +1,18 @@ import json -import pytest + import numpy as np +import pytest + import datasets.config + @pytest.fixture def minimal_bids_dataset(tmp_path): """Minimal valid BIDS dataset with one subject, one T1w scan.""" # dataset_description.json (required) - (tmp_path / "dataset_description.json").write_text(json.dumps({ - "Name": "Test BIDS Dataset", - "BIDSVersion": "1.10.1" - })) + (tmp_path / "dataset_description.json").write_text( + json.dumps({"Name": "Test BIDS Dataset", "BIDSVersion": "1.10.1"}) + ) # Create subject/anat folder anat_dir = tmp_path / "sub-01" / "anat" @@ -19,6 +21,7 @@ def minimal_bids_dataset(tmp_path): # Create dummy NIfTI if datasets.config.NIBABEL_AVAILABLE: import nibabel as nib + data = np.zeros((4, 4, 4), dtype=np.float32) img = nib.Nifti1Image(data, np.eye(4)) nib.save(img, str(anat_dir / "sub-01_T1w.nii.gz")) @@ -35,13 +38,12 @@ def minimal_bids_dataset(tmp_path): @pytest.fixture def multi_subject_bids(tmp_path): """BIDS dataset with multiple subjects and sessions.""" - (tmp_path / "dataset_description.json").write_text(json.dumps({ - "Name": "Multi-Subject Test", - "BIDSVersion": "1.10.1" - })) + (tmp_path / "dataset_description.json").write_text( + json.dumps({"Name": "Multi-Subject Test", "BIDSVersion": "1.10.1"}) + ) data = np.zeros((4, 4, 4), dtype=np.float32) - + if datasets.config.NIBABEL_AVAILABLE: import nibabel as nib else: @@ -51,31 +53,30 @@ def multi_subject_bids(tmp_path): for ses in ["baseline", "followup"]: anat_dir = tmp_path / f"sub-{sub}" / f"ses-{ses}" / "anat" anat_dir.mkdir(parents=True) - + file_path = anat_dir / f"sub-{sub}_ses-{ses}_T1w.nii.gz" if nib: img = nib.Nifti1Image(data, np.eye(4)) nib.save(img, str(file_path)) else: file_path.write_bytes(b"DUMMY NIFTI CONTENT") - - (anat_dir / f"sub-{sub}_ses-{ses}_T1w.json").write_text( - json.dumps({"RepetitionTime": 2.0}) - ) + + (anat_dir / f"sub-{sub}_ses-{ses}_T1w.json").write_text(json.dumps({"RepetitionTime": 
2.0})) return str(tmp_path) def test_bids_module_imports(): from datasets.packaged_modules.bids import Bids, BidsConfig + assert Bids is not None assert BidsConfig is not None def test_bids_requires_pybids(monkeypatch): """Test helpful error when pybids not installed.""" - from datasets.packaged_modules.bids.bids import Bids import datasets.config + from datasets.packaged_modules.bids.bids import Bids monkeypatch.setattr(datasets.config, "PYBIDS_AVAILABLE", False) @@ -83,14 +84,11 @@ def test_bids_requires_pybids(monkeypatch): Bids() -@pytest.mark.skipif( - not datasets.config.PYBIDS_AVAILABLE, - reason="pybids not installed" -) +@pytest.mark.skipif(not datasets.config.PYBIDS_AVAILABLE, reason="pybids not installed") def test_bids_loads_single_subject(minimal_bids_dataset): from datasets import load_dataset - ds = load_dataset("bids", data_dir=minimal_bids_dataset, trust_remote_code=True) + ds = load_dataset("bids", data_dir=minimal_bids_dataset) assert "train" in ds assert len(ds["train"]) == 1 @@ -102,19 +100,16 @@ def test_bids_loads_single_subject(minimal_bids_dataset): assert sample["session"] is None -@pytest.mark.skipif( - not datasets.config.PYBIDS_AVAILABLE, - reason="pybids not installed" -) +@pytest.mark.skipif(not datasets.config.PYBIDS_AVAILABLE, reason="pybids not installed") def test_bids_multi_subject(multi_subject_bids): from datasets import load_dataset - ds = load_dataset("bids", data_dir=multi_subject_bids, trust_remote_code=True) + ds = load_dataset("bids", data_dir=multi_subject_bids) assert len(ds["train"]) == 4 # 2 subjects × 2 sessions - subjects = set(sample["subject"] for sample in ds["train"]) + subjects = {sample["subject"] for sample in ds["train"]} assert subjects == {"01", "02"} - sessions = set(sample["session"] for sample in ds["train"]) + sessions = {sample["session"] for sample in ds["train"]} assert sessions == {"baseline", "followup"} From dda58c0308e254af1cd76f6b327ac114b4ee0e26 Mon Sep 17 00:00:00 2001 From: 
The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 20:06:59 -0500 Subject: [PATCH 11/12] fix(bids): apply CodeRabbit feedback - Update setup.py to include nibabel in BIDS extra - Update docs to clarify nibabel is included - Add nibabel availability check in _info() - Move os import to module level - Update test skipif to check both pybids and nibabel --- docs/source/bids_dataset.mdx | 2 +- setup.py | 2 +- src/datasets/packaged_modules/bids/bids.py | 5 +++-- tests/packaged_modules/test_bids.py | 11 ++++++++--- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/docs/source/bids_dataset.mdx b/docs/source/bids_dataset.mdx index 62ca79ae770..89ca31e566e 100644 --- a/docs/source/bids_dataset.mdx +++ b/docs/source/bids_dataset.mdx @@ -4,7 +4,7 @@ -To use the BIDS loader, you need to install the `bids` extra: +To use the BIDS loader, you need to install the `bids` extra (which installs `pybids` and `nibabel`): ```bash pip install datasets[bids] diff --git a/setup.py b/setup.py index 70042fd5a57..42dd5c101b8 100644 --- a/setup.py +++ b/setup.py @@ -210,7 +210,7 @@ NIBABEL_REQUIRE = ["nibabel>=5.3.2", "ipyniivue==2.4.2"] -PYBIDS_REQUIRE = ["pybids>=0.21.0"] +PYBIDS_REQUIRE = ["pybids>=0.21.0"] + NIBABEL_REQUIRE EXTRAS_REQUIRE = { "audio": AUDIO_REQUIRE, diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py index 278828c5e58..d165218de4d 100644 --- a/src/datasets/packaged_modules/bids/bids.py +++ b/src/datasets/packaged_modules/bids/bids.py @@ -1,4 +1,5 @@ import json +import os from dataclasses import dataclass from typing import Optional @@ -28,6 +29,8 @@ class Bids(datasets.GeneratorBasedBuilder): def _info(self): if not config.PYBIDS_AVAILABLE: raise ImportError("To load BIDS datasets, please install pybids: pip install pybids") + if not config.NIBABEL_AVAILABLE: + raise ImportError("To load BIDS datasets, please install nibabel: pip install nibabel") return datasets.DatasetInfo( features=datasets.Features( @@ -46,8 
+49,6 @@ def _info(self): ) def _split_generators(self, dl_manager): - import os - from bids import BIDSLayout if not self.config.data_dir: diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py index 03ac2043e6c..8ce2be9b72b 100644 --- a/tests/packaged_modules/test_bids.py +++ b/tests/packaged_modules/test_bids.py @@ -75,7 +75,6 @@ def test_bids_module_imports(): def test_bids_requires_pybids(monkeypatch): """Test helpful error when pybids not installed.""" - import datasets.config from datasets.packaged_modules.bids.bids import Bids monkeypatch.setattr(datasets.config, "PYBIDS_AVAILABLE", False) @@ -84,7 +83,10 @@ def test_bids_requires_pybids(monkeypatch): Bids() -@pytest.mark.skipif(not datasets.config.PYBIDS_AVAILABLE, reason="pybids not installed") +@pytest.mark.skipif( + not datasets.config.PYBIDS_AVAILABLE or not datasets.config.NIBABEL_AVAILABLE, + reason="pybids or nibabel not installed", +) def test_bids_loads_single_subject(minimal_bids_dataset): from datasets import load_dataset @@ -100,7 +102,10 @@ def test_bids_loads_single_subject(minimal_bids_dataset): assert sample["session"] is None -@pytest.mark.skipif(not datasets.config.PYBIDS_AVAILABLE, reason="pybids not installed") +@pytest.mark.skipif( + not datasets.config.PYBIDS_AVAILABLE or not datasets.config.NIBABEL_AVAILABLE, + reason="pybids or nibabel not installed", +) def test_bids_multi_subject(multi_subject_bids): from datasets import load_dataset From 599d670ae3299cb1306d7af236be84843b972111 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Sat, 29 Nov 2025 09:45:55 -0500 Subject: [PATCH 12/12] chore: trigger CI