From 72e24dc9edc74f12c4442573948012023ee19b72 Mon Sep 17 00:00:00 2001
From: venkat
Date: Fri, 21 Nov 2025 21:09:42 +0530
Subject: [PATCH] test: add verification for HuggingFaceM4/InterleavedWebDocuments (closes #7394)

---
Note: the blob id on the "index" line below was not regenerated after the
test body was revised during review; re-run `git format-patch` to refresh it.

 tests/test_interleaved_web_documents.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 tests/test_interleaved_web_documents.py

diff --git a/tests/test_interleaved_web_documents.py b/tests/test_interleaved_web_documents.py
new file mode 100644
index 00000000000..8ce673bd912
--- /dev/null
+++ b/tests/test_interleaved_web_documents.py
@@ -0,0 +1,36 @@
+import pytest
+
+from datasets import get_dataset_config_names, load_dataset
+from datasets.exceptions import DatasetNotFoundError
+
+
+@pytest.mark.integration_test
+def test_interleaved_web_documents():
+    """Smoke-test the schema of HuggingFaceM4/InterleavedWebDocuments.
+
+    The test is skipped only when the repository cannot be found on the Hub;
+    any other error propagates so it is reported as a failure, not a skip.
+    """
+    repo_id = "HuggingFaceM4/InterleavedWebDocuments"
+    try:
+        get_dataset_config_names(repo_id)
+    except DatasetNotFoundError:
+        pytest.skip("Dataset HuggingFaceM4/InterleavedWebDocuments not yet available on the Hub")
+
+    # Stream the split: a "train[:5]" slice is applied only after the whole
+    # split has been downloaded and prepared locally.
+    dataset = load_dataset(repo_id, split="train", streaming=True)
+    examples = list(dataset.take(5))
+    assert len(examples) == 5
+
+    first = examples[0]
+    missing = {"url", "contents", "metadata"}.difference(first)
+    assert not missing, f"missing columns: {sorted(missing)}"
+
+    assert isinstance(first["url"], str)
+    contents = first["contents"]
+    assert isinstance(contents, list)
+    assert len(contents) > 0
+    assert isinstance(contents[0], dict)
+    assert "type" in contents[0]
+    assert "value" in contents[0]