fix docstring code example for distributed shuffle (#7166)
lhoestq authored Sep 24, 2024
1 parent 548d2d2 commit e9ec56c
Showing 1 changed file with 1 addition and 1 deletion.
src/datasets/arrow_dataset.py

@@ -5146,7 +5146,7 @@ def to_iterable_dataset(self, num_shards: Optional[int] = 1) -> "IterableDataset":
 ```python
 >>> from datasets.distributed import split_dataset_by_node
 >>> ids = ds.to_iterable_dataset(num_shards=512)
->>> ids = ids.shuffle(buffer_size=10_000) # will shuffle the shards order and use a shuffle buffer when you start iterating
+>>> ids = ids.shuffle(buffer_size=10_000, seed=42) # will shuffle the shards order and use a shuffle buffer when you start iterating
 >>> ids = split_dataset_by_node(ds, world_size=8, rank=0) # will keep only 512 / 8 = 64 shards from the shuffled lists of shards when you start iterating
 >>> dataloader = torch.utils.data.DataLoader(ids, num_workers=4) # will assign 64 / 4 = 16 shards from this node's list of shards to each worker when you start iterating
 >>> for example in ids:
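For reference, below is the corrected example rewritten as a self-contained sketch rather than a docstring fragment. The dataset name is a placeholder, and `world_size=8, rank=0` would normally come from the distributed launcher; note also that the docstring's context line still passes `ds` to `split_dataset_by_node`, while this sketch passes the shuffled `ids`, which appears to be the intended flow. The added `seed=42` presumably ensures every node shuffles the 512 shards in the same order, so that `split_dataset_by_node` hands each node a disjoint slice of the data.

```python
# Self-contained sketch of the corrected pattern (assumed setup, not from
# the commit): "my_dataset" is a placeholder, and rank/world_size would
# normally be read from the launcher, e.g. torchrun's RANK/WORLD_SIZE env vars.
import torch
from datasets import load_dataset
from datasets.distributed import split_dataset_by_node

ds = load_dataset("my_dataset", split="train")
ids = ds.to_iterable_dataset(num_shards=512)

# The same seed on every node produces the same shard order on every node,
# so the per-node splits below are disjoint and together cover all 512 shards.
ids = ids.shuffle(buffer_size=10_000, seed=42)

# Keep 512 / 8 = 64 shards on this node (rank 0 of 8); we pass the shuffled
# `ids` here, unlike the docstring context line that passes `ds`.
ids = split_dataset_by_node(ids, world_size=8, rank=0)

# Each of the 4 workers receives 64 / 4 = 16 of this node's shards.
dataloader = torch.utils.data.DataLoader(ids, num_workers=4)

for example in dataloader:
    pass  # training step would go here
```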
