feat: Pass extra configuration to json.dump() #622

Closed · wants to merge 2 commits

Changes from all commits
2 changes: 1 addition & 1 deletion docs/examples/code/export_entire_dataset_to_file_csv.py
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])

     # Export the entire dataset to a CSV file.
-    await crawler.export_data('results.csv')
+    await crawler.export_data_csv(path='results.csv')


 if __name__ == '__main__':
2 changes: 1 addition & 1 deletion docs/examples/code/export_entire_dataset_to_file_json.py
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])

     # Export the entire dataset to a JSON file.
-    await crawler.export_data('results.json')
+    await crawler.export_data_json(path='results.json')


 if __name__ == '__main__':
2 changes: 1 addition & 1 deletion docs/examples/code/parsel_crawler.py
@@ -34,7 +34,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     await crawler.run(['https://github.com'])

     # Export the entire dataset to a JSON file.
-    await crawler.export_data('results.json')
+    await crawler.export_data_json(path='results.json')


 if __name__ == '__main__':
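Taken together, the three documentation changes above swap the generic `export_data()` call for the explicit `export_data_csv()` / `export_data_json()` helpers. As a rough illustration of how those helpers would be used once this PR's keyword pass-through is in place, here is a docs-style sketch; the `indent`, `ensure_ascii` and `delimiter` values are illustrative assumptions, not part of the diff, and the import path mirrors the existing examples.

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Push one record per crawled page into the default dataset.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # Pretty-printed JSON with non-ASCII characters left intact;
    # `indent` and `ensure_ascii` would be forwarded to `json.dump`.
    await crawler.export_data_json(path='results.json', indent=2, ensure_ascii=False)

    # Semicolon-separated CSV; `delimiter` would be forwarded to `csv.writer`.
    await crawler.export_data_csv(path='results.csv', delimiter=';')


if __name__ == '__main__':
    asyncio.run(main())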
12 changes: 10 additions & 2 deletions src/crawlee/basic_crawler/_basic_crawler.py
@@ -55,7 +55,7 @@
 from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
 from crawlee.sessions import Session
 from crawlee.statistics import FinalStatistics, StatisticsState
-from crawlee.storages._dataset import GetDataKwargs, PushDataKwargs
+from crawlee.storages._dataset import GetDataKwargs, PushDataKwargs, ExportDataCsvKwargs, ExportDataJsonKwargs
 from crawlee.storages._request_provider import RequestProvider

 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
@@ -488,10 +488,18 @@ async def get_data(
     async def export_data(
         self,
         path: str | Path,
-        content_type: Literal['json', 'csv'] | None = None,
+        *,
+        content_type: Literal['csv', 'json'] = 'csv',
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        **kwargs: Any,
     ) -> None:
+        if content_type == 'csv':
+            await self.export_data_csv(path, dataset_id=dataset_id, dataset_name=dataset_name, **kwargs)
+        elif content_type == 'json':
+            await self.export_data_json(path, dataset_id=dataset_id, dataset_name=dataset_name, **kwargs)
+        else:
+            raise ValueError(f'Unsupported content type: {content_type}.')
         """Export data from a dataset.

         This helper method simplifies the process of exporting data from a dataset. It opens the specified
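To make the new dispatch in `export_data()` easier to follow, here is a self-contained sketch of the same pattern outside the crawler: a keyword-only `content_type` selects the serializer, and `**kwargs` is passed straight through to `csv.writer` or `json.dump`. The `export_data()` function below is a hypothetical stand-in for illustration, not the crawler's actual method.

import asyncio
import csv
import io
import json
from typing import Any, Literal


async def export_data(
    rows: list[dict],
    *,
    content_type: Literal['csv', 'json'] = 'csv',
    **kwargs: Any,
) -> str:
    # Route the extra keyword arguments to the chosen serializer,
    # mirroring the dispatch added in this diff.
    buffer = io.StringIO()
    if content_type == 'csv':
        writer = csv.writer(buffer, **kwargs)
        writer.writerows([rows[0].keys(), *[row.values() for row in rows]])
    elif content_type == 'json':
        json.dump(rows, buffer, **kwargs)
    else:
        raise ValueError(f'Unsupported content type: {content_type}.')
    return buffer.getvalue()


async def demo() -> None:
    rows = [{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}]
    print(await export_data(rows, content_type='json', indent=2, sort_keys=True))
    print(await export_data(rows, content_type='csv', delimiter=';'))


if __name__ == '__main__':
    asyncio.run(demo())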
72 changes: 60 additions & 12 deletions src/crawlee/storages/_dataset.py
@@ -77,6 +77,32 @@ class ExportToKwargs(TypedDict):
     """Name of the key-value store to save the exported file."""


+class ExportDataJsonKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_json` method.
+    Args:
+        ensure_ascii: Whether non-ASCII characters should be escaped in the output JSON string.
+        indent: Specifies the number of spaces to use for indentation in the pretty-printed JSON output.
+        sort_keys: Specifies whether the output JSON object should have its keys sorted alphabetically.
+    """
+
+    ensure_ascii: NotRequired[bool]
+    indent: NotRequired[int]
+    sort_keys: NotRequired[bool]
+
+
+class ExportDataCsvKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_csv` method.
+    Args:
+        delimiter: A character that separates fields in the CSV file.
+        quotechar: A character used to enclose fields containing special characters like the delimiter.
+        quoting: An integer that defines how quotes should be applied.
+    """
+
+    delimiter: NotRequired[str]
+    quotechar: NotRequired[str]
+    quoting: NotRequired[int]
+
+
 class Dataset(BaseStorage):
     """Represents an append-only structured storage, ideal for tabular data similar to database tables.

@@ -212,12 +238,12 @@ async def get_data(self, **kwargs: Unpack[GetDataKwargs]) -> DatasetItemsListPage:
         # https://github.com/apify/apify-sdk-python/issues/140
         return await self._resource_client.list_items(**kwargs)

-    async def write_to(self, content_type: Literal['json', 'csv'], destination: TextIO) -> None:
+    async def write_to_csv(self, destination: TextIO, **kwargs: Unpack[ExportDataCsvKwargs]) -> None:
         """Exports the entire dataset into an arbitrary stream.

         Args:
-            content_type: Specifies the output format.
             destination: The stream into which the dataset contents should be written.
+            kwargs: Additional keyword arguments for `csv.writer`.
         """
         items: list[dict] = []
         limit = 1000
@@ -230,16 +256,33 @@ async def write_to(self, content_type: Literal['json', 'csv'], destination: TextIO) -> None:
                 break
             offset += list_items.count

-        if content_type == 'csv':
-            if items:
-                writer = csv.writer(destination, quoting=csv.QUOTE_MINIMAL)
-                writer.writerows([items[0].keys(), *[item.values() for item in items]])
-            else:
-                logger.warning('Attempting to export an empty dataset - no file will be created')
-        elif content_type == 'json':
-            json.dump(items, destination)
+        if items:
+            writer = csv.writer(destination, **kwargs)
+            writer.writerows([items[0].keys(), *[item.values() for item in items]])
+        else:
+            logger.warning('Attempting to export an empty dataset - no file will be created')
+
+    async def write_to_json(self, destination: TextIO, **kwargs: Unpack[ExportDataJsonKwargs]) -> None:
+        """Exports the entire dataset into an arbitrary stream.
+        Args:
+            destination: The stream into which the dataset contents should be written.
+            kwargs: Additional keyword arguments for `json.dump`.
+        """
+        items: list[dict] = []
+        limit = 1000
+        offset = 0
+
+        while True:
+            list_items = await self._resource_client.list_items(limit=limit, offset=offset)
+            items.extend(list_items.items)
+            if list_items.total <= offset + list_items.count:
+                break
+            offset += list_items.count
+
+        if items:
+            json.dump(items, destination, **kwargs)
         else:
-            raise ValueError(f'Unsupported content type: {content_type}')
+            logger.warning('Attempting to export an empty dataset - no file will be created')

     async def export_to(self, **kwargs: Unpack[ExportToKwargs]) -> None:
         """Exports the entire dataset into a specified file stored under a key in a key-value store.
@@ -260,7 +303,12 @@ async def export_to(self, **kwargs: Unpack[ExportToKwargs]) -> None:
         key_value_store = await KeyValueStore.open(id=to_key_value_store_id, name=to_key_value_store_name)

         output = io.StringIO()
-        await self.write_to(content_type, output)
+        if content_type == 'csv':
+            await self.write_to_csv(output)
+        elif content_type == 'json':
+            await self.write_to_json(output)
+        else:
+            raise ValueError('Unsupported content type, expecting CSV or JSON')

         if content_type == 'csv':
             await key_value_store.set_value(key, output.getvalue(), 'text/csv')
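The net effect of the `_dataset.py` changes is that `write_to()` splits into `write_to_csv()` and `write_to_json()`, each forwarding the fields of `ExportDataCsvKwargs` / `ExportDataJsonKwargs` verbatim to `csv.writer` and `json.dump`. A hedged usage sketch follows; it assumes the `Dataset` API behaves as shown in this diff and writes both formats into in-memory streams.

import asyncio
import io

from crawlee.storages import Dataset


async def main() -> None:
    dataset = await Dataset.open(name='example')
    await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])

    # CSV into an in-memory stream; `delimiter` comes from ExportDataCsvKwargs.
    csv_buffer = io.StringIO()
    await dataset.write_to_csv(csv_buffer, delimiter=';')
    print(csv_buffer.getvalue())

    # JSON into an in-memory stream; `indent` and `sort_keys` come from ExportDataJsonKwargs.
    json_buffer = io.StringIO()
    await dataset.write_to_json(json_buffer, indent=2, sort_keys=True)
    print(json_buffer.getvalue())


if __name__ == '__main__':
    asyncio.run(main())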
31 changes: 29 additions & 2 deletions tests/unit/basic_crawler/test_basic_crawler.py
@@ -585,8 +585,8 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
     await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
     await dataset.push_data({'id': 2, 'test': 'test'})

-    await crawler.export_data(tmp_path / 'dataset.json')
-    await crawler.export_data(tmp_path / 'dataset.csv')
+    await crawler.export_data_json(path=tmp_path / 'dataset.json')
+    await crawler.export_data_csv(path=tmp_path / 'dataset.csv')

     assert json.load((tmp_path / 'dataset.json').open()) == [
         {'id': 0, 'test': 'test'},
@@ -618,6 +618,33 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'


+async def test_crawler_push_and_export_data_and_json_dump_parameter(httpbin: str, tmp_path: Path) -> None:
+    crawler = BasicCrawler()
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
+        await context.push_data({'id': 2, 'test': 'test'})
+
+    await crawler.run([f'{httpbin}/1'])
+
+    await crawler.export_data_json(path=tmp_path / 'dataset.json', indent=3)
+
+    with (tmp_path / 'dataset.json').open() as json_file:
+        exported_json_str = json_file.read()
+
+    # Expected data in JSON format with 3 spaces indent
+    expected_data = [
+        {'id': 0, 'test': 'test'},
+        {'id': 1, 'test': 'test'},
+        {'id': 2, 'test': 'test'},
+    ]
+    expected_json_str = json.dumps(expected_data, indent=3)
+
+    # Assert that the exported JSON string matches the expected JSON string
+    assert exported_json_str == expected_json_str
+
+
 async def test_context_update_kv_store() -> None:
     crawler = BasicCrawler()

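The new test only exercises the JSON path. A companion CSV test is not part of this diff; the sketch below is a hedged suggestion of what one could look like, reusing the module's existing imports and `httpbin` fixture and assuming `export_data_csv` forwards its keyword arguments to `csv.writer`.

async def test_crawler_push_and_export_data_and_csv_writer_parameter(httpbin: str, tmp_path: Path) -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])

    await crawler.run([f'{httpbin}/1'])

    # `delimiter=';'` would be forwarded to `csv.writer`, so fields are separated by semicolons.
    await crawler.export_data_csv(path=tmp_path / 'dataset.csv', delimiter=';')

    assert (tmp_path / 'dataset.csv').read_bytes() == b'id;test\r\n0;test\r\n1;test\r\n'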