From 471cb6465c127c24cc357852f03f3ba2eccd847f Mon Sep 17 00:00:00 2001
From: Rutam21
Date: Fri, 25 Oct 2024 18:27:38 +0530
Subject: [PATCH] feat: Pass extra configuration to json.dump()

---
 .../code/export_entire_dataset_to_file_csv.py |  2 +-
 .../export_entire_dataset_to_file_json.py     |  2 +-
 docs/examples/code/parsel_crawler.py          |  2 +-
 src/crawlee/basic_crawler/_basic_crawler.py   | 12 +++-
 src/crawlee/storages/_dataset.py              | 72 +++++++++++++++----
 .../unit/basic_crawler/test_basic_crawler.py  | 31 +++++++-
 6 files changed, 102 insertions(+), 19 deletions(-)

diff --git a/docs/examples/code/export_entire_dataset_to_file_csv.py b/docs/examples/code/export_entire_dataset_to_file_csv.py
index f8bdff717..91bcc2dc5 100644
--- a/docs/examples/code/export_entire_dataset_to_file_csv.py
+++ b/docs/examples/code/export_entire_dataset_to_file_csv.py
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])
 
     # Export the entire dataset to a CSV file.
-    await crawler.export_data('results.csv')
+    await crawler.export_data_csv(path='results.csv')
 
 
 if __name__ == '__main__':
diff --git a/docs/examples/code/export_entire_dataset_to_file_json.py b/docs/examples/code/export_entire_dataset_to_file_json.py
index 53274baf9..7e9085ea2 100644
--- a/docs/examples/code/export_entire_dataset_to_file_json.py
+++ b/docs/examples/code/export_entire_dataset_to_file_json.py
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])
 
     # Export the entire dataset to a JSON file.
-    await crawler.export_data('results.json')
+    await crawler.export_data_json(path='results.json')
 
 
 if __name__ == '__main__':
diff --git a/docs/examples/code/parsel_crawler.py b/docs/examples/code/parsel_crawler.py
index 8b6f07810..29d32461e 100644
--- a/docs/examples/code/parsel_crawler.py
+++ b/docs/examples/code/parsel_crawler.py
@@ -34,7 +34,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     await crawler.run(['https://github.com'])
 
     # Export the entire dataset to a JSON file.
-    await crawler.export_data('results.json')
+    await crawler.export_data_json(path='results.json')
 
 
 if __name__ == '__main__':
diff --git a/src/crawlee/basic_crawler/_basic_crawler.py b/src/crawlee/basic_crawler/_basic_crawler.py
index d6d61d4dc..9a6b20c0c 100644
--- a/src/crawlee/basic_crawler/_basic_crawler.py
+++ b/src/crawlee/basic_crawler/_basic_crawler.py
@@ -55,7 +55,7 @@
     from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
     from crawlee.sessions import Session
     from crawlee.statistics import FinalStatistics, StatisticsState
-    from crawlee.storages._dataset import GetDataKwargs, PushDataKwargs
+    from crawlee.storages._dataset import ExportDataCsvKwargs, ExportDataJsonKwargs, GetDataKwargs, PushDataKwargs
     from crawlee.storages._request_provider import RequestProvider
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
@@ -475,10 +475,18 @@ async def get_data(
     async def export_data(
         self,
         path: str | Path,
-        content_type: Literal['json', 'csv'] | None = None,
+        *,
+        content_type: Literal['csv', 'json'] = 'csv',
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        **kwargs: Any,
     ) -> None:
+        if content_type == 'csv':
+            await self.export_data_csv(path, dataset_id=dataset_id, dataset_name=dataset_name, **kwargs)
+        elif content_type == 'json':
+            await self.export_data_json(path, dataset_id=dataset_id, dataset_name=dataset_name, **kwargs)
+        else:
+            raise ValueError(f'Unsupported content type: {content_type}.')
         """Export data from a dataset.
 
         This helper method simplifies the process of exporting data from a dataset. It opens the specified
diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py
index a2fa83f54..8339fb639 100644
--- a/src/crawlee/storages/_dataset.py
+++ b/src/crawlee/storages/_dataset.py
@@ -77,6 +77,32 @@ class ExportToKwargs(TypedDict):
     """Name of the key-value store to save the exported file."""
 
 
+class ExportDataJsonKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_json` method.
+    Args:
+        ensure_ascii: Whether non-ASCII characters should be escaped in the output JSON string.
+        indent: Specifies the number of spaces to use for indentation in the pretty-printed JSON output.
+        sort_keys: Specifies whether the output JSON object should have its keys sorted alphabetically.
+    """
+
+    ensure_ascii: NotRequired[bool]
+    indent: NotRequired[int]
+    sort_keys: NotRequired[bool]
+
+
+class ExportDataCsvKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_csv` method.
+    Args:
+        delimiter: A character that separates fields in the CSV file.
+        quotechar: A character used to enclose fields containing special characters like the delimiter.
+        quoting: An integer that defines how quotes should be applied.
+    """
+
+    delimiter: NotRequired[str]
+    quotechar: NotRequired[str]
+    quoting: NotRequired[int]
+
+
 class Dataset(BaseStorage):
     """Represents an append-only structured storage, ideal for tabular data similar to database tables.
 
@@ -209,12 +235,12 @@ async def get_data(self, **kwargs: Unpack[GetDataKwargs]) -> DatasetItemsListPage:
         # https://github.com/apify/apify-sdk-python/issues/140
         return await self._resource_client.list_items(**kwargs)
 
-    async def write_to(self, content_type: Literal['json', 'csv'], destination: TextIO) -> None:
+    async def write_to_csv(self, destination: TextIO, **kwargs: Unpack[ExportDataCsvKwargs]) -> None:
         """Exports the entire dataset into an arbitrary stream.
 
         Args:
-            content_type: Specifies the output format.
             destination: The stream into which the dataset contents should be written.
+            kwargs: Additional keyword arguments for `csv.writer`.
         """
         items: list[dict] = []
         limit = 1000
@@ -227,16 +253,33 @@ async def write_to(self, content_type: Literal['json', 'csv'], destination: TextIO) -> None:
                 break
             offset += list_items.count
 
-        if content_type == 'csv':
-            if items:
-                writer = csv.writer(destination, quoting=csv.QUOTE_MINIMAL)
-                writer.writerows([items[0].keys(), *[item.values() for item in items]])
-            else:
-                logger.warning('Attempting to export an empty dataset - no file will be created')
-        elif content_type == 'json':
-            json.dump(items, destination)
+        if items:
+            writer = csv.writer(destination, **kwargs)
+            writer.writerows([items[0].keys(), *[item.values() for item in items]])
+        else:
+            logger.warning('Attempting to export an empty dataset - no file will be created')
+
+    async def write_to_json(self, destination: TextIO, **kwargs: Unpack[ExportDataJsonKwargs]) -> None:
+        """Exports the entire dataset into an arbitrary stream.
+        Args:
+            destination: The stream into which the dataset contents should be written.
+            kwargs: Additional keyword arguments for `json.dump`.
+        """
+        items: list[dict] = []
+        limit = 1000
+        offset = 0
+
+        while True:
+            list_items = await self._resource_client.list_items(limit=limit, offset=offset)
+            items.extend(list_items.items)
+            if list_items.total <= offset + list_items.count:
+                break
+            offset += list_items.count
+
+        if items:
+            json.dump(items, destination, **kwargs)
         else:
-            raise ValueError(f'Unsupported content type: {content_type}')
+            logger.warning('Attempting to export an empty dataset - no file will be created')
 
     async def export_to(self, **kwargs: Unpack[ExportToKwargs]) -> None:
         """Exports the entire dataset into a specified file stored under a key in a key-value store.
@@ -257,7 +300,12 @@ async def export_to(self, **kwargs: Unpack[ExportToKwargs]) -> None:
         key_value_store = await KeyValueStore.open(id=to_key_value_store_id, name=to_key_value_store_name)
 
         output = io.StringIO()
-        await self.write_to(content_type, output)
+        if content_type == 'csv':
+            await self.write_to_csv(output)
+        elif content_type == 'json':
+            await self.write_to_json(output)
+        else:
+            raise ValueError('Unsupported content type, expecting CSV or JSON')
 
         if content_type == 'csv':
             await key_value_store.set_value(key, output.getvalue(), 'text/csv')
diff --git a/tests/unit/basic_crawler/test_basic_crawler.py b/tests/unit/basic_crawler/test_basic_crawler.py
index d62ef3022..4a2157380 100644
--- a/tests/unit/basic_crawler/test_basic_crawler.py
+++ b/tests/unit/basic_crawler/test_basic_crawler.py
@@ -585,8 +585,8 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
     await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
     await dataset.push_data({'id': 2, 'test': 'test'})
 
-    await crawler.export_data(tmp_path / 'dataset.json')
-    await crawler.export_data(tmp_path / 'dataset.csv')
+    await crawler.export_data_json(path=tmp_path / 'dataset.json')
+    await crawler.export_data_csv(path=tmp_path / 'dataset.csv')
 
     assert json.load((tmp_path / 'dataset.json').open()) == [
         {'id': 0, 'test': 'test'},
@@ -618,6 +618,33 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'
 
 
+async def test_crawler_push_and_export_data_and_json_dump_parameter(httpbin: str, tmp_path: Path) -> None:
+    crawler = BasicCrawler()
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
+        await context.push_data({'id': 2, 'test': 'test'})
+
+    await crawler.run([f'{httpbin}/1'])
+
+    await crawler.export_data_json(path=tmp_path / 'dataset.json', indent=3)
+
+    with (tmp_path / 'dataset.json').open() as json_file:
+        exported_json_str = json_file.read()
+
+    # Expected data in JSON format with 3 spaces indent
+    expected_data = [
+        {'id': 0, 'test': 'test'},
+        {'id': 1, 'test': 'test'},
+        {'id': 2, 'test': 'test'},
+    ]
+    expected_json_str = json.dumps(expected_data, indent=3)
+
+    # Assert that the exported JSON string matches the expected JSON string
+    assert exported_json_str == expected_json_str
+
+
 async def test_context_update_kv_store() -> None:
     crawler = BasicCrawler()
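
A minimal usage sketch of the keyword pass-through introduced above, not part of the patch. It mirrors the docs examples touched in this change; the export_data_json / export_data_csv helpers on the crawler are assumed to exist as the docs examples and tests above expect, and the extra keyword arguments correspond to the new ExportDataJsonKwargs / ExportDataCsvKwargs typed dicts.

# Illustrative sketch only -- helper names and import path are taken from the
# docs examples and tests in this patch, not from hunks shown here.
import asyncio
import csv

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Store one small record per crawled page.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # Extra keyword arguments are forwarded to json.dump() ...
    await crawler.export_data_json(path='results.json', indent=2, sort_keys=True, ensure_ascii=False)

    # ... and to csv.writer().
    await crawler.export_data_csv(path='results.csv', delimiter=';', quoting=csv.QUOTE_ALL)


if __name__ == '__main__':
    asyncio.run(main())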