feat: Pass extra configuration to json.dump() #622

Closed · wants to merge 2 commits

Changes from all commits
2 changes: 1 addition & 1 deletion docs/examples/code/export_entire_dataset_to_file_csv.py
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])

     # Export the entire dataset to a CSV file.
-    await crawler.export_data('results.csv')
+    await crawler.export_data_csv(path='results.csv')


 if __name__ == '__main__':
2 changes: 1 addition & 1 deletion docs/examples/code/export_entire_dataset_to_file_json.py
@@ -30,7 +30,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     await crawler.run(['https://crawlee.dev'])

     # Export the entire dataset to a JSON file.
-    await crawler.export_data('results.json')
+    await crawler.export_data_json(path='results.json')


 if __name__ == '__main__':
2 changes: 1 addition & 1 deletion docs/examples/code/parsel_crawler.py
@@ -34,7 +34,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:
     await crawler.run(['https://github.com'])

     # Export the entire dataset to a JSON file.
-    await crawler.export_data('results.json')
+    await crawler.export_data_json(path='results.json')


 if __name__ == '__main__':
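Taken together, the three documentation changes above swap the generic `export_data()` call for the explicit `export_data_csv()` / `export_data_json()` helpers. As a rough illustration of how those helpers would be used once this PR's keyword pass-through is in place, here is a docs-style sketch; the `indent`, `ensure_ascii` and `delimiter` values are illustrative assumptions, not part of the diff, and the import path mirrors the existing examples.

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        # Push one record per crawled page into the default dataset.
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://crawlee.dev'])

    # Pretty-printed JSON with non-ASCII characters left intact;
    # `indent` and `ensure_ascii` would be forwarded to `json.dump`.
    await crawler.export_data_json(path='results.json', indent=2, ensure_ascii=False)

    # Semicolon-separated CSV; `delimiter` would be forwarded to `csv.writer`.
    await crawler.export_data_csv(path='results.csv', delimiter=';')


if __name__ == '__main__':
    asyncio.run(main())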
12 changes: 10 additions & 2 deletions src/crawlee/basic_crawler/_basic_crawler.py
@@ -55,7 +55,7 @@
 from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
 from crawlee.sessions import Session
 from crawlee.statistics import FinalStatistics, StatisticsState
-from crawlee.storages._dataset import GetDataKwargs, PushDataKwargs
+from crawlee.storages._dataset import GetDataKwargs, PushDataKwargs, ExportDataCsvKwargs, ExportDataJsonKwargs
 from crawlee.storages._request_provider import RequestProvider

 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
@@ -488,10 +488,18 @@ async def get_data(
     async def export_data(
         self,
         path: str | Path,
-        content_type: Literal['json', 'csv'] | None = None,
+        *,
+        content_type: Literal['csv', 'json'] = 'csv',
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        **kwargs: Any,
     ) -> None:
+        if content_type == 'csv':
+            await self.export_data_csv(path, dataset_id=dataset_id, dataset_name=dataset_name, **kwargs)
+        elif content_type == 'json':
+            await self.export_data_json(path, dataset_id=dataset_id, dataset_name=dataset_name, **kwargs)
+        else:
+            raise ValueError(f'Unsupported content type: {content_type}.')
         """Export data from a dataset.

         This helper method simplifies the process of exporting data from a dataset. It opens the specified
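To make the new dispatch in `export_data()` easier to follow, here is a self-contained sketch of the same pattern outside the crawler: a keyword-only `content_type` selects the serializer, and `**kwargs` is passed straight through to `csv.writer` or `json.dump`. The `export_data()` function below is a hypothetical stand-in for illustration, not the crawler's actual method.

import asyncio
import csv
import io
import json
from typing import Any, Literal


async def export_data(
    rows: list[dict],
    *,
    content_type: Literal['csv', 'json'] = 'csv',
    **kwargs: Any,
) -> str:
    # Route the extra keyword arguments to the chosen serializer,
    # mirroring the dispatch added in this diff.
    buffer = io.StringIO()
    if content_type == 'csv':
        writer = csv.writer(buffer, **kwargs)
        writer.writerows([rows[0].keys(), *[row.values() for row in rows]])
    elif content_type == 'json':
        json.dump(rows, buffer, **kwargs)
    else:
        raise ValueError(f'Unsupported content type: {content_type}.')
    return buffer.getvalue()


async def demo() -> None:
    rows = [{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}]
    print(await export_data(rows, content_type='json', indent=2, sort_keys=True))
    print(await export_data(rows, content_type='csv', delimiter=';'))


if __name__ == '__main__':
    asyncio.run(demo())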
72 changes: 60 additions & 12 deletions src/crawlee/storages/_dataset.py
@@ -77,6 +77,32 @@ class ExportToKwargs(TypedDict):
     """Name of the key-value store to save the exported file."""


+class ExportDataJsonKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_json` method.
+    Args:
+        ensure_ascii: Whether non-ASCII characters should be escaped in the output JSON string.
+        indent: Specifies the number of spaces to use for indentation in the pretty-printed JSON output.
+        sort_keys: Specifies whether the output JSON object should have its keys sorted alphabetically.
+    """
+
+    ensure_ascii: NotRequired[bool]
+    indent: NotRequired[int]
+    sort_keys: NotRequired[bool]
+
+
+class ExportDataCsvKwargs(TypedDict):
+    """Keyword arguments for dataset's `export_data_csv` method.
+    Args:
+        delimiter: A character that separates fields in the CSV file.
+        quotechar: A character used to enclose fields containing special characters like the delimiter.
+        quoting: An integer that defines how quotes should be applied.
+    """
+
+    delimiter: NotRequired[str]
+    quotechar: NotRequired[str]
+    quoting: NotRequired[int]
+
+
 class Dataset(BaseStorage):
     """Represents an append-only structured storage, ideal for tabular data similar to database tables.

@@ -212,12 +238,12 @@ async def get_data(self, **kwargs: Unpack[GetDataKwargs]) -> DatasetItemsListPage:
         # https://github.com/apify/apify-sdk-python/issues/140
         return await self._resource_client.list_items(**kwargs)

-    async def write_to(self, content_type: Literal['json', 'csv'], destination: TextIO) -> None:
+    async def write_to_csv(self, destination: TextIO, **kwargs: Unpack[ExportDataCsvKwargs]) -> None:
         """Exports the entire dataset into an arbitrary stream.

         Args:
-            content_type: Specifies the output format.
             destination: The stream into which the dataset contents should be written.
+            kwargs: Additional keyword arguments for `csv.writer`.
         """
         items: list[dict] = []
         limit = 1000
@@ -230,16 +256,33 @@ async def write_to(self, content_type: Literal['json', 'csv'], destination: TextIO) -> None:
                 break
             offset += list_items.count

-        if content_type == 'csv':
-            if items:
-                writer = csv.writer(destination, quoting=csv.QUOTE_MINIMAL)
-                writer.writerows([items[0].keys(), *[item.values() for item in items]])
-            else:
-                logger.warning('Attempting to export an empty dataset - no file will be created')
-        elif content_type == 'json':
-            json.dump(items, destination)
+        if items:
+            writer = csv.writer(destination, **kwargs)
+            writer.writerows([items[0].keys(), *[item.values() for item in items]])
+        else:
+            logger.warning('Attempting to export an empty dataset - no file will be created')
+
+    async def write_to_json(self, destination: TextIO, **kwargs: Unpack[ExportDataJsonKwargs]) -> None:
+        """Exports the entire dataset into an arbitrary stream.
+        Args:
+            destination: The stream into which the dataset contents should be written.
+            kwargs: Additional keyword arguments for `json.dump`.
+        """
+        items: list[dict] = []
+        limit = 1000
+        offset = 0
+
+        while True:
+            list_items = await self._resource_client.list_items(limit=limit, offset=offset)
+            items.extend(list_items.items)
+            if list_items.total <= offset + list_items.count:
+                break
+            offset += list_items.count
+
+        if items:
+            json.dump(items, destination, **kwargs)
         else:
-            raise ValueError(f'Unsupported content type: {content_type}')
+            logger.warning('Attempting to export an empty dataset - no file will be created')

     async def export_to(self, **kwargs: Unpack[ExportToKwargs]) -> None:
         """Exports the entire dataset into a specified file stored under a key in a key-value store.
@@ -260,7 +303,12 @@ async def export_to(self, **kwargs: Unpack[ExportToKwargs]) -> None:
         key_value_store = await KeyValueStore.open(id=to_key_value_store_id, name=to_key_value_store_name)

         output = io.StringIO()
-        await self.write_to(content_type, output)
+        if content_type == 'csv':
+            await self.write_to_csv(output)
+        elif content_type == 'json':
+            await self.write_to_json(output)
+        else:
+            raise ValueError('Unsupported content type, expecting CSV or JSON')

         if content_type == 'csv':
             await key_value_store.set_value(key, output.getvalue(), 'text/csv')
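The net effect of the `_dataset.py` changes is that `write_to()` splits into `write_to_csv()` and `write_to_json()`, each forwarding the fields of `ExportDataCsvKwargs` / `ExportDataJsonKwargs` verbatim to `csv.writer` and `json.dump`. A hedged usage sketch follows; it assumes the `Dataset` API behaves as shown in this diff and writes both formats into in-memory streams.

import asyncio
import io

from crawlee.storages import Dataset


async def main() -> None:
    dataset = await Dataset.open(name='example')
    await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])

    # CSV into an in-memory stream; `delimiter` comes from ExportDataCsvKwargs.
    csv_buffer = io.StringIO()
    await dataset.write_to_csv(csv_buffer, delimiter=';')
    print(csv_buffer.getvalue())

    # JSON into an in-memory stream; `indent` and `sort_keys` come from ExportDataJsonKwargs.
    json_buffer = io.StringIO()
    await dataset.write_to_json(json_buffer, indent=2, sort_keys=True)
    print(json_buffer.getvalue())


if __name__ == '__main__':
    asyncio.run(main())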
31 changes: 29 additions & 2 deletions tests/unit/basic_crawler/test_basic_crawler.py
@@ -585,8 +585,8 @@ async def test_crawler_push_and_export_data(tmp_path: Path) -> None:
     await dataset.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
     await dataset.push_data({'id': 2, 'test': 'test'})

-    await crawler.export_data(tmp_path / 'dataset.json')
-    await crawler.export_data(tmp_path / 'dataset.csv')
+    await crawler.export_data_json(path=tmp_path / 'dataset.json')
+    await crawler.export_data_csv(path=tmp_path / 'dataset.csv')

     assert json.load((tmp_path / 'dataset.json').open()) == [
         {'id': 0, 'test': 'test'},
@@ -618,6 +618,33 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert (tmp_path / 'dataset.csv').read_bytes() == b'id,test\r\n0,test\r\n1,test\r\n2,test\r\n'


+async def test_crawler_push_and_export_data_and_json_dump_parameter(httpbin: str, tmp_path: Path) -> None:
+    crawler = BasicCrawler()
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])
+        await context.push_data({'id': 2, 'test': 'test'})
+
+    await crawler.run([f'{httpbin}/1'])
+
+    await crawler.export_data_json(path=tmp_path / 'dataset.json', indent=3)
+
+    with (tmp_path / 'dataset.json').open() as json_file:
+        exported_json_str = json_file.read()
+
+    # Expected data in JSON format with 3 spaces indent
+    expected_data = [
+        {'id': 0, 'test': 'test'},
+        {'id': 1, 'test': 'test'},
+        {'id': 2, 'test': 'test'},
+    ]
+    expected_json_str = json.dumps(expected_data, indent=3)
+
+    # Assert that the exported JSON string matches the expected JSON string
+    assert exported_json_str == expected_json_str
+
+
 async def test_context_update_kv_store() -> None:
     crawler = BasicCrawler()

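The new test only exercises the JSON path. A companion CSV test is not part of this diff; the sketch below is a hedged suggestion of what one could look like, reusing the module's existing imports and `httpbin` fixture and assuming `export_data_csv` forwards its keyword arguments to `csv.writer`.

async def test_crawler_push_and_export_data_and_csv_writer_parameter(httpbin: str, tmp_path: Path) -> None:
    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        await context.push_data([{'id': 0, 'test': 'test'}, {'id': 1, 'test': 'test'}])

    await crawler.run([f'{httpbin}/1'])

    # `delimiter=';'` would be forwarded to `csv.writer`, so fields are separated by semicolons.
    await crawler.export_data_csv(path=tmp_path / 'dataset.csv', delimiter=';')

    assert (tmp_path / 'dataset.csv').read_bytes() == b'id;test\r\n0;test\r\n1;test\r\n'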