refactor(dataset): raise if user tries to materialize a dataset
gforsyth committed Sep 24, 2024
1 parent 70b0c2e commit 6a0dc25
Showing 3 changed files with 59 additions and 21 deletions.
4 changes: 2 additions & 2 deletions ibis/backends/duckdb/__init__.py
@@ -1609,8 +1609,8 @@ def _in_memory_table_exists(self, name: str) -> bool:
         return True
 
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
-        if hasattr(op.data, "to_pyarrow_lazy"):
-            self.con.register(op.name, op.data.to_pyarrow_lazy(op.schema))
+        if hasattr(op.data, "to_pyarrow_dataset"):
+            self.con.register(op.name, op.data.to_pyarrow_dataset(op.schema))
         else:
             self.con.register(op.name, op.data.to_pyarrow(op.schema))

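For context on the dispatch above: DuckDB can register a pyarrow dataset directly and scan it lazily, which is why the backend prefers the `to_pyarrow_dataset` hook over materializing via `to_pyarrow`. A minimal sketch with plain duckdb and pyarrow (the tiny in-memory table is made up; real use would point at Parquet files on disk):

    import duckdb
    import pyarrow as pa
    import pyarrow.dataset as ds

    # A tiny in-memory dataset stands in for a real on-disk Parquet dataset.
    dataset = ds.dataset(pa.table({"a": ["x", "y"], "b": [1, 2]}))

    con = duckdb.connect()
    con.register("t", dataset)  # register the dataset; nothing is materialized here

    # DuckDB scans the dataset at query time, so the filter is pushed into the scan.
    print(con.execute("SELECT b FROM t WHERE a = 'x'").fetchall())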
21 changes: 20 additions & 1 deletion ibis/backends/tests/test_client.py
@@ -929,7 +929,26 @@ def test_self_join_memory_table(backend, con, monkeypatch):
             lambda: ds.dataset(pa.table({"a": ["a"], "b": [1]})),
             "df_arrow_dataset",
             marks=[
-                pytest.mark.notimpl(["polars"]),
+                pytest.mark.notimpl(
+                    [
+                        "bigquery",
+                        "clickhouse",
+                        "exasol",
+                        "impala",
+                        "mssql",
+                        "mysql",
+                        "oracle",
+                        "postgres",
+                        "pyspark",
+                        "risingwave",
+                        "snowflake",
+                        "sqlite",
+                        "trino",
+                    ],
+                    raises=com.UnsupportedOperationError,
+                    reason="we don't materialize datasets to avoid perf footguns",
+                ),
+                pytest.mark.notimpl(["polars"], raises=NotImplementedError),
             ],
             id="pyarrow dataset",
         ),
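A sketch of the user-facing behavior these marks encode, assuming a local DuckDB connection and made-up table contents: `ibis.memtable` still accepts a pyarrow Dataset, backends that can consume the dataset object keep working lazily, and the backends listed above now raise `com.UnsupportedOperationError` instead of silently materializing:

    import pyarrow as pa
    import pyarrow.dataset as ds

    import ibis

    dataset = ds.dataset(pa.table({"a": ["a"], "b": [1]}))
    t = ibis.memtable(dataset)

    # DuckDB registers the dataset itself (see the backend change above),
    # so this stays lazy and does not copy the data into memory.
    con = ibis.duckdb.connect()
    print(con.execute(t.count()))

    # On the backends in the notimpl list, execution raises instead of
    # materializing; to materialize on purpose, build the memtable from a
    # pyarrow Table explicitly:
    t_eager = ibis.memtable(dataset.to_table())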
55 changes: 37 additions & 18 deletions ibis/formats/pyarrow.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
-from functools import cached_property
 from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
 import pyarrow_hotfix  # noqa: F401
 
+import ibis.common.exceptions as com
 import ibis.expr.datatypes as dt
 from ibis.expr.schema import Schema
 from ibis.formats import DataMapper, SchemaMapper, TableProxy, TypeMapper
@@ -349,8 +349,15 @@ def to_polars(self, schema: Schema) -> pl.DataFrame:
 
 
 class PyArrowDatasetProxy(TableProxy):
-    __slots__ = ("obj", "__dict__")
+    __slots__ = ("obj",)
     obj: V
+
+    """You are trying to use a PyArrow Dataset with a backend
+    that will require materializing the entire dataset in local
+    memory.
+    If you would like to materialize this dataset, please construct the
+    memtable directly by running `ibis.memtable(my_dataset.to_table())`
+    """
 
     def __init__(self, obj: V):
         self.obj = obj
@@ -362,30 +369,42 @@ def __len__(self):
     def __hash__(self):
         return hash(self.obj)
 
-    @cached_property
-    def _cache(self):
-        return self.obj.to_table()
-
     def to_frame(self) -> pd.DataFrame:
         """Convert this input to a pandas DataFrame."""
-        return self._cache.to_pandas()
+        raise com.UnsupportedOperationError(
+            """You are trying to use a PyArrow Dataset with a backend
+            that will require materializing the entire dataset in local
+            memory.
+            If you would like to materialize this dataset, please construct the
+            memtable directly by running `ibis.memtable(my_dataset.to_table())`
+            """
+        )
 
     def to_pyarrow(self, schema: Schema) -> pa.Table:
         """Convert this input to a PyArrow Table."""
-        return self._cache
+        raise com.UnsupportedOperationError(
+            """You are trying to use a PyArrow Dataset with a backend
+            that will require materializing the entire dataset in local
+            memory.
+            If you would like to materialize this dataset, please construct the
+            memtable directly by running `ibis.memtable(my_dataset.to_table())`
+            """
+        )
 
-    def to_pyarrow_lazy(self, schema: Schema) -> ds.Dataset:
+    def to_pyarrow_dataset(self, schema: Schema) -> ds.Dataset:
        """Return the dataset object itself.
 
         Use with backends that can perform pushdowns into dataset objects.
         """
         return self.obj
 
-    def to_polars(self, schema: Schema) -> pl.DataFrame:
+    def to_polars(self, schema: Schema) -> pa.Table:
         """Convert this input to a Polars DataFrame."""
-        import polars as pl
-
-        from ibis.formats.polars import PolarsData
-
-        df = pl.from_arrow(self._cache)
-        return PolarsData.convert_table(df, schema)
+        raise com.UnsupportedOperationError(
+            """You are trying to use a PyArrow Dataset with a backend
+            that will require materializing the entire dataset in local
+            memory.
+            If you would like to materialize this dataset, please construct the
+            memtable directly by running `ibis.memtable(my_dataset.to_table())`
+            """
+        )
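As a usage note, a hedged sketch of how downstream code might handle the new error on a backend that cannot consume a dataset lazily; SQLite is used here only because it appears in the notimpl list above, and the recovery path simply follows the error message:

    import pyarrow as pa
    import pyarrow.dataset as ds

    import ibis
    import ibis.common.exceptions as com

    dataset = ds.dataset(pa.table({"a": ["a"], "b": [1]}))
    con = ibis.sqlite.connect()  # one of the backends marked notimpl above

    t = ibis.memtable(dataset)
    try:
        print(con.execute(t.count()))
    except com.UnsupportedOperationError:
        # Opt in to materialization explicitly, as the error message suggests.
        t = ibis.memtable(dataset.to_table())
        print(con.execute(t.count()))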
