diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py index 7b17773631e54..b1f5a1f074532 100644 --- a/ibis/backends/duckdb/__init__.py +++ b/ibis/backends/duckdb/__init__.py @@ -1609,8 +1609,8 @@ def _in_memory_table_exists(self, name: str) -> bool: return True def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: - if hasattr(op.data, "to_pyarrow_lazy"): - self.con.register(op.name, op.data.to_pyarrow_lazy(op.schema)) + if hasattr(op.data, "to_pyarrow_dataset"): + self.con.register(op.name, op.data.to_pyarrow_dataset(op.schema)) else: self.con.register(op.name, op.data.to_pyarrow(op.schema)) diff --git a/ibis/backends/tests/test_client.py b/ibis/backends/tests/test_client.py index 4b0b2d6ab65bd..652592642911a 100644 --- a/ibis/backends/tests/test_client.py +++ b/ibis/backends/tests/test_client.py @@ -929,7 +929,26 @@ def test_self_join_memory_table(backend, con, monkeypatch): lambda: ds.dataset(pa.table({"a": ["a"], "b": [1]})), "df_arrow_dataset", marks=[ - pytest.mark.notimpl(["polars"]), + pytest.mark.notimpl( + [ + "bigquery", + "clickhouse", + "exasol", + "impala", + "mssql", + "mysql", + "oracle", + "postgres", + "pyspark", + "risingwave", + "snowflake", + "sqlite", + "trino", + ], + raises=com.UnsupportedOperationError, + reason="we don't materialize datasets to avoid perf footguns", + ), + pytest.mark.notimpl(["polars"], raises=NotImplementedError), ], id="pyarrow dataset", ), diff --git a/ibis/formats/pyarrow.py b/ibis/formats/pyarrow.py index 53d1e728d9e82..cb968b6a6b16d 100644 --- a/ibis/formats/pyarrow.py +++ b/ibis/formats/pyarrow.py @@ -1,11 +1,11 @@ from __future__ import annotations -from functools import cached_property from typing import TYPE_CHECKING, Any import pyarrow as pa import pyarrow_hotfix # noqa: F401 +import ibis.common.exceptions as com import ibis.expr.datatypes as dt from ibis.expr.schema import Schema from ibis.formats import DataMapper, SchemaMapper, TableProxy, TypeMapper @@ -349,8 +349,15 
@@ def to_polars(self, schema: Schema) -> pl.DataFrame:
 
 
 class PyArrowDatasetProxy(TableProxy):
-    __slots__ = ("obj", "__dict__")
+    __slots__ = ("obj",)
     obj: V
+    # Shared by every conversion that would have to load the whole dataset.
+    ERROR_MESSAGE = (
+        "You are trying to use a PyArrow Dataset with a backend that will "
+        "require materializing the entire dataset in local memory.\n\n"
+        "If you would like to materialize this dataset, please construct the "
+        "memtable directly by running `ibis.memtable(my_dataset.to_table())`"
+    )
 
     def __init__(self, obj: V):
         self.obj = obj
@@ -362,30 +369,21 @@ def __len__(self):
     def __hash__(self):
         return hash(self.obj)
 
-    @cached_property
-    def _cache(self):
-        return self.obj.to_table()
-
     def to_frame(self) -> pd.DataFrame:
-        """Convert this input to a pandas DataFrame."""
-        return self._cache.to_pandas()
+        """Always raise: a pandas DataFrame would load the whole dataset."""
+        raise com.UnsupportedOperationError(self.ERROR_MESSAGE)
 
     def to_pyarrow(self, schema: Schema) -> pa.Table:
-        """Convert this input to a PyArrow Table."""
-        return self._cache
+        """Always raise: a PyArrow Table would load the whole dataset."""
+        raise com.UnsupportedOperationError(self.ERROR_MESSAGE)
 
-    def to_pyarrow_lazy(self, schema: Schema) -> ds.Dataset:
+    def to_pyarrow_dataset(self, schema: Schema) -> ds.Dataset:
         """Return the dataset object itself.
 
         Use with backends that can perform pushdowns into dataset objects.
         """
         return self.obj
 
     def to_polars(self, schema: Schema) -> pl.DataFrame:
-        """Convert this input to a Polars DataFrame."""
-        import polars as pl
-
-        from ibis.formats.polars import PolarsData
-
-        df = pl.from_arrow(self._cache)
-        return PolarsData.convert_table(df, schema)
+        """Always raise: a Polars DataFrame would load the whole dataset."""
+        raise com.UnsupportedOperationError(self.ERROR_MESSAGE)