feat: Implement partial "lazy" support for DuckDB (even with this PR, DuckDB support is work-in-progress!) (#1725)
narwhals-dev · Jan 6, 2025 · aa48faa · aa48faa
1 parent e56f91d
commit aa48faa
Show file tree

Hide file tree

Showing 81 changed files with 2,064 additions and 217 deletions.
diff --git a/README.md b/README.md
@@ -14,8 +14,7 @@
 Extremely lightweight and extensible compatibility layer between dataframe libraries!
 
 - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
-- **Lazy-only support**: Dask
-- **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol
+- **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark.
 
 Seamlessly support all, without depending on any!
 

diff --git a/docs/backcompat.md b/docs/backcompat.md
@@ -111,6 +111,10 @@ before making any change.
 
 ### After `stable.v1`
 
+
+- Since Narwhals 1.21, passing a `DuckDBPyRelation` to `from_native` returns a `LazyFrame`. In
+  `narwhals.stable.v1`, it returns a `DataFrame` with `level='interchange'`.
+
 - Since Narwhals 1.15, `Series` is generic in the native Series, meaning that you can
   write:
   ```python

diff --git a/docs/basics/dataframe_conversion.md b/docs/basics/dataframe_conversion.md
@@ -14,6 +14,7 @@ To illustrate, we create dataframes in various formats:
 ```python exec="1" source="above" session="conversion"
 import narwhals as nw
 from narwhals.typing import IntoDataFrame
+from typing import Any
 
 import duckdb
 import polars as pl
@@ -45,11 +46,15 @@ print(df_to_pandas(df_polars))
 
 ### Via PyCapsule Interface
 
-Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals.
+Similarly, if your library uses Polars internally, you can use Narwhals to convert any user-supplied
+dataframe which implements `__arrow_c_stream__` to Polars format:
 
 ```python exec="1" source="above" session="conversion" result="python"
-def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
-    return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native()
+def df_to_polars(df_native: Any) -> pl.DataFrame:
+    if hasattr(df_native, "__arrow_c_stream__"):  # PyCapsule Interface: input can export an Arrow C stream
+        return nw.from_arrow(df_native, native_namespace=pl).to_native()
+    msg = f"Expected object which implements '__arrow_c_stream__' got: {type(df_native)}"
+    raise TypeError(msg)
 
 
 print(df_to_polars(df_duckdb))  # You can only execute this line of code once.
@@ -66,8 +71,9 @@ If you need to ingest the same dataframe multiple times, then you may want to go
 This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving:
 
 ```python exec="1" source="above" session="conversion" result="python"
-def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
-    return pl.DataFrame(nw.from_native(df).to_arrow())
+def df_to_polars(df_native: IntoDataFrame) -> pl.DataFrame:
+    df = nw.from_native(df_native).lazy().collect()  # lazy inputs (e.g. DuckDB) are collected first
+    return pl.DataFrame(nw.from_native(df, eager_only=True).to_arrow())
 
 
 df_duckdb = duckdb.sql("SELECT * FROM df_polars")

diff --git a/docs/extending.md b/docs/extending.md
@@ -15,17 +15,16 @@ Currently, Narwhals has **full API** support for the following libraries:
 It also has **lazy-only** support for [Dask](https://github.com/dask/dask), and **interchange** support
 for [DuckDB](https://github.com/duckdb/duckdb) and [Ibis](https://github.com/ibis-project/ibis).
 
+We are working towards full "lazy-only" support for DuckDB, Ibis, and PySpark.
+
 ### Levels of support
 
 Narwhals comes with three levels of support:
 
 - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
-- **Lazy-only support**: Dask
+- **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark.
 - **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol
 
-The lazy-only layer is a major item on our 2025 roadmap, and hope to be able to bring libraries currently in
-the "interchange" level into that one.
-
 Libraries for which we have full support can benefit from the whole
 [Narwhals API](./api-reference/index.md).
 

diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
@@ -16,6 +16,7 @@
 from narwhals._arrow.utils import validate_dataframe_comparand
 from narwhals._expression_parsing import evaluate_into_exprs
 from narwhals.dependencies import is_numpy_array
+from narwhals.exceptions import ColumnNotFoundError
 from narwhals.utils import Implementation
 from narwhals.utils import flatten
 from narwhals.utils import generate_temporary_column_name
@@ -669,6 +670,9 @@ def unique(
         import pyarrow.compute as pc
 
         df = self._native_frame
+        if subset is not None and any(x not in self.columns for x in subset):
+            msg = f"Column(s) {subset} not found in {self.columns}"
+            raise ColumnNotFoundError(msg)
         subset = subset or self.columns
 
         if keep in {"any", "first", "last"}:

diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py
@@ -11,6 +11,7 @@
 from narwhals._dask.utils import parse_exprs_and_named_exprs
 from narwhals._pandas_like.utils import native_to_narwhals_dtype
 from narwhals._pandas_like.utils import select_columns_by_name
+from narwhals.exceptions import ColumnNotFoundError
 from narwhals.typing import CompliantLazyFrame
 from narwhals.utils import Implementation
 from narwhals.utils import flatten
@@ -197,6 +198,9 @@ def unique(
         *,
         keep: Literal["any", "none"] = "any",
     ) -> Self:
+        if subset is not None and any(x not in self.columns for x in subset):
+            msg = f"Column(s) {subset} not found in {self.columns}"
+            raise ColumnNotFoundError(msg)
         native_frame = self._native_frame
         if keep == "none":
             subset = subset or self.columns