Skip to content

Commit db1bc62

Browse files
authored
Chore/upgrade datafusion 44 (#973)
* Bump DataFusion version to 44
* Trait definition for plan properties now returns LexOrdering
* find_df_window_func was removed upstream
* Prepare and Execute variants were removed from LogicalPlan
* Substrait functions now take SessionState instead of SessionContext
* Remove unused import
* RuntimeConfig is now deprecated
* Switch from RuntimeConfig to RuntimeEnvBuilder
* Update return types on unit tests
* DF 44 changes the execution plan properties to have boundedness and emission type
* Initcap now returns stringview
* Bump datafusion version in example
1 parent 4b262be commit db1bc62

File tree

19 files changed

+529
-487
lines changed

19 files changed

+529
-487
lines changed

Cargo.lock

Lines changed: 408 additions & 375 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,10 @@ tokio = { version = "1.41", features = ["macros", "rt", "rt-multi-thread", "sync
3838
pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] }
3939
pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]}
4040
arrow = { version = "53", features = ["pyarrow"] }
41-
datafusion = { version = "43.0.0", features = ["pyarrow", "avro", "unicode_expressions"] }
42-
datafusion-substrait = { version = "43.0.0", optional = true }
43-
datafusion-proto = { version = "43.0.0" }
44-
datafusion-ffi = { version = "43.0.0" }
45-
datafusion-functions-window-common = { version = "43.0.0" }
41+
datafusion = { version = "44.0.0", features = ["pyarrow", "avro", "unicode_expressions"] }
42+
datafusion-substrait = { version = "44.0.0", optional = true }
43+
datafusion-proto = { version = "44.0.0" }
44+
datafusion-ffi = { version = "44.0.0" }
4645
prost = "0.13" # keep in line with `datafusion-substrait`
4746
uuid = { version = "1.11", features = ["v4"] }
4847
mimalloc = { version = "0.1", optional = true, default-features = false, features = ["local_dynamic_tls"] }

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ It is possible to configure runtime (memory and disk settings) and configuration
8787

8888
```python
8989
runtime = (
90-
RuntimeConfig()
90+
RuntimeEnvBuilder()
9191
.with_disk_manager_os()
9292
.with_fair_spill_pool(10000000)
9393
)

benchmarks/db-benchmark/groupby-datafusion.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from datafusion import (
2323
col,
2424
functions as f,
25-
RuntimeConfig,
25+
RuntimeEnvBuilder,
2626
SessionConfig,
2727
SessionContext,
2828
)
@@ -85,7 +85,9 @@ def execute(df):
8585

8686
# create a session context with explicit runtime and config settings
8787
runtime = (
88-
RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(64 * 1024 * 1024 * 1024)
88+
RuntimeEnvBuilder()
89+
.with_disk_manager_os()
90+
.with_fair_spill_pool(64 * 1024 * 1024 * 1024)
8991
)
9092
config = (
9193
SessionConfig()

benchmarks/tpch/tpch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def bench(data_path, query_path):
2828

2929
# create context
3030
# runtime = (
31-
# RuntimeConfig()
31+
# RuntimeEnvBuilder()
3232
# .with_disk_manager_os()
3333
# .with_fair_spill_pool(10000000)
3434
# )

docs/source/user-guide/configuration.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,18 @@ Configuration
1919
=============
2020

2121
Let's look at how we can configure DataFusion. When creating a :py:class:`~datafusion.context.SessionContext`, you can pass in
22-
a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeConfig` object. These two cover a wide range of options.
22+
a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeEnvBuilder` object. These two cover a wide range of options.
2323

2424
.. code-block:: python
2525
26-
from datafusion import RuntimeConfig, SessionConfig, SessionContext
26+
from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext
2727
2828
# create a session context with default settings
2929
ctx = SessionContext()
3030
print(ctx)
3131
3232
# create a session context with explicit runtime and config settings
33-
runtime = RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(10000000)
33+
runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
3434
config = (
3535
SessionConfig()
3636
.with_create_default_catalog_and_schema(True)
@@ -48,4 +48,4 @@ a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.conte
4848
4949
5050
You can read more about available :py:class:`~datafusion.context.SessionConfig` options in the `rust DataFusion Configuration guide <https://arrow.apache.org/datafusion/user-guide/configs.html>`_,
51-
and about :code:`RuntimeConfig` options in the rust `online API documentation <https://docs.rs/datafusion/latest/datafusion/execution/runtime_env/struct.RuntimeConfig.html>`_.
51+
and about :code:`RuntimeEnvBuilder` options in the rust `online API documentation <https://docs.rs/datafusion/latest/datafusion/execution/runtime_env/struct.RuntimeEnvBuilder.html>`_.

examples/create-context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
from datafusion import RuntimeConfig, SessionConfig, SessionContext
18+
from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext
1919

2020
# create a session context with default settings
2121
ctx = SessionContext()
2222
print(ctx)
2323

2424
# create a session context with explicit runtime and config settings
25-
runtime = RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(10000000)
25+
runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
2626
config = (
2727
SessionConfig()
2828
.with_create_default_catalog_and_schema(True)

examples/ffi-table-provider/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ version = "0.1.0"
2121
edition = "2021"
2222

2323
[dependencies]
24-
datafusion = { version = "43.0.0" }
25-
datafusion-ffi = { version = "43.0.0" }
24+
datafusion = { version = "44.0.0" }
25+
datafusion-ffi = { version = "44.0.0" }
2626
pyo3 = { version = "0.22.6", features = ["extension-module", "abi3", "abi3-py38"] }
2727
arrow = { version = "53.2.0" }
2828
arrow-array = { version = "53.2.0" }

python/datafusion/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from .context import (
3030
SessionContext,
3131
SessionConfig,
32-
RuntimeConfig,
32+
RuntimeEnvBuilder,
3333
SQLOptions,
3434
)
3535

@@ -66,7 +66,7 @@
6666
"SessionContext",
6767
"SessionConfig",
6868
"SQLOptions",
69-
"RuntimeConfig",
69+
"RuntimeEnvBuilder",
7070
"Expr",
7171
"ScalarUDF",
7272
"WindowFrame",

python/datafusion/context.py

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from __future__ import annotations
2121

2222
from ._internal import SessionConfig as SessionConfigInternal
23-
from ._internal import RuntimeConfig as RuntimeConfigInternal
23+
from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal
2424
from ._internal import SQLOptions as SQLOptionsInternal
2525
from ._internal import SessionContext as SessionContextInternal
2626

@@ -265,56 +265,58 @@ def set(self, key: str, value: str) -> SessionConfig:
265265
return self
266266

267267

268-
class RuntimeConfig:
268+
class RuntimeEnvBuilder:
269269
"""Runtime configuration options."""
270270

271271
def __init__(self) -> None:
272-
"""Create a new :py:class:`RuntimeConfig` with default values."""
273-
self.config_internal = RuntimeConfigInternal()
272+
"""Create a new :py:class:`RuntimeEnvBuilder` with default values."""
273+
self.config_internal = RuntimeEnvBuilderInternal()
274274

275-
def with_disk_manager_disabled(self) -> RuntimeConfig:
275+
def with_disk_manager_disabled(self) -> RuntimeEnvBuilder:
276276
"""Disable the disk manager, attempts to create temporary files will error.
277277
278278
Returns:
279-
A new :py:class:`RuntimeConfig` object with the updated setting.
279+
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
280280
"""
281281
self.config_internal = self.config_internal.with_disk_manager_disabled()
282282
return self
283283

284-
def with_disk_manager_os(self) -> RuntimeConfig:
284+
def with_disk_manager_os(self) -> RuntimeEnvBuilder:
285285
"""Use the operating system's temporary directory for disk manager.
286286
287287
Returns:
288-
A new :py:class:`RuntimeConfig` object with the updated setting.
288+
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
289289
"""
290290
self.config_internal = self.config_internal.with_disk_manager_os()
291291
return self
292292

293-
def with_disk_manager_specified(self, *paths: str | pathlib.Path) -> RuntimeConfig:
293+
def with_disk_manager_specified(
294+
self, *paths: str | pathlib.Path
295+
) -> RuntimeEnvBuilder:
294296
"""Use the specified paths for the disk manager's temporary files.
295297
296298
Args:
297299
paths: Paths to use for the disk manager's temporary files.
298300
299301
Returns:
300-
A new :py:class:`RuntimeConfig` object with the updated setting.
302+
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
301303
"""
302304
paths_list = [str(p) for p in paths]
303305
self.config_internal = self.config_internal.with_disk_manager_specified(
304306
paths_list
305307
)
306308
return self
307309

308-
def with_unbounded_memory_pool(self) -> RuntimeConfig:
310+
def with_unbounded_memory_pool(self) -> RuntimeEnvBuilder:
309311
"""Use an unbounded memory pool.
310312
311313
Returns:
312-
A new :py:class:`RuntimeConfig` object with the updated setting.
314+
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
313315
"""
314316
self.config_internal = self.config_internal.with_unbounded_memory_pool()
315317
return self
316318

317-
def with_fair_spill_pool(self, size: int) -> RuntimeConfig:
319+
def with_fair_spill_pool(self, size: int) -> RuntimeEnvBuilder:
318320
"""Use a fair spill pool with the specified size.
319321
320322
This pool works best when you know beforehand the query has multiple spillable
@@ -335,16 +337,16 @@ def with_fair_spill_pool(self, size: int) -> RuntimeConfig:
335337
size: Size of the memory pool in bytes.
336338
337339
Returns:
338-
A new :py:class:`RuntimeConfig` object with the updated setting.
340+
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
339341
340342
Example usage::
341343
342-
config = RuntimeConfig().with_fair_spill_pool(1024)
344+
config = RuntimeEnvBuilder().with_fair_spill_pool(1024)
343345
"""
344346
self.config_internal = self.config_internal.with_fair_spill_pool(size)
345347
return self
346348

347-
def with_greedy_memory_pool(self, size: int) -> RuntimeConfig:
349+
def with_greedy_memory_pool(self, size: int) -> RuntimeEnvBuilder:
348350
"""Use a greedy memory pool with the specified size.
349351
350352
This pool works well for queries that do not need to spill or have a single
@@ -355,32 +357,39 @@ def with_greedy_memory_pool(self, size: int) -> RuntimeConfig:
355357
size: Size of the memory pool in bytes.
356358
357359
Returns:
358-
A new :py:class:`RuntimeConfig` object with the updated setting.
360+
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
359361
360362
Example usage::
361363
362-
config = RuntimeConfig().with_greedy_memory_pool(1024)
364+
config = RuntimeEnvBuilder().with_greedy_memory_pool(1024)
363365
"""
364366
self.config_internal = self.config_internal.with_greedy_memory_pool(size)
365367
return self
366368

367-
def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeConfig:
369+
def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder:
368370
"""Use the specified path to create any needed temporary files.
369371
370372
Args:
371373
path: Path to use for temporary files.
372374
373375
Returns:
374-
A new :py:class:`RuntimeConfig` object with the updated setting.
376+
A new :py:class:`RuntimeEnvBuilder` object with the updated setting.
375377
376378
Example usage::
377379
378-
config = RuntimeConfig().with_temp_file_path("/tmp")
380+
config = RuntimeEnvBuilder().with_temp_file_path("/tmp")
379381
"""
380382
self.config_internal = self.config_internal.with_temp_file_path(str(path))
381383
return self
382384

383385

386+
@deprecated("Use `RuntimeEnvBuilder` instead.")
387+
class RuntimeConfig(RuntimeEnvBuilder):
388+
"""See `RuntimeEnvBuilder`."""
389+
390+
pass
391+
392+
384393
class SQLOptions:
385394
"""Options to be used when performing SQL queries."""
386395

@@ -454,7 +463,9 @@ class SessionContext:
454463
"""
455464

456465
def __init__(
457-
self, config: SessionConfig | None = None, runtime: RuntimeConfig | None = None
466+
self,
467+
config: SessionConfig | None = None,
468+
runtime: RuntimeEnvBuilder | None = None,
458469
) -> None:
459470
"""Main interface for executing queries with DataFusion.
460471

python/tests/test_context.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
from datafusion import (
2727
DataFrame,
28-
RuntimeConfig,
28+
RuntimeEnvBuilder,
2929
SessionConfig,
3030
SessionContext,
3131
SQLOptions,
@@ -43,7 +43,7 @@ def test_create_context_session_config_only():
4343

4444

4545
def test_create_context_runtime_config_only():
46-
SessionContext(runtime=RuntimeConfig())
46+
SessionContext(runtime=RuntimeEnvBuilder())
4747

4848

4949
@pytest.mark.parametrize("path_to_str", (True, False))
@@ -54,7 +54,7 @@ def test_runtime_configs(tmp_path, path_to_str):
5454
path1 = str(path1) if path_to_str else path1
5555
path2 = str(path2) if path_to_str else path2
5656

57-
runtime = RuntimeConfig().with_disk_manager_specified(path1, path2)
57+
runtime = RuntimeEnvBuilder().with_disk_manager_specified(path1, path2)
5858
config = SessionConfig().with_default_catalog_and_schema("foo", "bar")
5959
ctx = SessionContext(config, runtime)
6060
assert ctx is not None
@@ -67,7 +67,7 @@ def test_runtime_configs(tmp_path, path_to_str):
6767
def test_temporary_files(tmp_path, path_to_str):
6868
path = str(tmp_path) if path_to_str else tmp_path
6969

70-
runtime = RuntimeConfig().with_temp_file_path(path)
70+
runtime = RuntimeEnvBuilder().with_temp_file_path(path)
7171
config = SessionConfig().with_default_catalog_and_schema("foo", "bar")
7272
ctx = SessionContext(config, runtime)
7373
assert ctx is not None
@@ -77,7 +77,7 @@ def test_temporary_files(tmp_path, path_to_str):
7777

7878

7979
def test_create_context_with_all_valid_args():
80-
runtime = RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(10000000)
80+
runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
8181
config = (
8282
SessionConfig()
8383
.with_create_default_catalog_and_schema(True)

python/tests/test_functions.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,11 @@ def test_lit_arith(df):
103103
result = df.collect()
104104
assert len(result) == 1
105105
result = result[0]
106+
106107
assert result.column(0) == pa.array([5, 6, 7])
107-
assert result.column(1) == pa.array(["Hello!", "World!", "!!"])
108+
assert result.column(1) == pa.array(
109+
["Hello!", "World!", "!!"], type=pa.string_view()
110+
)
108111

109112

110113
def test_math_functions():
@@ -661,9 +664,12 @@ def test_array_function_obj_tests(stmt, py_expr):
661664
),
662665
(
663666
f.concat(column("a").cast(pa.string()), literal("?")),
664-
pa.array(["Hello?", "World?", "!?"]),
667+
pa.array(["Hello?", "World?", "!?"], type=pa.string_view()),
668+
),
669+
(
670+
f.initcap(column("c")),
671+
pa.array(["Hello ", " World ", " !"], type=pa.string_view()),
665672
),
666-
(f.initcap(column("c")), pa.array(["Hello ", " World ", " !"])),
667673
(f.left(column("a"), literal(3)), pa.array(["Hel", "Wor", "!"])),
668674
(f.length(column("c")), pa.array([6, 7, 2], type=pa.int32())),
669675
(f.lower(column("a")), pa.array(["hello", "world", "!"])),
@@ -871,8 +877,8 @@ def test_temporal_functions(df):
871877
result = df.collect()
872878
assert len(result) == 1
873879
result = result[0]
874-
assert result.column(0) == pa.array([12, 6, 7], type=pa.float64())
875-
assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.float64())
880+
assert result.column(0) == pa.array([12, 6, 7], type=pa.int32())
881+
assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.int32())
876882
assert result.column(2) == pa.array(
877883
[datetime(2022, 12, 1), datetime(2027, 6, 1), datetime(2020, 7, 1)],
878884
type=pa.timestamp("us"),
@@ -904,7 +910,7 @@ def test_temporal_functions(df):
904910
assert result.column(9) == pa.array(
905911
[datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us")
906912
)
907-
assert result.column(10) == pa.array([31, 26, 2], type=pa.float64())
913+
assert result.column(10) == pa.array([31, 26, 2], type=pa.int32())
908914

909915

910916
def test_arrow_cast(df):

0 commit comments

Comments
 (0)