Skip to content

Commit dee704c

Browse files
committed
Arrow output format for read operations
Detailed release notes to be written in PR description
1 parent 8e6763a commit dee704c

File tree

13 files changed

+2455
-135
lines changed

13 files changed

+2455
-135
lines changed

docs/mkdocs/docs/notebooks/ArcticDB_demo_read_as_arrow.ipynb

Lines changed: 2297 additions & 0 deletions
Large diffs are not rendered by default.

python/arcticdb/arctic.py

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,14 @@ def __init__(
8181
Can be overridden by specifying the encoding version in the LibraryOptions argument to create_library.
8282
8383
output_format: Union[OutputFormat, str], default = OutputFormat.PANDAS
84-
Controls the default output format of all operations returning a dataframe.
85-
The default behavior (OutputFormat.PANDAS) is to return `pandas.DataFrame`s or `pandas.Series` backed by
86-
numpy arrays.
87-
OutputFormat.EXPERIMENTAL_ARROW will return all dataframes as `pyarrow.Table`s. The arrow API is still
88-
experimental and the arrow layout might change in a minor release.
89-
Accepts the OutputFormat as either OutputFormat enum values or as case-insensitive strings like "pandas"
90-
and "experimental_arrow".
91-
92-
arrow_string_format_default: Union[ArrowOutputStringFormat, "pa.DataType"] = ArrowOutputStringFormat.LARGE_STRING
93-
Controls the default string format used for `OutputFormat.EXPERIMENTAL_ARROW`.
94-
See documentation of `ArrowOutputStringFormat` for more information on the different options.
84+
Default output format for all read operations on libraries created from this `Arctic` instance.
85+
Can be overridden per library or per read operation.
86+
See `OutputFormat` documentation for details on available formats.
87+
88+
arrow_string_format_default: Union[ArrowOutputStringFormat, "pa.DataType"], default = ArrowOutputStringFormat.LARGE_STRING
89+
Default string column format when using `PYARROW` or `POLARS` output formats.
90+
Can be overridden per library or per read operation.
91+
See `ArrowOutputStringFormat` documentation for details on available string formats.
9592
9693
Examples
9794
--------
@@ -196,14 +193,16 @@ def get_library(
196193
Unused if create_if_missing is False.
197194
198195
output_format: Optional[Union[OutputFormat, str]], default = None
199-
Controls the default output format of all operations on the library returning a dataframe.
200-
For more information see documentation of `Arctic.__init__`.
201-
If `None` uses the output format from the Arctic instance.
196+
Default output format for all read operations on this library.
197+
If `None`, uses the output format from the `Arctic` instance.
198+
Can be overridden per read operation.
199+
See `OutputFormat` documentation for details on available formats.
202200
203201
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
204-
If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
205-
See documentation of `ArrowOutputStringFormat` for more information on the different options.
206-
If `None` uses the default arrow_string_format from the `Library` instance.
202+
Default string column format when using `PYARROW` or `POLARS` output formats on this library.
203+
If `None`, uses the `arrow_string_format_default` from the `Arctic` instance.
204+
Can be overridden per read operation.
205+
See `ArrowOutputStringFormat` documentation for details on available string formats.
207206
208207
Examples
209208
--------
@@ -268,14 +267,16 @@ def create_library(
268267
EnterpriseLibraryOptions. These options are only relevant to ArcticDB enterprise users.
269268
270269
output_format: Optional[Union[OutputFormat, str]], default = None
271-
Controls the default output format of all operations on the library returning a dataframe.
272-
For more information see documentation of `Arctic.__init__`.
273-
If `None` uses the output format from the Arctic instance.
270+
Default output format for all read operations on this library.
271+
If `None`, uses the output format from the `Arctic` instance.
272+
Can be overridden per read operation.
273+
See `OutputFormat` documentation for details on available formats.
274274
275275
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
276-
If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
277-
See documentation of `ArrowOutputStringFormat` for more information on the different options.
278-
If `None` uses the default arrow_string_format from the `Library` instance.
276+
Default string column format when using `PYARROW` or `POLARS` output formats on this library.
277+
If `None`, uses the `arrow_string_format_default` from the `Arctic` instance.
278+
Can be overridden per read operation.
279+
See `ArrowOutputStringFormat` documentation for details on available string formats.
279280
280281
Examples
281282
--------

python/arcticdb/options.py

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -169,21 +169,44 @@ def __repr__(self):
169169

170170
# TODO: Use enum.StrEnum when we no longer need to support python 3.9
171171
class OutputFormat(str, Enum):
172+
"""
173+
Controls the output format of operations which return dataframes. All APIs which take an `output_format` argument
174+
accept the enum values and case-insensitive strings. E.g. all of `OutputFormat.PYARROW`, `"PYARROW"`, `"pyarrow"`
175+
will be interpreted as `OutputFormat.PYARROW`.
176+
177+
PANDAS (default):
178+
Dataframes are returned as `pandas.DataFrame` or `pandas.Series` objects backed by numpy arrays.
179+
180+
PYARROW:
181+
Dataframes are returned as `pyarrow.Table` objects using Apache Arrow's columnar memory format.
182+
Both `PYARROW` and `POLARS` output formats use the same underlying Arrow memory layout, ensuring
183+
zero-copy interoperability between them.
184+
Provides better performance than `PANDAS`, especially for dataframes containing many string columns.
185+
String format can be customized via `ArrowOutputStringFormat`.
186+
187+
POLARS:
188+
Dataframes are returned as `polars.DataFrame` objects using Apache Arrow's columnar memory format.
189+
Both `PYARROW` and `POLARS` output formats use the same underlying Arrow memory layout, ensuring
190+
zero-copy interoperability between them.
191+
Provides better performance than `PANDAS`, especially for dataframes containing many string columns.
192+
String format can be customized via `ArrowOutputStringFormat`.
193+
"""
194+
172195
PANDAS = "PANDAS"
173-
EXPERIMENTAL_ARROW = "EXPERIMENTAL_ARROW"
174-
EXPERIMENTAL_POLARS = "EXPERIMENTAL_POLARS"
196+
PYARROW = "PYARROW"
197+
POLARS = "POLARS"
175198

176199

177200
def output_format_to_internal(output_format: Union[OutputFormat, str]) -> InternalOutputFormat:
178201
if output_format.lower() == OutputFormat.PANDAS.lower():
179202
return InternalOutputFormat.PANDAS
180-
elif output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower():
203+
elif output_format.lower() == OutputFormat.PYARROW.lower():
181204
if not _PYARROW_AVAILABLE:
182205
raise ModuleNotFoundError(
183206
"ArcticDB's pyarrow optional dependency missing but is required to use arrow output format."
184207
)
185208
return InternalOutputFormat.ARROW
186-
elif output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
209+
elif output_format.lower() == OutputFormat.POLARS.lower():
187210
if not _PYARROW_AVAILABLE or not _POLARS_AVAILABLE:
188211
raise ModuleNotFoundError(
189212
"ArcticDB's pyarrow or polars optional dependencies are missing but are required to use polars output format."
@@ -195,24 +218,30 @@ def output_format_to_internal(output_format: Union[OutputFormat, str]) -> Intern
195218

196219
class ArrowOutputStringFormat(str, Enum):
197220
"""
198-
Used to specify string format when output_format=OutputFormat.EXPERIMENTAL_ARROW.
199-
Arguments allow specifying either the enum value or the corresponding pyarrow.DataType
221+
Controls the string column format when using `PYARROW` or `POLARS` output formats.
222+
Accepts either the enum value or the corresponding `pyarrow.DataType`.
200223
201224
LARGE_STRING (default):
202-
Produces string columns with type `pa.large_string()`. Total length of strings must fit in a 64-bit integer.
203-
Does not deduplicate strings, so has better performance for columns with many unique strings.
225+
Uses 64-bit variable-size encoding.
226+
PyArrow: `pa.large_string()`, Polars: `pl.String`
227+
Supports up to 2⁶³-1 bytes total string length per Arrow array.
228+
Best for general-purpose use and when working with large string data.
204229
205230
SMALL_STRING:
206-
Produces string columns with type `pa.string()`. Total length of strings must fit in a 32-bit integer.
207-
Does not deduplicate strings, so has better performance for columns with many unique strings.
208-
Slightly faster than `LARGE_STRING` but does not work with very long strings.
231+
Uses 32-bit variable-size encoding.
232+
PyArrow: `pa.string()`, Polars: `pl.String`
233+
Supports up to 2³¹-1 bytes total string length per Arrow array.
234+
Slightly more memory efficient than `LARGE_STRING` when string data is known to be small.
209235
210236
CATEGORICAL and DICTIONARY_ENCODED:
211-
Both are different aliases for the same string format. Produces string columns with type
212-
`pa.dictionary(pa.int32(), pa.large_string())`. Total length of strings must fit in a 64-bit integer. Splitting in
213-
record batches guarantees that 32-bit dictionary keys are sufficient.
214-
Does deduplicate strings, so has better performance for columns with few unique strings.
215-
237+
Both are aliases for dictionary-encoded strings with int32 indices.
238+
PyArrow: `pa.dictionary(pa.int32(), pa.large_string())`, Polars: `pl.Categorical`
239+
Best for columns with low cardinality (few unique values repeated many times).
240+
Deduplicates strings, reducing memory usage and improving performance when the number of
241+
unique values is much smaller than the total number of rows.
242+
243+
For more details on physical layouts, see the Apache Arrow specification:
244+
https://arrow.apache.org/docs/format/Columnar.html
216245
"""
217246

218247
CATEGORICAL = "CATEGORICAL"

python/arcticdb/util/arrow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def stringify_dictionary_encoded_columns(table, string_type=None):
2525

2626
def convert_arrow_to_pandas_for_tests(table):
2727
"""
28-
Converts `pa.Table` outputted via `output_format=OutputFormat.EXPERIMENTAL_ARROW` to a `pd.DataFrame` so it would
28+
Converts `pa.Table` outputted via `output_format=OutputFormat.PYARROW` to a `pd.DataFrame` so it would
2929
be identical to the one outputted via `output_format=OutputFormat.PANDAS`. This requires the following changes:
3030
- Replaces dictionary encoded string columns with regular string columns.
3131
- Fills null values in int columns with zeros.

python/arcticdb/version_store/_normalization.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,6 @@ def generate_original_column_names():
792792
pandas_meta = norm_meta.df.common
793793
elif input_type == "series":
794794
# For pandas series we always return a dataframe (to not lose the index information).
795-
# TODO: Return a `pyarrow.Array` if index is not physically stored (Monday ref: 9360502457)
796795
pandas_meta = norm_meta.series.common
797796
elif input_type == "experimental_arrow":
798797
if norm_meta.experimental_arrow.has_index:

python/arcticdb/version_store/_store.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ def _set_allow_arrow_input(self, allow_arrow_input: bool = True):
385385

386386
def _set_output_format_for_pipeline_tests(self, output_format):
387387
self.set_output_format(output_format)
388-
if output_format == OutputFormat.EXPERIMENTAL_ARROW:
388+
if output_format == OutputFormat.PYARROW:
389389
self._test_convert_arrow_back_to_pandas = True
390390

391391
@classmethod
@@ -2728,8 +2728,8 @@ def _adapt_frame_data(self, frame_data, norm, output_format):
27282728
)
27292729
if self._test_convert_arrow_back_to_pandas:
27302730
data = convert_arrow_to_pandas_for_tests(data)
2731-
if output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
2732-
data = pl.from_arrow(data)
2731+
if output_format.lower() == OutputFormat.POLARS.lower():
2732+
data = pl.from_arrow(data, rechunk=False)
27332733
else:
27342734
data = self._normalizer.denormalize(frame_data, norm)
27352735
if norm.HasField("custom"):

python/arcticdb/version_store/library.py

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2026,17 +2026,17 @@ def read(
20262026
on `LazyDataFrame` for more details.
20272027
20282028
output_format: Optional[Union[OutputFormat, str]], default=None
2029-
Controls the output format of the result dataframe.
2030-
For more information see documentation of `Arctic.__init__`.
2031-
If `None` uses the default output format from the `Library` instance.
2029+
Output format for the returned dataframe.
2030+
If `None`, uses the output format from the `Library` instance.
2031+
See `OutputFormat` documentation for details on available formats.
20322032
20332033
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
2034-
If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
2035-
See documentation of `ArrowOutputStringFormat` for more information on the different options.
2036-
If `None` uses the default arrow_string_format from the `Library` instance.
2034+
String column format when using `PYARROW` or `POLARS` output formats.
2035+
If `None`, uses the `arrow_string_format_default` from the `Library` instance.
2036+
See `ArrowOutputStringFormat` documentation for details on available string formats.
20372037
20382038
arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None
2039-
Provides per column name overrides for `arrow_string_format_default`
2039+
Per-column overrides for `arrow_string_format_default`. Keys are column names.
20402040
20412041
Returns
20422042
-------
@@ -2063,10 +2063,9 @@ def read(
20632063
1 6
20642064
2 7
20652065
2066-
Passing an output_format can change the resulting dataframe type. E.g. we can use the experimental arrow output
2067-
format:
2066+
Passing an output_format can change the resulting dataframe type. For example, to return a PyArrow table:
20682067
2069-
>>> lib.read("symbol", output_format="EXPERIMENTAL_ARROW").data
2068+
>>> lib.read("symbol", output_format="PYARROW").data
20702069
pyarrow.Table
20712070
column: int64
20722071
----
@@ -2128,19 +2127,20 @@ def read_batch(
21282127
documentation on `LazyDataFrameCollection` for more details.
21292128
21302129
output_format: Optional[Union[OutputFormat, str]], default=None
2131-
Controls the output format of the result dataframes.
2132-
For more information see documentation of `Arctic.__init__`.
2133-
If `None` uses the default output format from the `Library` instance.
2130+
Output format for the returned dataframes.
2131+
If `None`, uses the output format from the `Library` instance.
2132+
See `OutputFormat` documentation for details on available formats.
21342133
21352134
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
2136-
If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
2137-
See documentation of `ArrowOutputStringFormat` for more information on the different options.
2138-
It serves as the default for the entire batch. The string format settings inside the `ReadRequest`s will
2139-
override this batch level setting.
2135+
String column format when using `PYARROW` or `POLARS` output formats.
2136+
Serves as the default for the entire batch. String format settings in individual `ReadRequest` objects
2137+
override this batch-level setting.
2138+
If `None`, uses the `arrow_string_format_default` from the `Library` instance.
2139+
See `ArrowOutputStringFormat` documentation for details on available string formats.
21402140
2141-
arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None,
2142-
Provides per column name overrides for `arrow_string_format_default`. It is only applied to symbols which
2143-
don't have a `arrow_string_format_per_column` set in their `ReadRequest`.
2141+
arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None
2142+
Per-column overrides for `arrow_string_format_default`. Keys are column names.
2143+
Only applied to symbols that don't have `arrow_string_format_per_column` set in their `ReadRequest`.
21442144
21452145
Returns
21462146
-------
@@ -2309,17 +2309,17 @@ def read_batch_and_join(
23092309
individual dataframes, and will be applied to the joined data.
23102310
23112311
output_format: Optional[Union[OutputFormat, str]], default=None
2312-
Controls the output format of the result dataframe.
2313-
For more information see documentation of `Arctic.__init__`.
2314-
If `None` uses the default output format from the `Library` instance.
2312+
Output format for the returned joined dataframe.
2313+
If `None`, uses the output format from the `Library` instance.
2314+
See `OutputFormat` documentation for details on available formats.
23152315
23162316
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
2317-
If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
2318-
See documentation of `ArrowOutputStringFormat` for more information on the different options.
2319-
If `None` uses the default arrow_string_format from the `Library` instance.
2317+
String column format when using `PYARROW` or `POLARS` output formats.
2318+
If `None`, uses the `arrow_string_format_default` from the `Library` instance.
2319+
See `ArrowOutputStringFormat` documentation for details on available string formats.
23202320
23212321
arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None
2322-
Provides per column name overrides for `arrow_string_format_default`
2322+
Per-column overrides for `arrow_string_format_default`. Keys are column names.
23232323
23242324
Returns
23252325
-------

python/benchmarks/arrow.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def setup_cache(self):
4141
self.logger.info(f"SETUP_CACHE TIME: {time.time() - start}")
4242

4343
def _setup_cache(self):
44-
self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)
44+
self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)
4545
num_rows, date_ranges = self.params
4646
num_cols = 9 # 10 including the index column
4747
self.ac.delete_library(self.lib_name_prewritten)
@@ -63,7 +63,7 @@ def teardown(self, rows, date_range):
6363
del self.ac
6464

6565
def setup(self, rows, date_range):
66-
self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)
66+
self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)
6767
self.lib = self.ac.get_library(self.lib_name_prewritten)
6868
self.lib._nvs._set_allow_arrow_input()
6969
if date_range is None:
@@ -126,7 +126,7 @@ def _generate_table(self, num_rows, num_cols, unique_string_count):
126126
return pa.Table.from_pandas(df)
127127

128128
def _setup_cache(self):
129-
self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)
129+
self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)
130130
num_rows, date_ranges, unique_string_counts, arrow_string_format = self.params
131131
self.ac.delete_library(self.lib_name_prewritten)
132132
self.ac.create_library(self.lib_name_prewritten)
@@ -145,7 +145,7 @@ def teardown(self, rows, date_range, unique_string_count, arrow_string_format):
145145
del self.ac
146146

147147
def setup(self, rows, date_range, unique_string_count, arrow_string_format):
148-
self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)
148+
self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)
149149
self.lib = self.ac.get_library(self.lib_name_prewritten)
150150
self.lib._nvs._set_allow_arrow_input()
151151
if date_range is None:

0 commit comments

Comments
 (0)