2,321 changes: 2,321 additions & 0 deletions docs/mkdocs/docs/notebooks/ArcticDB_demo_read_as_arrow.ipynb
Collaborator review comment:
  • "batche" typo
  • Warning about lmdb already being open in the process should be removed if possible from the final version
  • We should use set_sorted on timeseries index columns to speed up subsequent query performance when we know this to be the case (probably a separate PR)
  • The tz_data symbol uses an unnamed index before that behaviour has been introduced; I would just give it a name to avoid confusion when reading the notebook from start to finish
  • In the filtering and column selection section, worth mentioning that timeseries indexes are always fetched, even if not specified in the columns list
  • "PyArrow is useful when you need direct access to Arrow's columnar format": I didn't understand exactly what this means. Does it expose more info on chunks than Polars?
  • In the duckdb example, is there an alternative to to_df() that returns Polars or PyArrow, to show Pandas isn't necessary in that workflow?
  • print(f"Number of record batches: {arrow_table.num_rows}") seems wrong
  • "When Pandas DataFrames have unnamed indexes, Arrow/Polars use the special column name index" Polars uses "None"
  • "Converting to Polars/PyArrow and back preserves Pandas metadata" - I thought Polars dropped the Pandas metadata?
  • Might be worth a cell on unnamed multiindexes

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions python/.asv/results/benchmarks.json

Large diffs are not rendered by default.

48 changes: 25 additions & 23 deletions python/arcticdb/arctic.py
@@ -81,17 +81,14 @@ def __init__(
Can be overridden by specifying the encoding version in the LibraryOptions argument to create_library.

output_format: Union[OutputFormat, str], default = OutputFormat.PANDAS
Controls the default output format of all operations returning a dataframe.
The default behavior (OutputFormat.PANDAS) is to return `pandas.DataFrame`s or `pandas.Series` backed by
numpy arrays.
OutputFormat.EXPERIMENTAL_ARROW will return all dataframes as `pyarrow.Table`s. The arrow API is still
experimental and the arrow layout might change in a minor release.
Accepts the OutputFormat as either OutputFormat enum values or as case-insensitive strings like "pandas"
and "experimental_arrow".

arrow_string_format_default: Union[ArrowOutputStringFormat, "pa.DataType"] = ArrowOutputStringFormat.LARGE_STRING
Controls the default string format used for `OutputFormat.EXPERIMENTAL_ARROW`.
See documentation of `ArrowOutputStringFormat` for more information on the different options.
Default output format for all read operations on libraries created from this `Arctic` instance.
Can be overridden per library or per read operation.
See `OutputFormat` documentation for details on available formats.

arrow_string_format_default: Union[ArrowOutputStringFormat, "pa.DataType"], default = ArrowOutputStringFormat.LARGE_STRING
Default string column format when using `PYARROW` or `POLARS` output formats.
Can be overridden per library or per read operation.
See `ArrowOutputStringFormat` documentation for details on available string formats.

Examples
--------
@@ -196,14 +193,16 @@ def get_library(
Unused if create_if_missing is False.

output_format: Optional[Union[OutputFormat, str]], default = None
Controls the default output format of all operations on the library returning a dataframe.
For more information see documentation of `Arctic.__init__`.
If `None` uses the output format from the Arctic instance.
Default output format for all read operations on this library.
If `None`, uses the output format from the `Arctic` instance.
Can be overridden per read operation.
See `OutputFormat` documentation for details on available formats.

arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
See documentation of `ArrowOutputStringFormat` for more information on the different options.
If `None` uses the default arrow_string_format from the `Library` instance.
Default string column format when using `PYARROW` or `POLARS` output formats on this library.
If `None`, uses the `arrow_string_format_default` from the `Arctic` instance.
Can be overridden per read operation.
See `ArrowOutputStringFormat` documentation for details on available string formats.

Examples
--------
@@ -268,14 +267,17 @@ def create_library(
EnterpriseLibraryOptions. These options are only relevant to ArcticDB enterprise users.

output_format: Optional[Union[OutputFormat, str]], default = None
Controls the default output format of all operations on the library returning a dataframe.
For more information see documentation of `Arctic.__init__`.
If `None` uses the output format from the Arctic instance.
Default output format for all read operations on this library.
If `None`, uses the output format from the `Arctic` instance.
Can be overridden per read operation.
See `OutputFormat` documentation for details on available formats.

arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
See documentation of `ArrowOutputStringFormat` for more information on the different options.
If `None` uses the default arrow_string_format from the `Library` instance.
Default string column format when using `PYARROW` or `POLARS` output formats on this library.
If `None`, uses the `arrow_string_format_default` from the `Arctic` instance.
Can be overridden per read operation.
See `ArrowOutputStringFormat` documentation for details on available string formats.
Note that this setting is only applied to the runtime `Library` instance and is not stored as part of the library configuration.

Examples
--------
64 changes: 47 additions & 17 deletions python/arcticdb/options.py
@@ -169,21 +169,40 @@ def __repr__(self):

# TODO: Use enum.StrEnum when we no longer need to support python 3.9
class OutputFormat(str, Enum):
"""
Controls the output format of operations which return dataframes. All APIs which take an `output_format` argument
accept the enum values and case-insensitive strings. E.g. all of `OutputFormat.PYARROW`, `"PYARROW"`, `"pyarrow"`
will be interpreted as `OutputFormat.PYARROW`.

PANDAS (default):
Dataframes are returned as `pandas.DataFrame` or `pandas.Series` objects backed by numpy arrays.

PYARROW:
Dataframes are returned as `pyarrow.Table` objects using Apache Arrow's columnar memory format.
Provides better performance than `PANDAS`, especially for dataframes containing many string columns.
String format can be customized via `ArrowOutputStringFormat`.

POLARS:
Dataframes are returned as `polars.DataFrame` objects using Apache Arrow's columnar memory format.
Provides better performance than `PANDAS`, especially for dataframes containing many string columns.
String format can be customized via `ArrowOutputStringFormat`.
"""

PANDAS = "PANDAS"
EXPERIMENTAL_ARROW = "EXPERIMENTAL_ARROW"
EXPERIMENTAL_POLARS = "EXPERIMENTAL_POLARS"
PYARROW = "PYARROW"
POLARS = "POLARS"
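The case-insensitive string handling described in the docstring can be sketched in isolation. This is a minimal stand-in, not ArcticDB's actual resolution logic (which lives in `output_format_to_internal`); `normalize_output_format` is a hypothetical helper:

```python
from enum import Enum


class OutputFormat(str, Enum):
    PANDAS = "PANDAS"
    PYARROW = "PYARROW"
    POLARS = "POLARS"


def normalize_output_format(value):
    # Accept either an OutputFormat member or a case-insensitive string,
    # so "pyarrow", "PYARROW" and OutputFormat.PYARROW are all equivalent.
    if isinstance(value, OutputFormat):
        return value
    return OutputFormat(value.upper())
```

Unrecognised strings fall through to `OutputFormat(...)` and raise a `ValueError`, mirroring the enum's own lookup behaviour.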


def output_format_to_internal(output_format: Union[OutputFormat, str]) -> InternalOutputFormat:
if output_format.lower() == OutputFormat.PANDAS.lower():
return InternalOutputFormat.PANDAS
elif output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower():
elif output_format.lower() == OutputFormat.PYARROW.lower():
if not _PYARROW_AVAILABLE:
raise ModuleNotFoundError(
"ArcticDB's pyarrow optional dependency is missing but is required to use arrow output format."
)
return InternalOutputFormat.ARROW
elif output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
elif output_format.lower() == OutputFormat.POLARS.lower():
if not _PYARROW_AVAILABLE or not _POLARS_AVAILABLE:
raise ModuleNotFoundError(
"ArcticDB's pyarrow or polars optional dependencies are missing but are required to use polars output format."
@@ -195,24 +214,31 @@ def output_format_to_internal(output_format: Union[OutputFormat, str]) -> Intern

class ArrowOutputStringFormat(str, Enum):
"""
Used to specify string format when output_format=OutputFormat.EXPERIMENTAL_ARROW.
Arguments allow specifying either the enum value or the corresponding pyarrow.DataType
Controls the string column format when using `PYARROW` or `POLARS` output formats.
Accepts either the enum value or the corresponding `pyarrow.DataType`.

LARGE_STRING (default):
Produces string columns with type `pa.large_string()`. Total length of strings must fit in a 64-bit integer.
Does not deduplicate strings, so has better performance for columns with many unique strings.
Uses 64-bit variable-size encoding.
PyArrow: `pa.large_string()`, Polars: `pl.String`
Supports up to 2⁶³-1 bytes total string length per Arrow array.
Best for general-purpose use and when working with large string data.

SMALL_STRING:
Produces string columns with type `pa.string()`. Total length of strings must fit in a 32-bit integer.
Does not deduplicate strings, so has better performance for columns with many unique strings.
Slightly faster than `LARGE_STRING` but does not work with very long strings.
Uses 32-bit variable-size encoding.
PyArrow: `pa.string()`, Polars: Not supported
Supports up to 2³¹-1 bytes total string length per Arrow array.
Only supported with PyArrow because Polars does not support small strings.
Slightly more memory efficient than `LARGE_STRING` when string data is known to be small.

CATEGORICAL and DICTIONARY_ENCODED:
Both are different aliases for the same string format. Produces string columns with type
`pa.dictionary(pa.int32(), pa.large_string())`. Total length of strings must fit in a 64-bit integer. Splitting in
record batches guarantees that 32-bit dictionary keys are sufficient.
Does deduplicate strings, so has better performance for columns with few unique strings.

Both are aliases for dictionary-encoded strings with int32 indices.
PyArrow: `pa.dictionary(pa.int32(), pa.large_string())`, Polars: `pl.Categorical`
Best for columns with low cardinality (few unique values repeated many times).
Deduplicates strings, reducing memory usage and improving performance when the number of
unique values is much smaller than the total number of rows.

For more details on physical layouts, see the Apache Arrow specification:
https://arrow.apache.org/docs/format/Columnar.html
"""

CATEGORICAL = "CATEGORICAL"
@@ -222,7 +248,7 @@ class ArrowOutputStringFormat(str, Enum):


def arrow_output_string_format_to_internal(
arrow_string_format: Union[ArrowOutputStringFormat, "pa.DataType"],
arrow_string_format: Union[ArrowOutputStringFormat, "pa.DataType"], output_format: Union[OutputFormat, str]
) -> InternalArrowOutputStringFormat:
if (
arrow_string_format == ArrowOutputStringFormat.CATEGORICAL
@@ -242,6 +268,10 @@ def output_format_to_internal(
or _PYARROW_AVAILABLE
and arrow_string_format == pa.string()
):
if output_format.lower() == OutputFormat.POLARS.lower():
raise ValueError(
"SMALL_STRING is not supported with POLARS output format. Please use LARGE_STRING instead."
)
return InternalArrowOutputStringFormat.SMALL_STRING
else:
raise ValueError(f"Unknown ArrowOutputStringFormat: {arrow_string_format}")
2 changes: 1 addition & 1 deletion python/arcticdb/util/arrow.py
@@ -25,7 +25,7 @@ def stringify_dictionary_encoded_columns(table, string_type=None):

def convert_arrow_to_pandas_for_tests(table):
"""
Converts `pa.Table` outputted via `output_format=OutputFormat.EXPERIMENTAL_ARROW` to a `pd.DataFrame` so it would
Converts `pa.Table` outputted via `output_format=OutputFormat.PYARROW` to a `pd.DataFrame` so it would
be identical to the one outputted via `output_format=OutputFormat.PANDAS`. This requires the following changes:
- Replaces dictionary encoded string columns with regular string columns.
- Fills null values in int columns with zeros.
1 change: 0 additions & 1 deletion python/arcticdb/version_store/_normalization.py
@@ -792,7 +792,6 @@ def generate_original_column_names():
pandas_meta = norm_meta.df.common
elif input_type == "series":
# For pandas series we always return a dataframe (to not lose the index information).
# TODO: Return a `pyarrow.Array` if index is not physically stored (Monday ref: 9360502457)
pandas_meta = norm_meta.series.common
elif input_type == "experimental_arrow":
if norm_meta.experimental_arrow.has_index:
37 changes: 17 additions & 20 deletions python/arcticdb/version_store/_store.py
@@ -385,7 +385,7 @@ def _set_allow_arrow_input(self, allow_arrow_input: bool = True):

def _set_output_format_for_pipeline_tests(self, output_format):
self.set_output_format(output_format)
if output_format == OutputFormat.EXPERIMENTAL_ARROW:
if output_format == OutputFormat.PYARROW:
self._test_convert_arrow_back_to_pandas = True

@classmethod
@@ -1284,21 +1284,17 @@ def batch_read(
For more information see the documentation for the QueryBuilder class.
i-th entry corresponds to i-th element of `symbols`.
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
Controls the default string format used for `ARROW` or `POLARS` output format.
See documentation of `ArrowOutputStringFormat` for more information on the different options.
It serves as the default for the entire batch.
arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None,
Controls the string format per column used for `ARROW` or `POLARS` output format.
See documentation of `ArrowOutputStringFormat` for more information on the different options.
It is applied to all symbols which don't have a `per_symbol_arrow_string_format_per_column` set.
per_symbol_arrow_string_format_default: Optional[List[Optional[Union[ArrowOutputStringFormat, "pa.DataType"]]]], default=None,
Controls the string format per column used for `ARROW` or `POLARS` output format.
See documentation of `ArrowOutputStringFormat` for more information on the different options.
It serves as the default per symbol. It overrides the global `arrow_string_format_default` setting
per_symbol_arrow_string_format_per_column: Optional[List[Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]]]], default=None,
Controls the string format per column used for `ARROW` or `POLARS` output format.
See documentation of `ArrowOutputStringFormat` for more information on the different options.
It defines the setting per symbol and per column. It overrides all other string format settings.
String column format when using `PYARROW` or `POLARS` output formats. Serves as the default for the entire batch.
See `ArrowOutputStringFormat` documentation for details on available string formats.
arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None
Per-column overrides for `arrow_string_format_default`. Keys are column names.
Applied to all symbols without a `per_symbol_arrow_string_format_per_column` set.
per_symbol_arrow_string_format_default: Optional[List[Optional[Union[ArrowOutputStringFormat, "pa.DataType"]]]], default=None
Per-symbol override for `arrow_string_format_default`. Overrides the batch-level default.
i-th entry corresponds to i-th element of `symbols`.
per_symbol_arrow_string_format_per_column: Optional[List[Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]]]], default=None
Per-symbol, per-column overrides. Takes precedence over all other string format settings.
i-th entry corresponds to i-th element of `symbols`.
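The precedence among the four string-format parameters can be sketched as a small resolver. This is an illustrative stand-in, not ArcticDB's implementation; in particular, the exact interplay between the per-column and per-symbol-default settings is my reading of the docs above, and all names are hypothetical:

```python
def resolve_string_format(
    column,
    symbol_idx,
    batch_default,
    per_column=None,
    per_symbol_default=None,
    per_symbol_per_column=None,
):
    """Most specific setting wins; falls back to the batch-level default."""
    per_symbol_per_column = per_symbol_per_column or []
    per_symbol_default = per_symbol_default or []
    per_column = per_column or {}
    # 1. Per-symbol, per-column override takes precedence over everything.
    if symbol_idx < len(per_symbol_per_column):
        overrides = per_symbol_per_column[symbol_idx]
        if overrides is not None and column in overrides:
            return overrides[column]
    # 2. Batch-wide per-column override (assumed to beat per-symbol defaults).
    if column in per_column:
        return per_column[column]
    # 3. Per-symbol default beats the batch-level default.
    if symbol_idx < len(per_symbol_default) and per_symbol_default[symbol_idx] is not None:
        return per_symbol_default[symbol_idx]
    # 4. Batch-level default.
    return batch_default
```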

Examples
--------
@@ -2159,12 +2155,13 @@ def _get_read_options_and_output_format(
proto_cfg,
global_default=ArrowOutputStringFormat.LARGE_STRING,
**kwargs,
)
),
output_format,
)
)
read_options.set_arrow_output_per_column_string_format(
{
key: arrow_output_string_format_to_internal(value)
key: arrow_output_string_format_to_internal(value, output_format)
for key, value in resolve_defaults(
"arrow_string_format_per_column", proto_cfg, global_default={}, **kwargs
).items()
@@ -2728,8 +2725,8 @@ def _adapt_frame_data(self, frame_data, norm, output_format):
)
if self._test_convert_arrow_back_to_pandas:
data = convert_arrow_to_pandas_for_tests(data)
if output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
data = pl.from_arrow(data)
if output_format.lower() == OutputFormat.POLARS.lower():
data = pl.from_arrow(data, rechunk=False)
Collaborator (author) comment:
This rechunk=False is the only behavior change apart from the enum renames.

It is done to get better performance on read (rechunking requires memory copies and allocations).
In most cases the chunks will be of reasonable sizes (100k rows), so the rechunking would not provide much benefit on polars operations.

Users can always call rechunk on their dataframes if needed and an example is given in the notebook.
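The cost being avoided can be modelled in plain Python. This is conceptual only: real Arrow chunks are contiguous typed buffers, not Python lists, but the shape of the work is the same:

```python
def rechunk(chunks):
    # Merging many chunks into one contiguous chunk copies every element
    # into a fresh allocation, which is the cost rechunk=False skips on read.
    merged = []
    for chunk in chunks:
        merged.extend(chunk)
    return [merged]
```

Deferring this means users pay the copy only if they actually need one contiguous chunk, by rechunking the dataframe themselves after the read.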

else:
data = self._normalizer.denormalize(frame_data, norm)
if norm.HasField("custom"):