diff --git a/docs/mkdocs/docs/notebooks/ArcticDB_demo_read_as_arrow.ipynb b/docs/mkdocs/docs/notebooks/ArcticDB_demo_read_as_arrow.ipynb
new file mode 100644
index 0000000000..726df25d7d
--- /dev/null
+++ b/docs/mkdocs/docs/notebooks/ArcticDB_demo_read_as_arrow.ipynb
@@ -0,0 +1,2321 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "d6c62b62-3509-4c2b-9d6f-bc3dc0187c72",
+ "metadata": {},
+ "source": [
+ "# ArcticDB Read as Arrow demo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6a677f71-55d8-446d-a372-8b5bb0bc7e02",
+ "metadata": {},
+ "source": [
+ "This notebook demonstrates ArcticDB's Arrow-based output formats: **PyArrow** and **Polars**. These formats provide performance improvements for string-heavy workloads, and enable better integration with modern Arrow-based table processing libraries like `polars` and `duckdb`.\n",
+ "\n",
+ "**Key benefits:**\n",
+ "- **Better performance**: Particularly for string columns (no GIL required)\n",
+ "- **Zero-copy integration with Arrow**: Zero-copy passing of dataframes from `arcticdb` to third-party libraries like `duckdb`\n",
+ "\n",
+ "**Notebook structure:**\n",
+ "1. Setup and basic usage\n",
+ "2. Polars output format\n",
+ "3. PyArrow output format and record batch structure\n",
+ "4. Configurable string formats\n",
+ "5. Pandas interoperability and column naming\n",
+ "6. Performance benchmarks"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ec71d190-49c7-4d24-bea9-c27b72f5a2b6",
+ "metadata": {},
+ "source": [
+ "# Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9b93432d-a9f5-4e32-8b39-be7cdcb382eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import polars as pl\n",
+ "import pyarrow as pa\n",
+ "from arcticdb import Arctic, LibraryOptions, OutputFormat, ArrowOutputStringFormat, where"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8df3c551-cbaf-4401-b9e7-427b25762a5f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ac = Arctic(\"lmdb://tmp/arrow_reads_demo\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "242dfd1e-da6c-4186-abf2-27c6ccac922f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ac.delete_library(\"arrow\")\n",
+ "lib = ac.create_library(\"arrow\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "174660b5-0c46-4152-b5c3-5e49a6ec76d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sym = \"test\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bd6573bb-1bbf-42ac-9c0c-0c99fa6df328",
+ "metadata": {},
+ "source": [
+ "# Polars Output Format\n",
+ "\n",
+ "All ArcticDB operations returning dataframes (e.g. `read`, `head`, `tail`, `read_batch`, `read_batch_and_join`) accept an `output_format` parameter which controls the format in which the data is returned. The default output format is still `OutputFormat.PANDAS`, which returns `pd.DataFrame`s as before.\n",
+ "\n",
+ "The Apache Arrow memory layout based output formats are `POLARS` which returns `polars.DataFrame` objects and `PYARROW` which returns `pyarrow.Table` objects.\n",
+ "\n",
+ "Let's see several examples of using the `POLARS` output format:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "6dd149ac-71c8-42a9-993e-10596975e161",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " col_int | \n",
+ " col_float | \n",
+ " col_str | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 10.0 | \n",
+ " value_0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ " 11.0 | \n",
+ " value_1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ " 12.0 | \n",
+ " value_2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ " 13.0 | \n",
+ " value_3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ " 14.0 | \n",
+ " value_4 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 5 | \n",
+ " 15.0 | \n",
+ " value_5 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 6 | \n",
+ " 16.0 | \n",
+ " value_6 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 7 | \n",
+ " 17.0 | \n",
+ " value_7 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 8 | \n",
+ " 18.0 | \n",
+ " value_8 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 9 | \n",
+ " 19.0 | \n",
+ " value_9 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " col_int col_float col_str\n",
+ "0 0 10.0 value_0\n",
+ "1 1 11.0 value_1\n",
+ "2 2 12.0 value_2\n",
+ "3 3 13.0 value_3\n",
+ "4 4 14.0 value_4\n",
+ "5 5 15.0 value_5\n",
+ "6 6 16.0 value_6\n",
+ "7 7 17.0 value_7\n",
+ "8 8 18.0 value_8\n",
+ "9 9 19.0 value_9"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## Basic Usage\n",
+ "\n",
+ "# Write some data with Pandas (writing Polars/Arrow directly is not yet supported)\n",
+ "df = pd.DataFrame({\n",
+ " \"col_int\": np.arange(10, dtype=np.int64),\n",
+ " \"col_float\": np.arange(10, 20, dtype=np.float64),\n",
+ " \"col_str\": [f\"value_{i}\" for i in range(10)]\n",
+ "})\n",
+ "lib.write(\"demo\", df)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0b8ef49-3b92-4805-872a-eb0875959f87",
+ "metadata": {},
+ "source": [
+ "Read back as a Polars DataFrame:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "df1c3b81-9727-4a3e-8e76-31878c82a4a7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Type: \n",
+ "Dtypes: [Int64, Float64, String]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (10, 3)| col_int | col_float | col_str |
|---|
| i64 | f64 | str |
| 0 | 10.0 | "value_0" |
| 1 | 11.0 | "value_1" |
| 2 | 12.0 | "value_2" |
| 3 | 13.0 | "value_3" |
| 4 | 14.0 | "value_4" |
| 5 | 15.0 | "value_5" |
| 6 | 16.0 | "value_6" |
| 7 | 17.0 | "value_7" |
| 8 | 18.0 | "value_8" |
| 9 | 19.0 | "value_9" |
"
+ ],
+ "text/plain": [
+ "shape: (10, 3)\n",
+ "┌─────────┬───────────┬─────────┐\n",
+ "│ col_int ┆ col_float ┆ col_str │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ f64 ┆ str │\n",
+ "╞═════════╪═══════════╪═════════╡\n",
+ "│ 0 ┆ 10.0 ┆ value_0 │\n",
+ "│ 1 ┆ 11.0 ┆ value_1 │\n",
+ "│ 2 ┆ 12.0 ┆ value_2 │\n",
+ "│ 3 ┆ 13.0 ┆ value_3 │\n",
+ "│ 4 ┆ 14.0 ┆ value_4 │\n",
+ "│ 5 ┆ 15.0 ┆ value_5 │\n",
+ "│ 6 ┆ 16.0 ┆ value_6 │\n",
+ "│ 7 ┆ 17.0 ┆ value_7 │\n",
+ "│ 8 ┆ 18.0 ┆ value_8 │\n",
+ "│ 9 ┆ 19.0 ┆ value_9 │\n",
+ "└─────────┴───────────┴─────────┘"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "polars_df = lib.read(\"demo\", output_format=OutputFormat.POLARS).data\n",
+ "print(f\"Type: {type(polars_df)}\")\n",
+ "print(f\"Dtypes: {polars_df.dtypes}\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e8de754e-90c7-47bb-a12b-33495fad0d83",
+ "metadata": {},
+ "source": [
+ "### Index Handling\n",
+ "\n",
+ "Note that default `RangeIndex` is dropped (Polars has no concept of row indexes):"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8f7dce55-bb59-4830-a8b6-ade87d5ef755",
+ "metadata": {},
+ "source": [
+ "## Configuration Levels\n",
+ "\n",
+ "Output format can be set at three levels (Arctic instance, Library, or per-read):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "960cc5b2-9f5c-49ba-9a40-be55c6ae0ca3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| col_int | col_float | col_str |
|---|
| i64 | f64 | str |
| 0 | 10.0 | "value_0" |
| 1 | 11.0 | "value_1" |
| 2 | 12.0 | "value_2" |
| 3 | 13.0 | "value_3" |
| 4 | 14.0 | "value_4" |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌─────────┬───────────┬─────────┐\n",
+ "│ col_int ┆ col_float ┆ col_str │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ f64 ┆ str │\n",
+ "╞═════════╪═══════════╪═════════╡\n",
+ "│ 0 ┆ 10.0 ┆ value_0 │\n",
+ "│ 1 ┆ 11.0 ┆ value_1 │\n",
+ "│ 2 ┆ 12.0 ┆ value_2 │\n",
+ "│ 3 ┆ 13.0 ┆ value_3 │\n",
+ "│ 4 ┆ 14.0 ┆ value_4 │\n",
+ "└─────────┴───────────┴─────────┘"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Per-read (most granular)\n",
+ "polars_df = lib.read(\"demo\", output_format=OutputFormat.POLARS).data\n",
+ "\n",
+ "# Case-insensitive strings also work\n",
+ "polars_df = lib.head(\"demo\", output_format=\"polars\").data\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9faf94c3-bde3-440b-9bfb-799e9f0602b8",
+ "metadata": {},
+ "source": [
+ "Set at library level:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "e4c1a7cf-f223-4273-84f2-9d3c8dcce21a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| col_int | col_float | col_str |
|---|
| i64 | f64 | str |
| 0 | 10.0 | "value_0" |
| 1 | 11.0 | "value_1" |
| 2 | 12.0 | "value_2" |
| 3 | 13.0 | "value_3" |
| 4 | 14.0 | "value_4" |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌─────────┬───────────┬─────────┐\n",
+ "│ col_int ┆ col_float ┆ col_str │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ f64 ┆ str │\n",
+ "╞═════════╪═══════════╪═════════╡\n",
+ "│ 0 ┆ 10.0 ┆ value_0 │\n",
+ "│ 1 ┆ 11.0 ┆ value_1 │\n",
+ "│ 2 ┆ 12.0 ┆ value_2 │\n",
+ "│ 3 ┆ 13.0 ┆ value_3 │\n",
+ "│ 4 ┆ 14.0 ┆ value_4 │\n",
+ "└─────────┴───────────┴─────────┘"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "lib_polars = ac.get_library(\"arrow\", output_format=OutputFormat.POLARS)\n",
+ "# Now all reads default to Polars\n",
+ "lib_polars.head(\"demo\").data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bdcc95b-5dee-4bc6-9ba3-026ef02a852a",
+ "metadata": {},
+ "source": [
+ "### Or on the entire `Arctic` instance, so all libraries fetched from this instance use Arrow as the default return type"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "bcb30551-b293-4d76-be14-24c9147f35f9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| col_int | col_float | col_str |
|---|
| i64 | f64 | str |
| 0 | 10.0 | "value_0" |
| 1 | 11.0 | "value_1" |
| 2 | 12.0 | "value_2" |
| 3 | 13.0 | "value_3" |
| 4 | 14.0 | "value_4" |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌─────────┬───────────┬─────────┐\n",
+ "│ col_int ┆ col_float ┆ col_str │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ f64 ┆ str │\n",
+ "╞═════════╪═══════════╪═════════╡\n",
+ "│ 0 ┆ 10.0 ┆ value_0 │\n",
+ "│ 1 ┆ 11.0 ┆ value_1 │\n",
+ "│ 2 ┆ 12.0 ┆ value_2 │\n",
+ "│ 3 ┆ 13.0 ┆ value_3 │\n",
+ "│ 4 ┆ 14.0 ┆ value_4 │\n",
+ "└─────────┴───────────┴─────────┘"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "polars_ac = Arctic(\"lmdb://tmp/arrow_reads_demo_2\", output_format=OutputFormat.POLARS)\n",
+ "\n",
+ "# Now all libraries will have a default polars output format\n",
+ "polars_ac.delete_library(\"arrow\")\n",
+ "lib = polars_ac.create_library(\"arrow\")\n",
+ "lib.write(\"demo\", df)\n",
+ "lib.head(\"demo\").data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5e5d9e23-1827-4010-94fa-b5464447eefb",
+ "metadata": {},
+ "source": [
+ "You can always override back to Pandas for individual reads:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "e99eebb4-d9ae-4872-a161-abff22be81f6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "pandas.core.frame.DataFrame"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pandas_df = lib.read(\"demo\", output_format=OutputFormat.PANDAS).data\n",
+ "type(pandas_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8b7b21b0-17ac-41ab-b22e-dcc7bec13eda",
+ "metadata": {},
+ "source": [
+ "## Working with timeseries indices\n",
+ "\n",
+ "Timeseries indices appear as the first column:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "4f714e04-f40e-4035-b8f8-4ef1602aae0b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " col_int | \n",
+ " col_float | \n",
+ "
\n",
+ " \n",
+ " | timestamp | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2025-01-01 | \n",
+ " 0 | \n",
+ " 10.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-02 | \n",
+ " 1 | \n",
+ " 11.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-03 | \n",
+ " 2 | \n",
+ " 12.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-04 | \n",
+ " 3 | \n",
+ " 13.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-05 | \n",
+ " 4 | \n",
+ " 14.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-06 | \n",
+ " 5 | \n",
+ " 15.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-07 | \n",
+ " 6 | \n",
+ " 16.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-08 | \n",
+ " 7 | \n",
+ " 17.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-09 | \n",
+ " 8 | \n",
+ " 18.0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-10 | \n",
+ " 9 | \n",
+ " 19.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " col_int col_float\n",
+ "timestamp \n",
+ "2025-01-01 0 10.0\n",
+ "2025-01-02 1 11.0\n",
+ "2025-01-03 2 12.0\n",
+ "2025-01-04 3 13.0\n",
+ "2025-01-05 4 14.0\n",
+ "2025-01-06 5 15.0\n",
+ "2025-01-07 6 16.0\n",
+ "2025-01-08 7 17.0\n",
+ "2025-01-09 8 18.0\n",
+ "2025-01-10 9 19.0"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.DataFrame({\n",
+ " \"col_int\": np.arange(10, dtype=np.int64),\n",
+ " \"col_float\": np.arange(10, 20, dtype=np.float64)\n",
+ "}, index=pd.date_range(\"2025-01-01\", periods=10))\n",
+ "df.index.name = \"timestamp\"\n",
+ "lib.write(\"timeseries\", df)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "a04c8ea1-0a44-44c6-bc27-32ae8b3d5872",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Columns: ['timestamp', 'col_int', 'col_float']\n",
+ "Dtypes: [Datetime(time_unit='ns', time_zone=None), Int64, Float64]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (10, 3)| timestamp | col_int | col_float |
|---|
| datetime[ns] | i64 | f64 |
| 2025-01-01 00:00:00 | 0 | 10.0 |
| 2025-01-02 00:00:00 | 1 | 11.0 |
| 2025-01-03 00:00:00 | 2 | 12.0 |
| 2025-01-04 00:00:00 | 3 | 13.0 |
| 2025-01-05 00:00:00 | 4 | 14.0 |
| 2025-01-06 00:00:00 | 5 | 15.0 |
| 2025-01-07 00:00:00 | 6 | 16.0 |
| 2025-01-08 00:00:00 | 7 | 17.0 |
| 2025-01-09 00:00:00 | 8 | 18.0 |
| 2025-01-10 00:00:00 | 9 | 19.0 |
"
+ ],
+ "text/plain": [
+ "shape: (10, 3)\n",
+ "┌─────────────────────┬─────────┬───────────┐\n",
+ "│ timestamp ┆ col_int ┆ col_float │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ datetime[ns] ┆ i64 ┆ f64 │\n",
+ "╞═════════════════════╪═════════╪═══════════╡\n",
+ "│ 2025-01-01 00:00:00 ┆ 0 ┆ 10.0 │\n",
+ "│ 2025-01-02 00:00:00 ┆ 1 ┆ 11.0 │\n",
+ "│ 2025-01-03 00:00:00 ┆ 2 ┆ 12.0 │\n",
+ "│ 2025-01-04 00:00:00 ┆ 3 ┆ 13.0 │\n",
+ "│ 2025-01-05 00:00:00 ┆ 4 ┆ 14.0 │\n",
+ "│ 2025-01-06 00:00:00 ┆ 5 ┆ 15.0 │\n",
+ "│ 2025-01-07 00:00:00 ┆ 6 ┆ 16.0 │\n",
+ "│ 2025-01-08 00:00:00 ┆ 7 ┆ 17.0 │\n",
+ "│ 2025-01-09 00:00:00 ┆ 8 ┆ 18.0 │\n",
+ "│ 2025-01-10 00:00:00 ┆ 9 ┆ 19.0 │\n",
+ "└─────────────────────┴─────────┴───────────┘"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "polars_df = lib.read(\"timeseries\").data\n",
+ "print(f\"Columns: {polars_df.columns}\")\n",
+ "print(f\"Dtypes: {polars_df.dtypes}\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5ede9724-bcd1-4502-8880-f2a7c9f1159f",
+ "metadata": {},
+ "source": [
+ "Timezones are preserved:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "cfc90885-1ae3-4df6-8c0c-d8a90e1ada0f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " value | \n",
+ "
\n",
+ " \n",
+ " | timestamp | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2025-01-01 00:00:00-05:00 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-02 00:00:00-05:00 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-03 00:00:00-05:00 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-04 00:00:00-05:00 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-05 00:00:00-05:00 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " value\n",
+ "timestamp \n",
+ "2025-01-01 00:00:00-05:00 0\n",
+ "2025-01-02 00:00:00-05:00 1\n",
+ "2025-01-03 00:00:00-05:00 2\n",
+ "2025-01-04 00:00:00-05:00 3\n",
+ "2025-01-05 00:00:00-05:00 4"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_tz = pd.DataFrame({\n",
+ " \"value\": np.arange(5)\n",
+ "}, index=pd.date_range(\"2025-01-01\", periods=5, tz=\"America/New_York\"))\n",
+ "df_tz.index.name = \"timestamp\"\n",
+ "lib.write(\"tz_data\", df_tz)\n",
+ "df_tz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "bfc4d467-8c42-4a60-b84b-b57a8a4ace1b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Timezone preserved: Datetime(time_unit='ns', time_zone='America/New_York')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 2)| timestamp | value |
|---|
| datetime[ns, America/New_York] | i64 |
| 2025-01-01 00:00:00 EST | 0 |
| 2025-01-02 00:00:00 EST | 1 |
| 2025-01-03 00:00:00 EST | 2 |
| 2025-01-04 00:00:00 EST | 3 |
| 2025-01-05 00:00:00 EST | 4 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 2)\n",
+ "┌────────────────────────────────┬───────┐\n",
+ "│ timestamp ┆ value │\n",
+ "│ --- ┆ --- │\n",
+ "│ datetime[ns, America/New_York] ┆ i64 │\n",
+ "╞════════════════════════════════╪═══════╡\n",
+ "│ 2025-01-01 00:00:00 EST ┆ 0 │\n",
+ "│ 2025-01-02 00:00:00 EST ┆ 1 │\n",
+ "│ 2025-01-03 00:00:00 EST ┆ 2 │\n",
+ "│ 2025-01-04 00:00:00 EST ┆ 3 │\n",
+ "│ 2025-01-05 00:00:00 EST ┆ 4 │\n",
+ "└────────────────────────────────┴───────┘"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "polars_df = lib.read(\"tz_data\").data\n",
+ "print(f\"Timezone preserved: {polars_df['timestamp'].dtype}\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9c21d95e-d57e-4e1c-8b67-fb7060a5f39b",
+ "metadata": {},
+ "source": [
+ "## Filtering and Column Selection\n",
+ "\n",
+ "Standard ArcticDB operations work with Arrow output formats:\n",
+ "\n",
+ "*Note that index column is fetched even if not specified in the `columns` list.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "e978dfd1-9164-4875-9c11-6b5d258fa81c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (4, 2)| timestamp | col_int |
|---|
| datetime[ns] | i64 |
| 2025-01-03 00:00:00 | 2 |
| 2025-01-04 00:00:00 | 3 |
| 2025-01-05 00:00:00 | 4 |
| 2025-01-06 00:00:00 | 5 |
"
+ ],
+ "text/plain": [
+ "shape: (4, 2)\n",
+ "┌─────────────────────┬─────────┐\n",
+ "│ timestamp ┆ col_int │\n",
+ "│ --- ┆ --- │\n",
+ "│ datetime[ns] ┆ i64 │\n",
+ "╞═════════════════════╪═════════╡\n",
+ "│ 2025-01-03 00:00:00 ┆ 2 │\n",
+ "│ 2025-01-04 00:00:00 ┆ 3 │\n",
+ "│ 2025-01-05 00:00:00 ┆ 4 │\n",
+ "│ 2025-01-06 00:00:00 ┆ 5 │\n",
+ "└─────────────────────┴─────────┘"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "polars_df = lib.read(\n",
+ " \"timeseries\",\n",
+ " date_range=(pd.Timestamp(\"2025-01-03\"), pd.Timestamp(\"2025-01-06\")),\n",
+ " columns=[\"col_int\"]\n",
+ ").data\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c15e500",
+ "metadata": {},
+ "source": [
+ "# PyArrow Output Format\n",
+ "\n",
+ "The PyArrow output format returns `pyarrow.Table` objects. While we recommend Polars for most arrow use cases, PyArrow is useful when you want to integrate with other Arrow-based tools which expect a `pyarrow.Table`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "894b94d0",
+ "metadata": {},
+ "source": [
+ "## Basic PyArrow Usage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "19f5934d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Type: \n",
+ "Schema: col_int: int64\n",
+ "col_float: double\n",
+ "col_str: large_string\n",
+ "-- schema metadata --\n",
+ "pandas: '{\"index_columns\": [{\"name\": null, \"start\": 0, \"step\": 1, \"stop\":' + 500\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "pyarrow.Table\n",
+ "col_int: int64\n",
+ "col_float: double\n",
+ "col_str: large_string\n",
+ "----\n",
+ "col_int: [[0,1,2,3,4,5,6,7,8,9]]\n",
+ "col_float: [[10,11,12,13,14,15,16,17,18,19]]\n",
+ "col_str: [[\"value_0\",\"value_1\",\"value_2\",\"value_3\",\"value_4\",\"value_5\",\"value_6\",\"value_7\",\"value_8\",\"value_9\"]]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "arrow_table = lib.read(\"demo\", output_format=OutputFormat.PYARROW).data\n",
+ "print(f\"Type: {type(arrow_table)}\")\n",
+ "print(f\"Schema: {arrow_table.schema}\")\n",
+ "arrow_table"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c08b82c0",
+ "metadata": {},
+ "source": [
+ "## Zero-Copy Conversion Between Arrow based libraries\n",
+ "\n",
+ "Converting between PyArrow and Polars is zero-copy since they share the same memory layout:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "64d70f8f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Converted to Polars: \n",
+ "Converted back to Arrow: \n",
+ "Tables equal: True\n"
+ ]
+ }
+ ],
+ "source": [
+ "# PyArrow to Polars (zero-copy)\n",
+ "polars_from_arrow = pl.from_arrow(arrow_table)\n",
+ "print(f\"Converted to Polars: {type(polars_from_arrow)}\")\n",
+ "\n",
+ "# Polars to PyArrow (zero-copy)\n",
+ "arrow_from_polars = polars_from_arrow.to_arrow()\n",
+ "print(f\"Converted back to Arrow: {type(arrow_from_polars)}\")\n",
+ "print(f\"Tables equal: {arrow_table.equals(arrow_from_polars)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "7a9e1551",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Result type: \n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (4, 4)| timestamp | col_int | col_float | product |
|---|
| datetime[ns] | i64 | f64 | f64 |
| 2025-01-07 00:00:00 | 6 | 16.0 | 96.0 |
| 2025-01-08 00:00:00 | 7 | 17.0 | 119.0 |
| 2025-01-09 00:00:00 | 8 | 18.0 | 144.0 |
| 2025-01-10 00:00:00 | 9 | 19.0 | 171.0 |
"
+ ],
+ "text/plain": [
+ "shape: (4, 4)\n",
+ "┌─────────────────────┬─────────┬───────────┬─────────┐\n",
+ "│ timestamp ┆ col_int ┆ col_float ┆ product │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ datetime[ns] ┆ i64 ┆ f64 ┆ f64 │\n",
+ "╞═════════════════════╪═════════╪═══════════╪═════════╡\n",
+ "│ 2025-01-07 00:00:00 ┆ 6 ┆ 16.0 ┆ 96.0 │\n",
+ "│ 2025-01-08 00:00:00 ┆ 7 ┆ 17.0 ┆ 119.0 │\n",
+ "│ 2025-01-09 00:00:00 ┆ 8 ┆ 18.0 ┆ 144.0 │\n",
+ "│ 2025-01-10 00:00:00 ┆ 9 ┆ 19.0 ┆ 171.0 │\n",
+ "└─────────────────────┴─────────┴───────────┴─────────┘"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import duckdb\n",
+ "\n",
+ "# Load data from ArcticDB as PyArrow table\n",
+ "arrow_table = lib.read(\"timeseries\", output_format=OutputFormat.PYARROW).data\n",
+ "\n",
+ "# Query directly with DuckDB (zero-copy)\n",
+ "result = duckdb.query(\"\"\"\n",
+ " SELECT \n",
+ " timestamp,\n",
+ " col_int,\n",
+ " col_float,\n",
+ " col_int * col_float AS product\n",
+ " FROM arrow_table\n",
+ " WHERE col_int > 5\n",
+ " ORDER BY timestamp\n",
+ "\"\"\").to_arrow_table()\n",
+ "\n",
+ "print(f\"Result type: {type(result)}\")\n",
+ "# Converting to polars for easier viewing\n",
+ "pl.from_arrow(result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab9bc734-89ed-4683-b9e5-6ce0d5583dfd",
+ "metadata": {},
+ "source": [
+ "## Record Batch Structure\n",
+ "\n",
+ "Large writes, appends, and updates result in Arrow based output formats composed of multiple record batches. Each record batch corresponds to a row-slice in ArcticDB's storage.\n",
+ "This applies to both PyArrow and Polars. With PyArrow, we can see the record batch separation directly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "7e57da3b-9156-4433-9188-80a85ae07955",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of record batches: 2\n",
+ "Table structure: pyarrow.Table\n",
+ "col1: int64\n",
+ "----\n",
+ "col1: [[0,1],[2,3]]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "pyarrow.Table\n",
+ "col1: int64\n",
+ "----\n",
+ "col1: [[0,1],[2,3]]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_0 = pd.DataFrame({\"col1\": np.arange(2, dtype=np.int64)})\n",
+ "df_1 = pd.DataFrame({\"col1\": np.arange(2, 4, dtype=np.int64)})\n",
+ "lib.write(\"multi_batch\", df_0)\n",
+ "lib.append(\"multi_batch\", df_1)\n",
+ "\n",
+ "arrow_table = lib.read(\"multi_batch\", output_format=OutputFormat.PYARROW).data\n",
+ "print(f\"Number of record batches: {len(arrow_table.to_batches())}\")\n",
+ "print(f\"Table structure: {arrow_table}\")\n",
+ "arrow_table"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5c2e0cce-9c0a-4bc9-b9f8-0f2e56c2ab56",
+ "metadata": {},
+ "source": [
+ "You can combine chunks into a single contiguous table (involves memory allocation and copying):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "5ca9a985-e9fd-4b23-a330-4ec2d41e7c22",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Combined equals original: True\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "pyarrow.Table\n",
+ "col1: int64\n",
+ "----\n",
+ "col1: [[0,1,2,3]]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contiguous_table = arrow_table.combine_chunks()\n",
+ "print(f\"Combined equals original: {contiguous_table.equals(arrow_table)}\")\n",
+ "contiguous_table"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6886794a",
+ "metadata": {},
+ "source": [
+ "Chunks can also be combined with polars output format with `rechunk`.\n",
+ "\n",
+ "After rechunking, some computations on the Polars frame will be faster."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "bcaf366c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of chunks in Polars DataFrame: 2\n",
+ "Number of chunks after rechunking: 1\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ "shape: (4, 1)\n",
+ "┌──────┐\n",
+ "│ col1 │\n",
+ "│ --- │\n",
+ "│ i64 │\n",
+ "╞══════╡\n",
+ "│ 0 │\n",
+ "│ 1 │\n",
+ "│ 2 │\n",
+ "│ 3 │\n",
+ "└──────┘"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "polars_df = lib.read(\"multi_batch\", output_format=OutputFormat.POLARS).data\n",
+ "print(\"Number of chunks in Polars DataFrame:\", polars_df.n_chunks())\n",
+ "polars_df = polars_df.rechunk()\n",
+ "print(\"Number of chunks after rechunking:\", polars_df.n_chunks())\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "36424fda",
+ "metadata": {},
+ "source": [
+ "# Configurable String Formats\n",
+ "\n",
+ "Arrow-based output formats support configurable string encoding to optimize for your data characteristics. Three formats are available: `LARGE_STRING` (default), `SMALL_STRING` (only for PyArrow), and `CATEGORICAL`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f886c2d3",
+ "metadata": {},
+ "source": [
+ "## String Format Options\n",
+ "\n",
+ "- **`LARGE_STRING`** (default): 64-bit variable-size encoding, best for general use\n",
+ " - PyArrow: `pa.large_string()`, Polars: `pl.String`\n",
+ " \n",
+ "- **`SMALL_STRING`**: 32-bit variable-size encoding, slightly more memory efficient for smaller data\n",
+ " - PyArrow: `pa.string()`, Polars: Not supported\n",
+ " - Polars can only use large_strings or categoricals\n",
+ " \n",
+ "- **`CATEGORICAL`**: Dictionary-encoded, best for low cardinality (few unique values)\n",
+ " - PyArrow: `pa.dictionary(pa.int32(), pa.large_string())`, Polars: `pl.Categorical`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5695dd5c",
+ "metadata": {},
+ "source": [
+ "## Example: Default LARGE_STRING Format"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "dbf7608d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dtypes: [String, String]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (10, 2)| category | description |
|---|
| str | str |
| "A" | "Long description text 0" |
| "B" | "Long description text 1" |
| "A" | "Long description text 2" |
| "B" | "Long description text 3" |
| "A" | "Long description text 4" |
| "A" | "Long description text 5" |
| "B" | "Long description text 6" |
| "A" | "Long description text 7" |
| "B" | "Long description text 8" |
| "A" | "Long description text 9" |
"
+ ],
+ "text/plain": [
+ "shape: (10, 2)\n",
+ "┌──────────┬─────────────────────────┐\n",
+ "│ category ┆ description │\n",
+ "│ --- ┆ --- │\n",
+ "│ str ┆ str │\n",
+ "╞══════════╪═════════════════════════╡\n",
+ "│ A ┆ Long description text 0 │\n",
+ "│ B ┆ Long description text 1 │\n",
+ "│ A ┆ Long description text 2 │\n",
+ "│ B ┆ Long description text 3 │\n",
+ "│ A ┆ Long description text 4 │\n",
+ "│ A ┆ Long description text 5 │\n",
+ "│ B ┆ Long description text 6 │\n",
+ "│ A ┆ Long description text 7 │\n",
+ "│ B ┆ Long description text 8 │\n",
+ "│ A ┆ Long description text 9 │\n",
+ "└──────────┴─────────────────────────┘"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_strings = pd.DataFrame({\n",
+ " \"category\": [\"A\", \"B\", \"A\", \"B\", \"A\"] * 2, # Low cardinality\n",
+ " \"description\": [f\"Long description text {i}\" for i in range(10)] # High cardinality\n",
+ "})\n",
+ "lib.write(\"strings\", df_strings)\n",
+ "\n",
+ "# Default behavior (LARGE_STRING)\n",
+ "polars_df = lib.read(\"strings\", output_format=OutputFormat.POLARS).data\n",
+ "print(f\"Dtypes: {polars_df.dtypes}\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "44fae83a",
+ "metadata": {},
+ "source": [
+ "## Example: CATEGORICAL Format for Low Cardinality\n",
+ "\n",
+ "Use `CATEGORICAL` when you have few unique values repeated many times:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "c8015891",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dtypes: [Categorical, Categorical]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (10, 2)| category | description |
|---|
| cat | cat |
| "A" | "Long description text 0" |
| "B" | "Long description text 1" |
| "A" | "Long description text 2" |
| "B" | "Long description text 3" |
| "A" | "Long description text 4" |
| "A" | "Long description text 5" |
| "B" | "Long description text 6" |
| "A" | "Long description text 7" |
| "B" | "Long description text 8" |
| "A" | "Long description text 9" |
"
+ ],
+ "text/plain": [
+ "shape: (10, 2)\n",
+ "┌──────────┬─────────────────────────┐\n",
+ "│ category ┆ description │\n",
+ "│ --- ┆ --- │\n",
+ "│ cat ┆ cat │\n",
+ "╞══════════╪═════════════════════════╡\n",
+ "│ A ┆ Long description text 0 │\n",
+ "│ B ┆ Long description text 1 │\n",
+ "│ A ┆ Long description text 2 │\n",
+ "│ B ┆ Long description text 3 │\n",
+ "│ A ┆ Long description text 4 │\n",
+ "│ A ┆ Long description text 5 │\n",
+ "│ B ┆ Long description text 6 │\n",
+ "│ A ┆ Long description text 7 │\n",
+ "│ B ┆ Long description text 8 │\n",
+ "│ A ┆ Long description text 9 │\n",
+ "└──────────┴─────────────────────────┘"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Set categorical for all string columns\n",
+ "polars_df_cat = lib.read(\n",
+ " \"strings\",\n",
+ " output_format=OutputFormat.POLARS,\n",
+ " arrow_string_format_default=ArrowOutputStringFormat.CATEGORICAL\n",
+ ").data\n",
+ "print(f\"Dtypes: {polars_df_cat.dtypes}\")\n",
+ "polars_df_cat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4663a9e3",
+ "metadata": {},
+ "source": [
+ "## Per-Column String Format Configuration\n",
+ "\n",
+ "Optimize each column individually:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "ab0633b9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dtypes: [Categorical, String]\n",
+ "Category is Categorical: True\n",
+ "Description is String: True\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Categorical for low cardinality, LARGE_STRING for high cardinality\n",
+ "polars_df_mixed = lib.read(\n",
+ " \"strings\",\n",
+ " output_format=OutputFormat.POLARS,\n",
+ " arrow_string_format_per_column={\n",
+ " \"category\": ArrowOutputStringFormat.CATEGORICAL,\n",
+ " \"description\": ArrowOutputStringFormat.LARGE_STRING\n",
+ " }\n",
+ ").data\n",
+ "print(f\"Dtypes: {polars_df_mixed.dtypes}\")\n",
+ "print(f\"Category is Categorical: {polars_df_mixed['category'].dtype == pl.Categorical}\")\n",
+ "print(f\"Description is String: {polars_df_mixed['description'].dtype == pl.String}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eef6090c",
+ "metadata": {},
+ "source": [
+ "# Pandas Interoperability and Column Naming\n",
+ "\n",
+ "Since ArcticDB currently only supports writing Pandas DataFrames, understanding how Pandas metadata translates to Arrow/Polars is important. ArcticDB attaches normalization metadata to enable seamless round-trip conversion."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e1178eae",
+ "metadata": {},
+ "source": [
+ "## Index Column Naming: `__index__`\n",
+ "\n",
+ "When Pandas DataFrames have unnamed indexes, PyArrow/Polars output formats use the special column name `__index__`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "fde6f9fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Columns: ['__index__', 'value']\n",
+ "First column name: '__index__'\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (3, 2)| __index__ | value |
|---|
| datetime[ns] | i64 |
| 2025-01-01 00:00:00 | 1 |
| 2025-01-02 00:00:00 | 2 |
| 2025-01-03 00:00:00 | 3 |
"
+ ],
+ "text/plain": [
+ "shape: (3, 2)\n",
+ "┌─────────────────────┬───────┐\n",
+ "│ __index__ ┆ value │\n",
+ "│ --- ┆ --- │\n",
+ "│ datetime[ns] ┆ i64 │\n",
+ "╞═════════════════════╪═══════╡\n",
+ "│ 2025-01-01 00:00:00 ┆ 1 │\n",
+ "│ 2025-01-02 00:00:00 ┆ 2 │\n",
+ "│ 2025-01-03 00:00:00 ┆ 3 │\n",
+ "└─────────────────────┴───────┘"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# DataFrame with unnamed DatetimeIndex\n",
+ "df_unnamed_idx = pd.DataFrame(\n",
+ " {\"value\": [1, 2, 3]},\n",
+ " index=pd.date_range(\"2025-01-01\", periods=3)\n",
+ ")\n",
+ "lib.write(\"unnamed_idx\", df_unnamed_idx)\n",
+ "\n",
+ "polars_df = lib.read(\"unnamed_idx\", output_format=OutputFormat.POLARS).data\n",
+ "print(f\"Columns: {polars_df.columns}\")\n",
+ "print(f\"First column name: '{polars_df.columns[0]}'\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "201076be",
+ "metadata": {},
+ "source": [
+ "## Named Indexes Preserved\n",
+ "\n",
+ "Named indexes retain their names as column names:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "d01b5e8f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "First column: 'timestamp' (the named index)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (3, 3)| timestamp | col_int | col_float |
|---|
| datetime[ns] | i64 | f64 |
| 2025-01-01 00:00:00 | 0 | 10.0 |
| 2025-01-02 00:00:00 | 1 | 11.0 |
| 2025-01-03 00:00:00 | 2 | 12.0 |
"
+ ],
+ "text/plain": [
+ "shape: (3, 3)\n",
+ "┌─────────────────────┬─────────┬───────────┐\n",
+ "│ timestamp ┆ col_int ┆ col_float │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ datetime[ns] ┆ i64 ┆ f64 │\n",
+ "╞═════════════════════╪═════════╪═══════════╡\n",
+ "│ 2025-01-01 00:00:00 ┆ 0 ┆ 10.0 │\n",
+ "│ 2025-01-02 00:00:00 ┆ 1 ┆ 11.0 │\n",
+ "│ 2025-01-03 00:00:00 ┆ 2 ┆ 12.0 │\n",
+ "└─────────────────────┴─────────┴───────────┘"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# We already wrote \"timeseries\" with a named index \"timestamp\"\n",
+ "polars_df = lib.read(\"timeseries\", output_format=OutputFormat.POLARS).data\n",
+ "print(f\"First column: '{polars_df.columns[0]}' (the named index)\")\n",
+ "polars_df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a6aa61d",
+ "metadata": {},
+ "source": [
+ "## MultiIndex Handling\n",
+ "\n",
+ "MultiIndex levels become separate columns:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "b4d9c5db",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Columns: ['date', 'ticker', 'price']\n",
+ "MultiIndex levels became regular columns\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (4, 3)| date | ticker | price |
|---|
| str | str | i64 |
| "2025-01-01" | "AAPL" | 100 |
| "2025-01-01" | "GOOGL" | 101 |
| "2025-01-02" | "AAPL" | 102 |
| "2025-01-02" | "GOOGL" | 103 |
"
+ ],
+ "text/plain": [
+ "shape: (4, 3)\n",
+ "┌────────────┬────────┬───────┐\n",
+ "│ date ┆ ticker ┆ price │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ i64 │\n",
+ "╞════════════╪════════╪═══════╡\n",
+ "│ 2025-01-01 ┆ AAPL ┆ 100 │\n",
+ "│ 2025-01-01 ┆ GOOGL ┆ 101 │\n",
+ "│ 2025-01-02 ┆ AAPL ┆ 102 │\n",
+ "│ 2025-01-02 ┆ GOOGL ┆ 103 │\n",
+ "└────────────┴────────┴───────┘"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_multi = pd.DataFrame({\n",
+ " \"price\": [100, 101, 102, 103]\n",
+ "}, index=pd.MultiIndex.from_product([[\"2025-01-01\", \"2025-01-02\"], [\"AAPL\", \"GOOGL\"]], names=[\"date\", \"ticker\"]))\n",
+ "lib.write(\"multiindex\", df_multi)\n",
+ "\n",
+ "polars_df = lib.read(\"multiindex\", output_format=OutputFormat.POLARS).data\n",
+ "print(f\"Columns: {polars_df.columns}\")\n",
+ "print(\"MultiIndex levels became regular columns\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53a8dfa3",
+ "metadata": {},
+ "source": [
+ "Unnamed MultiIndex columns get displayed as `\"__index_level_0__\"`, `\"__index_level_1__\"`, etc."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "d48ff8df",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Columns: ['__index_level_0__', '__index_level_1__', 'price']\n",
+ "MultiIndex levels became regular columns and use special names when unnamed\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (4, 3)| __index_level_0__ | __index_level_1__ | price |
|---|
| str | str | i64 |
| "2025-01-01" | "AAPL" | 100 |
| "2025-01-01" | "GOOGL" | 101 |
| "2025-01-02" | "AAPL" | 102 |
| "2025-01-02" | "GOOGL" | 103 |
"
+ ],
+ "text/plain": [
+ "shape: (4, 3)\n",
+ "┌───────────────────┬───────────────────┬───────┐\n",
+ "│ __index_level_0__ ┆ __index_level_1__ ┆ price │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ i64 │\n",
+ "╞═══════════════════╪═══════════════════╪═══════╡\n",
+ "│ 2025-01-01 ┆ AAPL ┆ 100 │\n",
+ "│ 2025-01-01 ┆ GOOGL ┆ 101 │\n",
+ "│ 2025-01-02 ┆ AAPL ┆ 102 │\n",
+ "│ 2025-01-02 ┆ GOOGL ┆ 103 │\n",
+ "└───────────────────┴───────────────────┴───────┘"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_multi = pd.DataFrame({\n",
+ " \"price\": [100, 101, 102, 103]\n",
+ "}, index=pd.MultiIndex.from_product([[\"2025-01-01\", \"2025-01-02\"], [\"AAPL\", \"GOOGL\"]]))\n",
+ "lib.write(\"multiindex\", df_multi)\n",
+ "\n",
+ "polars_df = lib.read(\"multiindex\", output_format=OutputFormat.POLARS).data\n",
+ "print(f\"Columns: {polars_df.columns}\")\n",
+ "print(\"MultiIndex levels became regular columns and use special names when unnamed\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9157ac73",
+ "metadata": {},
+ "source": [
+ "## Round-Trip Conversion\n",
+ "\n",
+ "Converting to PyArrow and back preserves Pandas metadata:\n",
+ "\n",
+    "*Note that Polars does not have a concept of Pandas metadata and can't be round-tripped to Pandas without losing index metadata.*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "6552652d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index restored: \n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " col1 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " col1\n",
+ "0 0\n",
+ "1 1\n",
+ "2 2\n",
+ "3 3\n",
+ "4 4"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Default RangeIndex is restored\n",
+ "df_simple = pd.DataFrame({\"col1\": np.arange(5)})\n",
+ "lib.write(\"simple_pandas\", df_simple)\n",
+ "\n",
+ "arrow_table = lib.read(\"simple_pandas\", output_format=OutputFormat.PYARROW).data\n",
+ "pandas_restored = arrow_table.to_pandas()\n",
+ "print(f\"Index restored: {type(pandas_restored.index)}\")\n",
+ "pandas_restored"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "af571178",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MultiIndex restored: \n",
+ "Index names: [None, None]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 2025-01-01 | \n",
+ " AAPL | \n",
+ " 100 | \n",
+ "
\n",
+ " \n",
+ " | GOOGL | \n",
+ " 101 | \n",
+ "
\n",
+ " \n",
+ " | 2025-01-02 | \n",
+ " AAPL | \n",
+ " 102 | \n",
+ "
\n",
+ " \n",
+ " | GOOGL | \n",
+ " 103 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " price\n",
+ "2025-01-01 AAPL 100\n",
+ " GOOGL 101\n",
+ "2025-01-02 AAPL 102\n",
+ " GOOGL 103"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# MultiIndex is also restored correctly\n",
+ "arrow_table = lib.read(\"multiindex\", output_format=OutputFormat.PYARROW).data\n",
+ "pandas_restored = arrow_table.to_pandas()\n",
+ "print(f\"MultiIndex restored: {type(pandas_restored.index)}\")\n",
+ "print(f\"Index names: {pandas_restored.index.names}\")\n",
+ "pandas_restored"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "1b08909f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Type: \n",
+ "Note: Index information is lost when converting from Polars to Pandas\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " timestamp | \n",
+ " col_int | \n",
+ " col_float | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2025-01-01 | \n",
+ " 0 | \n",
+ " 10.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2025-01-02 | \n",
+ " 1 | \n",
+ " 11.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2025-01-03 | \n",
+ " 2 | \n",
+ " 12.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2025-01-04 | \n",
+ " 3 | \n",
+ " 13.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2025-01-05 | \n",
+ " 4 | \n",
+ " 14.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " timestamp col_int col_float\n",
+ "0 2025-01-01 0 10.0\n",
+ "1 2025-01-02 1 11.0\n",
+ "2 2025-01-03 2 12.0\n",
+ "3 2025-01-04 3 13.0\n",
+ "4 2025-01-05 4 14.0"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "polars_df = lib.read(\"timeseries\", output_format=OutputFormat.POLARS).data\n",
+ "pandas_from_polars = polars_df.to_pandas()\n",
+ "print(f\"Type: {type(pandas_from_polars)}\")\n",
+ "print(\"Note: Index information is lost when converting from Polars to Pandas\")\n",
+ "pandas_from_polars.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "26e50f31-7545-4a85-a15f-1b81548ebb7f",
+ "metadata": {},
+ "source": [
+ "# Working with Dynamic Schema\n",
+ "\n",
+ "Dynamic schema libraries work seamlessly with Arrow output formats, including processing operations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "572d3934-bc87-4b27-9a2c-1abd739ab4bd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Promoted dtype: Int16\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ "shape: (6, 1)\n",
+ "┌──────┐\n",
+ "│ col1 │\n",
+ "│ --- │\n",
+ "│ i16 │\n",
+ "╞══════╡\n",
+ "│ 0 │\n",
+ "│ 1 │\n",
+ "│ 2 │\n",
+ "│ 3 │\n",
+ "│ 4 │\n",
+ "│ 5 │\n",
+ "└──────┘"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Create a dynamic schema library\n",
+ "ac.delete_library(\"arrow_dynamic\")\n",
+ "lib_dyn = ac.create_library(\"arrow_dynamic\", LibraryOptions(dynamic_schema=True))\n",
+ "\n",
+    "# Type promotion: appending int16 data to a uint8 column promotes the stored dtype\n",
+ "\n",
+ "df_0 = pd.DataFrame({\"col1\": np.arange(3, dtype=np.uint8)})\n",
+ "df_1 = pd.DataFrame({\"col1\": np.arange(3, 6, dtype=np.int16)})\n",
+ "lib_dyn.write(\"type_promo\", df_0)\n",
+ "lib_dyn.append(\"type_promo\", df_1)\n",
+ "\n",
+ "polars_df = lib_dyn.read(\"type_promo\", output_format=OutputFormat.POLARS).data\n",
+ "print(f\"Promoted dtype: {polars_df['col1'].dtype}\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "33bd8187-7598-4ae1-aa5a-4f635a5d83d4",
+ "metadata": {},
+ "source": [
+ "## Missing Columns\n",
+ "\n",
+ "Columns missing from some segments are backfilled with nulls:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "4a80d248-549c-4cd8-8b36-c0a082fa8055",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Missing columns filled with nulls:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (4, 2)| col_a | col_b |
|---|
| i64 | str |
| 1 | null |
| 2 | null |
| null | "x" |
| null | "y" |
"
+ ],
+ "text/plain": [
+ "shape: (4, 2)\n",
+ "┌───────┬───────┐\n",
+ "│ col_a ┆ col_b │\n",
+ "│ --- ┆ --- │\n",
+ "│ i64 ┆ str │\n",
+ "╞═══════╪═══════╡\n",
+ "│ 1 ┆ null │\n",
+ "│ 2 ┆ null │\n",
+ "│ null ┆ x │\n",
+ "│ null ┆ y │\n",
+ "└───────┴───────┘"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_0 = pd.DataFrame({\"col_a\": [1, 2]})\n",
+ "df_1 = pd.DataFrame({\"col_b\": [\"x\", \"y\"]})\n",
+ "lib_dyn.write(\"missing_cols\", df_0)\n",
+ "lib_dyn.append(\"missing_cols\", df_1)\n",
+ "\n",
+ "polars_df = lib_dyn.read(\"missing_cols\", output_format=OutputFormat.POLARS).data\n",
+ "print(\"Missing columns filled with nulls:\")\n",
+ "polars_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "69e0d041",
+ "metadata": {},
+ "source": [
+ "## Processing with Dynamic Schema\n",
+ "\n",
+ "Processing operations work with both static and dynamic schema libraries.\n",
+ "All missing values from processing are filled with nulls:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "74c6cb24",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Result type: \n",
+ "Filtered rows: 12\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (12, 4)| value | float_1 | float_2 | combined_float |
|---|
| i64 | f64 | f64 | f64 |
| -1 | 0.0 | null | 0.0 |
| 2 | 1.0 | null | null |
| -1 | 3.0 | null | 3.0 |
| 2 | 4.0 | null | null |
| -1 | 6.0 | null | 6.0 |
| … | … | … | … |
| 8 | null | 1.0 | 1.0 |
| -5 | null | 3.0 | null |
| 8 | null | 4.0 | 4.0 |
| -5 | null | 6.0 | null |
| 8 | null | 7.0 | 7.0 |
"
+ ],
+ "text/plain": [
+ "shape: (12, 4)\n",
+ "┌───────┬─────────┬─────────┬────────────────┐\n",
+ "│ value ┆ float_1 ┆ float_2 ┆ combined_float │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ f64 ┆ f64 ┆ f64 │\n",
+ "╞═══════╪═════════╪═════════╪════════════════╡\n",
+ "│ -1 ┆ 0.0 ┆ null ┆ 0.0 │\n",
+ "│ 2 ┆ 1.0 ┆ null ┆ null │\n",
+ "│ -1 ┆ 3.0 ┆ null ┆ 3.0 │\n",
+ "│ 2 ┆ 4.0 ┆ null ┆ null │\n",
+ "│ -1 ┆ 6.0 ┆ null ┆ 6.0 │\n",
+ "│ … ┆ … ┆ … ┆ … │\n",
+ "│ 8 ┆ null ┆ 1.0 ┆ 1.0 │\n",
+ "│ -5 ┆ null ┆ 3.0 ┆ null │\n",
+ "│ 8 ┆ null ┆ 4.0 ┆ 4.0 │\n",
+ "│ -5 ┆ null ┆ 6.0 ┆ null │\n",
+ "│ 8 ┆ null ┆ 7.0 ┆ 7.0 │\n",
+ "└───────┴─────────┴─────────┴────────────────┘"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_proc_1 = pd.DataFrame({\"value\": [-1, 2, 10] * 3, \"float_1\": np.arange(9, dtype=np.float64)})\n",
+ "df_proc_2 = pd.DataFrame({\"value\": [-5, 8, 20] * 3, \"float_2\": np.arange(9, dtype=np.float64)})\n",
+ "lib_dyn.write(\"missing_cols\", df_proc_1)\n",
+ "lib_dyn.append(\"missing_cols\", df_proc_2)\n",
+ "\n",
+ "# Use lazy processing with Polars output\n",
+ "lazy_df = lib_dyn.read(\"missing_cols\", lazy=True, output_format=OutputFormat.POLARS)\n",
+ "lazy_df = lazy_df[lazy_df[\"value\"] < 10] # Filter based on column values\n",
+ "lazy_df[\"combined_float\"] = where(lazy_df[\"value\"] < 0, lazy_df[\"float_1\"], lazy_df[\"float_2\"]) # Project a column with interleaved nulls\n",
+ "result = lazy_df.collect().data\n",
+ "\n",
+ "print(f\"Result type: {type(result)}\")\n",
+ "print(f\"Filtered rows: {len(result)}\")\n",
+ "result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfd7ed36",
+ "metadata": {},
+ "source": [
+ "# Performance Benchmarks\n",
+ "\n",
+ "Arrow-based formats provide performance improvements for string-heavy data (because with Arrow we don't need to take the GIL).\n",
+ "\n",
+    "For numeric data, Arrow-based formats are in line with Pandas performance."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "83751c3c",
+ "metadata": {},
+ "source": [
+ "## Benchmark: Numeric Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "1bcb7858",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Pandas: 466.19 ms\n",
+ "Polars: 441.10 ms\n",
+ "Speedup: 1.06x\n"
+ ]
+ }
+ ],
+ "source": [
+ "import timeit\n",
+ "\n",
+ "# Create numeric dataset\n",
+ "df_numeric = pd.DataFrame({\n",
+ " f\"col_{i}\": np.random.randn(10_000_000) for i in range(10)\n",
+ "})\n",
+ "lib.write(\"bench_numeric\", df_numeric)\n",
+ "\n",
+ "# Benchmark\n",
+ "pandas_time = timeit.timeit(\n",
+ " lambda: lib.read(\"bench_numeric\", output_format=OutputFormat.PANDAS).data,\n",
+ " number=10\n",
+ ") / 10\n",
+ "\n",
+ "polars_time = timeit.timeit(\n",
+ " lambda: lib.read(\"bench_numeric\", output_format=OutputFormat.POLARS).data,\n",
+ " number=10\n",
+ ") / 10\n",
+ "\n",
+ "print(f\"Pandas: {pandas_time*1000:.2f} ms\")\n",
+ "print(f\"Polars: {polars_time*1000:.2f} ms\")\n",
+ "print(f\"Speedup: {pandas_time/polars_time:.2f}x\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "90c77511",
+ "metadata": {},
+ "source": [
+ "## Benchmark: String Data\n",
+ "\n",
+    "String-heavy reads show significant speedups (no GIL required for Arrow output):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "dece965c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Pandas: 1662.48 ms\n",
+ "Polars: 694.08 ms\n",
+ "Speedup: 2.40x\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "df_strings = pd.DataFrame({\n",
+ " f\"col_{i}\": np.random.randn(1_000_000).astype(str) for i in range(10)\n",
+ "})\n",
+ "lib.write(\"bench_strings\", df_strings)\n",
+ "\n",
+ "# Benchmark\n",
+ "pandas_time = timeit.timeit(\n",
+ " lambda: lib.read(\"bench_strings\", output_format=OutputFormat.PANDAS).data,\n",
+ " number=10\n",
+ ") / 10\n",
+ "\n",
+ "polars_time = timeit.timeit(\n",
+ " lambda: lib.read(\"bench_strings\", output_format=OutputFormat.POLARS).data,\n",
+ " number=10\n",
+ ") / 10\n",
+ "\n",
+ "print(f\"Pandas: {pandas_time*1000:.2f} ms\")\n",
+ "print(f\"Polars: {polars_time*1000:.2f} ms\")\n",
+ "print(f\"Speedup: {pandas_time/polars_time:.2f}x\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "311-adb-editable",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/.asv/results/benchmarks.json b/python/.asv/results/benchmarks.json
index 0c58f1e39b..1409507a27 100644
--- a/python/.asv/results/benchmarks.json
+++ b/python/.asv/results/benchmarks.json
@@ -1,6 +1,6 @@
{
"arrow.ArrowNumeric.peakmem_read": {
- "code": "class ArrowNumeric:\n def peakmem_read(self, rows, date_range):\n self.lib.read(self.symbol_name(rows), date_range=self.date_range)\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
+ "code": "class ArrowNumeric:\n def peakmem_read(self, rows, date_range):\n self.lib.read(self.symbol_name(rows), date_range=self.date_range)\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
"name": "arrow.ArrowNumeric.peakmem_read",
"param_names": [
"rows",
@@ -20,10 +20,10 @@
"timeout": 6000,
"type": "peakmemory",
"unit": "bytes",
- "version": "67ab80e297890f213706b5160255c969b0aee853adfd6ae53655c45b7e845f85"
+ "version": "aee945c1358b8a4c2dd274815ceb80a3cfaa085451149386f6674020b904feed"
},
"arrow.ArrowNumeric.peakmem_write": {
- "code": "class ArrowNumeric:\n def peakmem_write(self, rows, date_range):\n self.fresh_lib.write(f\"sym_{rows}\", self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
+ "code": "class ArrowNumeric:\n def peakmem_write(self, rows, date_range):\n self.fresh_lib.write(f\"sym_{rows}\", self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
"name": "arrow.ArrowNumeric.peakmem_write",
"param_names": [
"rows",
@@ -43,10 +43,10 @@
"timeout": 6000,
"type": "peakmemory",
"unit": "bytes",
- "version": "b28a54936bacb902a56cad0b9d235bd3e13ac04ac37687cf265c5a987b7ea2d1"
+ "version": "87b2ccbc5cf2ef33d45f934f653aa03f853a30fbadfa4710abb2018124b53386"
},
"arrow.ArrowNumeric.time_read": {
- "code": "class ArrowNumeric:\n def time_read(self, rows, date_range):\n self.lib.read(self.symbol_name(rows), date_range=self.date_range)\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
+ "code": "class ArrowNumeric:\n def time_read(self, rows, date_range):\n self.lib.read(self.symbol_name(rows), date_range=self.date_range)\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
"min_run_count": 2,
"name": "arrow.ArrowNumeric.time_read",
"number": 20,
@@ -71,11 +71,11 @@
"timeout": 6000,
"type": "time",
"unit": "seconds",
- "version": "ecb51f067ac7d40c4b9c9df11fa1a0b1f3c682a14f1ee974cf8c3eb9fcfd86d7",
+ "version": "aaf529494b84152ccaca0f605f0f2a9bfad037367e50e96ae5d906ce6b26741c",
"warmup_time": 0
},
"arrow.ArrowNumeric.time_write": {
- "code": "class ArrowNumeric:\n def time_write(self, rows, date_range):\n self.fresh_lib.write(f\"sym_{rows}\", self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
+ "code": "class ArrowNumeric:\n def time_write(self, rows, date_range):\n self.fresh_lib.write(f\"sym_{rows}\", self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
"min_run_count": 2,
"name": "arrow.ArrowNumeric.time_write",
"number": 20,
@@ -100,11 +100,11 @@
"timeout": 6000,
"type": "time",
"unit": "seconds",
- "version": "b0a49f584f44c8451a2fda903df76351a10368b81d639239cfa3023e9dee737b",
+ "version": "3511ddd99480aff7cc0c4c13d2809e213c523fdbe74137ce79f7483297e5686f",
"warmup_time": 0
},
"arrow.ArrowStrings.peakmem_read": {
- "code": "class ArrowStrings:\n def peakmem_read(self, rows, date_range, unique_string_count, arrow_string_format):\n self.lib.read(\n self.symbol_name(rows, unique_string_count),\n date_range=self.date_range,\n arrow_string_format_default=arrow_string_format,\n )\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
+ "code": "class ArrowStrings:\n def peakmem_read(self, rows, date_range, unique_string_count, arrow_string_format):\n self.lib.read(\n self.symbol_name(rows, unique_string_count),\n date_range=self.date_range,\n arrow_string_format_default=arrow_string_format,\n )\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
"name": "arrow.ArrowStrings.peakmem_read",
"param_names": [
"rows",
@@ -137,10 +137,10 @@
"timeout": 6000,
"type": "peakmemory",
"unit": "bytes",
- "version": "a273a5f3619b20f832ece06f1ee4650c4c4bb76e6de07702968a2098d39a1bab"
+ "version": "5bd5060492fc565ecb99ad5215bf454d71ca4cfedcd6f970efd61e04bec1fb7d"
},
"arrow.ArrowStrings.peakmem_write": {
- "code": "class ArrowStrings:\n def peakmem_write(self, rows, date_range, unique_string_count, arrow_string_format):\n # No point in running with all read time options\n if date_range is None and arrow_string_format == ArrowOutputStringFormat.CATEGORICAL:\n self.fresh_lib.write(self.symbol_name(rows, unique_string_count), self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
+ "code": "class ArrowStrings:\n def peakmem_write(self, rows, date_range, unique_string_count, arrow_string_format):\n # No point in running with all read time options\n if date_range is None and arrow_string_format == ArrowOutputStringFormat.CATEGORICAL:\n self.fresh_lib.write(self.symbol_name(rows, unique_string_count), self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
"name": "arrow.ArrowStrings.peakmem_write",
"param_names": [
"rows",
@@ -173,10 +173,10 @@
"timeout": 6000,
"type": "peakmemory",
"unit": "bytes",
- "version": "d1fa54f3f898dac7bbb27a41353877938711c2470afc8d5ad170bd8583f352d4"
+ "version": "d473ffa788d420e51973cdce360a1537aee61660dd9e06937a494ac93cc88597"
},
"arrow.ArrowStrings.time_read": {
- "code": "class ArrowStrings:\n def time_read(self, rows, date_range, unique_string_count, arrow_string_format):\n self.lib.read(\n self.symbol_name(rows, unique_string_count),\n date_range=self.date_range,\n arrow_string_format_default=arrow_string_format,\n )\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
+ "code": "class ArrowStrings:\n def time_read(self, rows, date_range, unique_string_count, arrow_string_format):\n self.lib.read(\n self.symbol_name(rows, unique_string_count),\n date_range=self.date_range,\n arrow_string_format_default=arrow_string_format,\n )\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
"min_run_count": 2,
"name": "arrow.ArrowStrings.time_read",
"number": 20,
@@ -214,11 +214,11 @@
"timeout": 6000,
"type": "time",
"unit": "seconds",
- "version": "8b5591c39110f8a625f84894f4f49f6e757449018c652c5644b63f94a4ea95ea",
+ "version": "b1977c6215c99d47d950dc30637270fadbf49c504f388ef9360eba4ea558a0ab",
"warmup_time": 0
},
"arrow.ArrowStrings.time_write": {
- "code": "class ArrowStrings:\n def time_write(self, rows, date_range, unique_string_count, arrow_string_format):\n # No point in running with all read time options\n if date_range is None and arrow_string_format == ArrowOutputStringFormat.CATEGORICAL:\n self.fresh_lib.write(self.symbol_name(rows, unique_string_count), self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
+ "code": "class ArrowStrings:\n def time_write(self, rows, date_range, unique_string_count, arrow_string_format):\n # No point in running with all read time options\n if date_range is None and arrow_string_format == ArrowOutputStringFormat.CATEGORICAL:\n self.fresh_lib.write(self.symbol_name(rows, unique_string_count), self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")",
"min_run_count": 2,
"name": "arrow.ArrowStrings.time_write",
"number": 20,
@@ -256,7 +256,7 @@
"timeout": 6000,
"type": "time",
"unit": "seconds",
- "version": "05391bc18c869bb06480f5117f87ac7cc6a90d50acd8405162c9e935f0de14b0",
+ "version": "13495c9c867964a946faa9f0877d374bafcae7e1a524eacefb708a7f8fa10e92",
"warmup_time": 0
},
"basic_functions.BasicFunctions.peakmem_read": {
diff --git a/python/arcticdb/arctic.py b/python/arcticdb/arctic.py
index 053dff77c1..2849b41e31 100644
--- a/python/arcticdb/arctic.py
+++ b/python/arcticdb/arctic.py
@@ -81,17 +81,14 @@ def __init__(
Can be overridden by specifying the encoding version in the LibraryOptions argument to create_library.
output_format: Union[OutputFormat, str], default = OutputFormat.PANDAS
- Controls the default output format of all operations returning a dataframe.
- The default behavior (OutputFormat.PANDAS) is to return `pandas.DataFrame`s or `pandas.Series` backed by
- numpy arrays.
- OutputFormat.EXPERIMENTAL_ARROW will return all dataframes as `pyarrow.Table`s. The arrow API is still
- experimental and the arrow layout might change in a minor release.
- Accepts the OutputFormat as either OutputFormat enum values or as case-insensitive strings like "pandas"
- and "experimental_arrow".
-
- arrow_string_format_default: Union[ArrowOutputStringFormat, "pa.DataType"] = ArrowOutputStringFormat.LARGE_STRING
- Controls the default string format used for `OutputFormat.EXPERIMENTAL_ARROW`.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
+ Default output format for all read operations on libraries created from this `Arctic` instance.
+ Can be overridden per library or per read operation.
+ See `OutputFormat` documentation for details on available formats.
+
+ arrow_string_format_default: Union[ArrowOutputStringFormat, "pa.DataType"], default = ArrowOutputStringFormat.LARGE_STRING
+ Default string column format when using `PYARROW` or `POLARS` output formats.
+ Can be overridden per library or per read operation.
+ See `ArrowOutputStringFormat` documentation for details on available string formats.
Examples
--------
@@ -196,14 +193,16 @@ def get_library(
Unused if create_if_missing is False.
output_format: Optional[Union[OutputFormat, str]], default = None
- Controls the default output format of all operations on the library returning a dataframe.
- For more information see documentation of `Arctic.__init__`.
- If `None` uses the output format from the Arctic instance.
+ Default output format for all read operations on this library.
+ If `None`, uses the output format from the `Arctic` instance.
+ Can be overridden per read operation.
+ See `OutputFormat` documentation for details on available formats.
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
- If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- If `None` uses the default arrow_string_format from the `Library` instance.
+ Default string column format when using `PYARROW` or `POLARS` output formats on this library.
+ If `None`, uses the `arrow_string_format_default` from the `Arctic` instance.
+ Can be overridden per read operation.
+ See `ArrowOutputStringFormat` documentation for details on available string formats.
Examples
--------
@@ -268,14 +267,17 @@ def create_library(
EnterpriseLibraryOptions. These options are only relevant to ArcticDB enterprise users.
output_format: Optional[Union[OutputFormat, str]], default = None
- Controls the default output format of all operations on the library returning a dataframe.
- For more information see documentation of `Arctic.__init__`.
- If `None` uses the output format from the Arctic instance.
+ Default output format for all read operations on this library.
+ If `None`, uses the output format from the `Arctic` instance.
+ Can be overridden per read operation.
+ See `OutputFormat` documentation for details on available formats.
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
- If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- If `None` uses the default arrow_string_format from the `Library` instance.
+ Default string column format when using `PYARROW` or `POLARS` output formats on this library.
+ If `None`, uses the `arrow_string_format_default` from the `Arctic` instance.
+ Can be overridden per read operation.
+ See `ArrowOutputStringFormat` documentation for details on available string formats.
+ Note that this setting is only applied to the runtime `Library` instance and is not stored as part of the library configuration.
Examples
--------
diff --git a/python/arcticdb/options.py b/python/arcticdb/options.py
index 0b22077c80..66899aac0c 100644
--- a/python/arcticdb/options.py
+++ b/python/arcticdb/options.py
@@ -169,21 +169,40 @@ def __repr__(self):
# TODO: Use enum.StrEnum when we no longer need to support python 3.9
class OutputFormat(str, Enum):
+ """
+ Controls the output format of operations which return dataframes. All APIs which take an `output_format` argument
+ accept the enum values and case-insensitive strings. E.g. all of `OutputFormat.PYARROW`, `"PYARROW"`, `"pyarrow"`
+ will be interpreted as `OutputFormat.PYARROW`.
+
+ PANDAS (default):
+ Dataframes are returned as `pandas.DataFrame` or `pandas.Series` objects backed by numpy arrays.
+
+ PYARROW:
+ Dataframes are returned as `pyarrow.Table` objects using Apache Arrow's columnar memory format.
+ Provides better performance than `PANDAS`, especially for dataframes containing many string columns.
+ String format can be customized via `ArrowOutputStringFormat`.
+
+ POLARS:
+ Dataframes are returned as `polars.DataFrame` objects using Apache Arrow's columnar memory format.
+ Provides better performance than `PANDAS`, especially for dataframes containing many string columns.
+ String format can be customized via `ArrowOutputStringFormat`.
+ """
+
PANDAS = "PANDAS"
- EXPERIMENTAL_ARROW = "EXPERIMENTAL_ARROW"
- EXPERIMENTAL_POLARS = "EXPERIMENTAL_POLARS"
+ PYARROW = "PYARROW"
+ POLARS = "POLARS"
def output_format_to_internal(output_format: Union[OutputFormat, str]) -> InternalOutputFormat:
if output_format.lower() == OutputFormat.PANDAS.lower():
return InternalOutputFormat.PANDAS
- elif output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower():
+ elif output_format.lower() == OutputFormat.PYARROW.lower():
if not _PYARROW_AVAILABLE:
raise ModuleNotFoundError(
"ArcticDB's pyarrow optional dependency missing but is required to use arrow output format."
)
return InternalOutputFormat.ARROW
- elif output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
+ elif output_format.lower() == OutputFormat.POLARS.lower():
if not _PYARROW_AVAILABLE or not _POLARS_AVAILABLE:
raise ModuleNotFoundError(
"ArcticDB's pyarrow or polars optional dependencies are missing but are required to use polars output format."
@@ -195,24 +214,31 @@ def output_format_to_internal(output_format: Union[OutputFormat, str]) -> Intern
class ArrowOutputStringFormat(str, Enum):
"""
- Used to specify string format when output_format=OutputFormat.EXPERIMENTAL_ARROW.
- Arguments allow specifying either the enum value or the corresponding pyarrow.DataType
+ Controls the string column format when using `PYARROW` or `POLARS` output formats.
+ Accepts either the enum value or the corresponding `pyarrow.DataType`.
LARGE_STRING (default):
- Produces string columns with type `pa.large_string()`. Total length of strings must fit in a 64-bit integer.
- Does not deduplicate strings, so has better performance for columns with many unique strings.
+ Uses 64-bit variable-size encoding.
+ PyArrow: `pa.large_string()`, Polars: `pl.String`
+ Supports up to 2⁶³-1 bytes total string length per Arrow array.
+ Best for general-purpose use and when working with large string data.
SMALL_STRING:
- Produces string columns with type `pa.string()`. Total length of strings must fit in a 32-bit integer.
- Does not deduplicate strings, so has better performance for columns with many unique strings.
- Slightly faster than `LARGE_STRING` but does not work with very long strings.
+ Uses 32-bit variable-size encoding.
+ PyArrow: `pa.string()`, Polars: Not supported
+ Supports up to 2³¹-1 bytes total string length per Arrow array.
+ Only supported with PyArrow because Polars does not support small strings.
+ Slightly more memory efficient than `LARGE_STRING` when string data is known to be small.
CATEGORICAL and DICTIONARY_ENCODED:
- Both are different aliases for the same string format. Produces string columns with type
- `pa.dictionary(pa.int32(), pa.large_string())`. Total length of strings must fit in a 64-bit integer. Splitting in
- record batches guarantees that 32-bit dictionary keys are sufficient.
- Does deduplicate strings, so has better performance for columns with few unique strings.
-
+ Both are aliases for dictionary-encoded strings with int32 indices.
+ PyArrow: `pa.dictionary(pa.int32(), pa.large_string())`, Polars: `pl.Categorical`
+ Best for columns with low cardinality (few unique values repeated many times).
+ Deduplicates strings, reducing memory usage and improving performance when the number of
+ unique values is much smaller than the total number of rows.
+
+ For more details on physical layouts, see the Apache Arrow specification:
+ https://arrow.apache.org/docs/format/Columnar.html
"""
CATEGORICAL = "CATEGORICAL"
@@ -222,7 +248,7 @@ class ArrowOutputStringFormat(str, Enum):
def arrow_output_string_format_to_internal(
- arrow_string_format: Union[ArrowOutputStringFormat, "pa.DataType"],
+ arrow_string_format: Union[ArrowOutputStringFormat, "pa.DataType"], output_format: Union[OutputFormat, str]
) -> InternalArrowOutputStringFormat:
if (
arrow_string_format == ArrowOutputStringFormat.CATEGORICAL
@@ -242,6 +268,10 @@ def arrow_output_string_format_to_internal(
or _PYARROW_AVAILABLE
and arrow_string_format == pa.string()
):
+ if output_format.lower() == OutputFormat.POLARS.lower():
+ raise ValueError(
+ "SMALL_STRING is not supported with POLARS output format. Please use LARGE_STRING instead."
+ )
return InternalArrowOutputStringFormat.SMALL_STRING
else:
raise ValueError(f"Unkown ArrowOutputStringFormat: {arrow_string_format}")
diff --git a/python/arcticdb/util/arrow.py b/python/arcticdb/util/arrow.py
index ce6b9d2964..d0f1375161 100644
--- a/python/arcticdb/util/arrow.py
+++ b/python/arcticdb/util/arrow.py
@@ -25,7 +25,7 @@ def stringify_dictionary_encoded_columns(table, string_type=None):
def convert_arrow_to_pandas_for_tests(table):
"""
- Converts `pa.Table` outputted via `output_format=OutputFormat.EXPERIMENTAL_ARROW` to a `pd.DataFrame` so it would
+ Converts `pa.Table` outputted via `output_format=OutputFormat.PYARROW` to a `pd.DataFrame` so it would
be identical to the one outputted via `output_format=OutputFormat.PANDAS`. This requires the following changes:
- Replaces dictionary encoded string columns with regular string columns.
- Fills null values in int columns with zeros.
diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py
index ad8ffe7c88..a926291fff 100644
--- a/python/arcticdb/version_store/_normalization.py
+++ b/python/arcticdb/version_store/_normalization.py
@@ -792,7 +792,6 @@ def generate_original_column_names():
pandas_meta = norm_meta.df.common
elif input_type == "series":
# For pandas series we always return a dataframe (to not lose the index information).
- # TODO: Return a `pyarrow.Array` if index is not physically stored (Monday ref: 9360502457)
pandas_meta = norm_meta.series.common
elif input_type == "experimental_arrow":
if norm_meta.experimental_arrow.has_index:
diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py
index f1776517b5..95507b59c5 100644
--- a/python/arcticdb/version_store/_store.py
+++ b/python/arcticdb/version_store/_store.py
@@ -385,7 +385,7 @@ def _set_allow_arrow_input(self, allow_arrow_input: bool = True):
def _set_output_format_for_pipeline_tests(self, output_format):
self.set_output_format(output_format)
- if output_format == OutputFormat.EXPERIMENTAL_ARROW:
+ if output_format == OutputFormat.PYARROW:
self._test_convert_arrow_back_to_pandas = True
@classmethod
@@ -1284,21 +1284,17 @@ def batch_read(
For more information see the documentation for the QueryBuilder class.
i-th entry corresponds to i-th element of `symbols`.
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
- Controls the default string format used for `ARROW` or `POLARS` output format.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- It serves as the default for the entire batch.
- arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None,
- Controls the string format per column used for `ARROW` or `POLARS` output format.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- It is applied to all symbols which don't have a `per_symbol_arrow_string_format_per_column` set.
- per_symbol_arrow_string_format_default: Optional[List[Optional[Union[ArrowOutputStringFormat, "pa.DataType"]]]], default=None,
- Controls the string format per column used for `ARROW` or `POLARS` output format.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- It serves as the default per symbol. It overrides the global `arrow_string_format_default` setting
- per_symbol_arrow_string_format_per_column: Optional[List[Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]]]], default=None,
- Controls the string format per column used for `ARROW` or `POLARS` output format.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- It defines the setting per symbol and per column. It overrides all other string format settings.
+ String column format when using `PYARROW` or `POLARS` output formats. Serves as the default for the entire batch.
+ See `ArrowOutputStringFormat` documentation for details on available string formats.
+ arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None
+ Per-column overrides for `arrow_string_format_default`. Keys are column names.
+ Applied to all symbols without a `per_symbol_arrow_string_format_per_column` set.
+ per_symbol_arrow_string_format_default: Optional[List[Optional[Union[ArrowOutputStringFormat, "pa.DataType"]]]], default=None
+ Per-symbol override for `arrow_string_format_default`. Overrides the batch-level default.
+ i-th entry corresponds to i-th element of `symbols`.
+ per_symbol_arrow_string_format_per_column: Optional[List[Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]]]], default=None
+ Per-symbol, per-column overrides. Takes precedence over all other string format settings.
+ i-th entry corresponds to i-th element of `symbols`.
Examples
--------
@@ -2159,12 +2155,13 @@ def _get_read_options_and_output_format(
proto_cfg,
global_default=ArrowOutputStringFormat.LARGE_STRING,
**kwargs,
- )
+ ),
+ output_format,
)
)
read_options.set_arrow_output_per_column_string_format(
{
- key: arrow_output_string_format_to_internal(value)
+ key: arrow_output_string_format_to_internal(value, output_format)
for key, value in resolve_defaults(
"arrow_string_format_per_column", proto_cfg, global_default={}, **kwargs
).items()
@@ -2728,8 +2725,8 @@ def _adapt_frame_data(self, frame_data, norm, output_format):
)
if self._test_convert_arrow_back_to_pandas:
data = convert_arrow_to_pandas_for_tests(data)
- if output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
- data = pl.from_arrow(data)
+ if output_format.lower() == OutputFormat.POLARS.lower():
+ data = pl.from_arrow(data, rechunk=False)
else:
data = self._normalizer.denormalize(frame_data, norm)
if norm.HasField("custom"):
diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py
index c60fddcd42..5b8ff09b8c 100644
--- a/python/arcticdb/version_store/library.py
+++ b/python/arcticdb/version_store/library.py
@@ -2026,17 +2026,17 @@ def read(
on `LazyDataFrame` for more details.
output_format: Optional[Union[OutputFormat, str]], default=None
- Controls the output format of the result dataframe.
- For more information see documentation of `Arctic.__init__`.
- If `None` uses the default output format from the `Library` instance.
+ Output format for the returned dataframe.
+ If `None`, uses the output format from the `Library` instance.
+ See `OutputFormat` documentation for details on available formats.
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
- If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- If `None` uses the default arrow_string_format from the `Library` instance.
+ String column format when using `PYARROW` or `POLARS` output formats.
+ If `None`, uses the `arrow_string_format_default` from the `Library` instance.
+ See `ArrowOutputStringFormat` documentation for details on available string formats.
arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None
- Provides per column name overrides for `arrow_string_format_default`
+ Per-column overrides for `arrow_string_format_default`. Keys are column names.
Returns
-------
@@ -2063,10 +2063,9 @@ def read(
1 6
2 7
- Passing an output_format can change the resulting dataframe type. E.g. we can use the experimental arrow output
- format:
+ Passing an output_format can change the resulting dataframe type. For example, to return a PyArrow table:
- >>> lib.read("symbol", output_format="EXPERIMENTAL_ARROW").data
+ >>> lib.read("symbol", output_format="PYARROW").data
pyarrow.Table
column: int64
----
@@ -2128,19 +2127,20 @@ def read_batch(
documentation on `LazyDataFrameCollection` for more details.
output_format: Optional[Union[OutputFormat, str]], default=None
- Controls the output format of the result dataframes.
- For more information see documentation of `Arctic.__init__`.
- If `None` uses the default output format from the `Library` instance.
+ Output format for the returned dataframes.
+ If `None`, uses the output format from the `Library` instance.
+ See `OutputFormat` documentation for details on available formats.
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
- If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- It serves as the default for the entire batch. The string format settings inside the `ReadRequest`s will
- override this batch level setting.
+ String column format when using `PYARROW` or `POLARS` output formats.
+ Serves as the default for the entire batch. String format settings in individual `ReadRequest` objects
+ override this batch-level setting.
+ If `None`, uses the `arrow_string_format_default` from the `Library` instance.
+ See `ArrowOutputStringFormat` documentation for details on available string formats.
- arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None,
- Provides per column name overrides for `arrow_string_format_default`. It is only applied to symbols which
- don't have a `arrow_string_format_per_column` set in their `ReadRequest`.
+ arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None
+ Per-column overrides for `arrow_string_format_default`. Keys are column names.
+ Only applied to symbols that don't have `arrow_string_format_per_column` set in their `ReadRequest`.
Returns
-------
@@ -2309,17 +2309,17 @@ def read_batch_and_join(
individual dataframes, and will be applied to the joined data.
output_format: Optional[Union[OutputFormat, str]], default=None
- Controls the output format of the result dataframe.
- For more information see documentation of `Arctic.__init__`.
- If `None` uses the default output format from the `Library` instance.
+ Output format for the returned joined dataframe.
+ If `None`, uses the output format from the `Library` instance.
+ See `OutputFormat` documentation for details on available formats.
arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None
- If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow.
- See documentation of `ArrowOutputStringFormat` for more information on the different options.
- If `None` uses the default arrow_string_format from the `Library` instance.
+ String column format when using `PYARROW` or `POLARS` output formats.
+ If `None`, uses the `arrow_string_format_default` from the `Library` instance.
+ See `ArrowOutputStringFormat` documentation for details on available string formats.
arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None
- Provides per column name overrides for `arrow_string_format_default`
+ Per-column overrides for `arrow_string_format_default`. Keys are column names.
Returns
-------
diff --git a/python/benchmarks/arrow.py b/python/benchmarks/arrow.py
index 3476722a3f..1c2371c29b 100644
--- a/python/benchmarks/arrow.py
+++ b/python/benchmarks/arrow.py
@@ -41,7 +41,7 @@ def setup_cache(self):
self.logger.info(f"SETUP_CACHE TIME: {time.time() - start}")
def _setup_cache(self):
- self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)
+ self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)
num_rows, date_ranges = self.params
num_cols = 9 # 10 including the index column
self.ac.delete_library(self.lib_name_prewritten)
@@ -63,7 +63,7 @@ def teardown(self, rows, date_range):
del self.ac
def setup(self, rows, date_range):
- self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)
+ self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)
self.lib = self.ac.get_library(self.lib_name_prewritten)
self.lib._nvs._set_allow_arrow_input()
if date_range is None:
@@ -126,7 +126,7 @@ def _generate_table(self, num_rows, num_cols, unique_string_count):
return pa.Table.from_pandas(df)
def _setup_cache(self):
- self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)
+ self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)
num_rows, date_ranges, unique_string_counts, arrow_string_format = self.params
self.ac.delete_library(self.lib_name_prewritten)
self.ac.create_library(self.lib_name_prewritten)
@@ -145,7 +145,7 @@ def teardown(self, rows, date_range, unique_string_count, arrow_string_format):
del self.ac
def setup(self, rows, date_range, unique_string_count, arrow_string_format):
- self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)
+ self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)
self.lib = self.ac.get_library(self.lib_name_prewritten)
self.lib._nvs._set_allow_arrow_input()
if date_range is None:
diff --git a/python/tests/conftest.py b/python/tests/conftest.py
index a795212cb6..4218c9ac06 100644
--- a/python/tests/conftest.py
+++ b/python/tests/conftest.py
@@ -1101,7 +1101,7 @@ def lmdb_version_store_v2(version_store_factory, lib_name) -> NativeVersionStore
@pytest.fixture
def lmdb_version_store_arrow(lmdb_version_store_v1) -> NativeVersionStore:
store = lmdb_version_store_v1
- store.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ store.set_output_format(OutputFormat.PYARROW)
store._set_allow_arrow_input()
return store
@@ -1109,9 +1109,7 @@ def lmdb_version_store_arrow(lmdb_version_store_v1) -> NativeVersionStore:
# Explicitly not including `OutputFormat.EXPERIMENTAL_POLARS` as `polars.to_pandas()` is not index aware, so all
# `assert_frame_equal_with_arrow` would not work. Also POLARS is just a thin wrapper on top of pyarrow, so testing
# just one is sufficent.
-@pytest.fixture(
- params=[OutputFormat.PANDAS, pytest.param(OutputFormat.EXPERIMENTAL_ARROW, marks=PYARROW_POST_PROCESSING)]
-)
+@pytest.fixture(params=[OutputFormat.PANDAS, pytest.param(OutputFormat.PYARROW, marks=PYARROW_POST_PROCESSING)])
def any_output_format(request) -> OutputFormat:
return request.param
@@ -1186,7 +1184,7 @@ def lmdb_version_store_dynamic_schema(
@pytest.fixture
def lmdb_version_store_dynamic_schema_arrow(lmdb_version_store_dynamic_schema_v1) -> NativeVersionStore:
store = lmdb_version_store_dynamic_schema_v1
- store.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ store.set_output_format(OutputFormat.PYARROW)
store._set_allow_arrow_input()
return store
diff --git a/python/tests/integration/arcticdb/test_unicode_strings.py b/python/tests/integration/arcticdb/test_unicode_strings.py
index ba3b78d35f..a2f1af505e 100644
--- a/python/tests/integration/arcticdb/test_unicode_strings.py
+++ b/python/tests/integration/arcticdb/test_unicode_strings.py
@@ -33,9 +33,7 @@ def test_fixed_width_blns(lmdb_version_store, any_arrow_string_format):
vit = lib.read(symbol)
assert_frame_equal(df, vit.data)
# Arrow
- received = cast_string_columns(
- lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data, pa.string()
- ).to_pandas()
+ received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.PYARROW).data, pa.string()).to_pandas()
assert_frame_equal(df, received)
@@ -53,7 +51,7 @@ def test_write_blns(lmdb_version_store, any_arrow_string_format):
lib._set_allow_arrow_input()
table = pa.Table.from_pandas(df)
lib.write(symbol, table, index_column="ts")
- received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data, pa.string())
+ received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.PYARROW).data, pa.string())
assert table.equals(received)
@@ -77,7 +75,7 @@ def test_append_blns(lmdb_version_store, any_arrow_string_format):
table_second_half = pa.Table.from_pandas(df_second_half)
lib.write(symbol, table_first_half, index_column="ts")
lib.append(symbol, table_second_half, index_column="ts")
- received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data, pa.string())
+ received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.PYARROW).data, pa.string())
expected = pa.Table.from_pandas(df)
assert expected.equals(received)
@@ -105,7 +103,7 @@ def test_update_blns(lmdb_version_store, any_arrow_string_format):
table_middle_half = pa.Table.from_pandas(df_middle_half)
lib.write(symbol, table_removed_middle, index_column="ts")
lib.update(symbol, table_middle_half, index_column="ts")
- received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data, pa.string())
+ received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.PYARROW).data, pa.string())
expected = pa.Table.from_pandas(df)
assert expected.equals(received)
@@ -132,7 +130,7 @@ def test_batch_read_blns(lmdb_version_store, any_arrow_string_format):
lib._set_allow_arrow_input()
tables = [pa.Table.from_pandas(df) for df in dfs]
lib.batch_write(symbols, tables, index_column_vector=["ts"] * num_symbols)
- res = lib.batch_read(symbols, query_builder=qbs, output_format=OutputFormat.EXPERIMENTAL_ARROW)
+ res = lib.batch_read(symbols, query_builder=qbs, output_format=OutputFormat.PYARROW)
expr = pa.compute.field("ints") > 50
for idx, sym in enumerate(symbols):
expected = tables[idx]
@@ -165,7 +163,7 @@ def test_recursive_normalizers_blns(lmdb_version_store_v1, any_arrow_string_form
table = pa.Table.from_pandas(df)
dict_data = {s: table for s in keys}
lib.write(symbol, dict_data, recursive_normalizers=True)
- received = lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data
+ received = lib.read(symbol, output_format=OutputFormat.PYARROW).data
for key in keys:
assert key in received.keys()
assert table.equals(cast_string_columns(received[key], pa.string()))
diff --git a/python/tests/unit/arcticdb/test_arrow_api.py b/python/tests/unit/arcticdb/test_arrow_api.py
index a74c2fb0e8..135fbfb0f8 100644
--- a/python/tests/unit/arcticdb/test_arrow_api.py
+++ b/python/tests/unit/arcticdb/test_arrow_api.py
@@ -16,18 +16,18 @@
OutputFormat.PANDAS,
"PANDAS",
"pandas",
- OutputFormat.EXPERIMENTAL_ARROW,
- "EXPERIMENTAL_ARROW",
- "experimental_arrow",
- OutputFormat.EXPERIMENTAL_POLARS,
- "EXPERIMENTAL_POLARS",
- "experimental_polars",
+ OutputFormat.PYARROW,
+ "PYARROW",
+ "pyarrow",
+ OutputFormat.POLARS,
+ "POLARS",
+ "polars",
]
no_str_output_format_args = [
None,
OutputFormat.PANDAS,
- OutputFormat.EXPERIMENTAL_ARROW,
- OutputFormat.EXPERIMENTAL_POLARS,
+ OutputFormat.PYARROW,
+ OutputFormat.POLARS,
]
@@ -37,9 +37,9 @@ def expected_output_type(arctic_output_format, library_output_format, output_for
)
if expected_output_format.lower() == OutputFormat.PANDAS.lower():
return pd.DataFrame
- if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower():
+ if expected_output_format.lower() == OutputFormat.PYARROW.lower():
return pa.Table
- if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower():
+ if expected_output_format.lower() == OutputFormat.POLARS.lower():
return pl.DataFrame
raise ValueError("Unexpected format")
@@ -177,7 +177,7 @@ def test_basic_modifications(lmdb_library, allow_arrow_input):
lib.write(sym, write_table, index_column="ts")
lib.append(sym, append_table, index_column="ts")
lib.update(sym, update_table, index_column="ts")
- received = lib.read(sym, output_format=OutputFormat.EXPERIMENTAL_ARROW).data
+ received = lib.read(sym, output_format=OutputFormat.PYARROW).data
expected = pa.table(
{
"col": pa.array([1, 5, 6, 4], pa.int64()),
@@ -222,7 +222,7 @@ def test_batch_modifications(lmdb_library, allow_arrow_input):
lib.write_batch([WritePayload(sym, write_table, index_column="ts")])
lib.append_batch([WritePayload(sym, append_table, index_column="ts")])
lib.update_batch([UpdatePayload(sym, update_table, index_column="ts")])
- received = lib.read(sym, output_format=OutputFormat.EXPERIMENTAL_ARROW).data
+ received = lib.read(sym, output_format=OutputFormat.PYARROW).data
expected = pa.table(
{
"col": pa.array([1, 5, 6, 4], pa.int64()),
@@ -258,7 +258,7 @@ def test_write_pickle(lmdb_library, batch, allow_arrow_input):
lib.write_pickle(sym, table)
if allow_arrow_input:
assert not lib._nvs.is_symbol_pickled(sym)
- received = lib.read(sym, output_format=OutputFormat.EXPERIMENTAL_ARROW).data
+ received = lib.read(sym, output_format=OutputFormat.PYARROW).data
assert table.equals(received)
else:
assert lib._nvs.is_symbol_pickled(sym)
@@ -288,7 +288,7 @@ def test_stage(lmdb_library, allow_arrow_input):
lib.stage(sym, table_0, index_column="ts")
lib.stage(sym, table_1, index_column="ts")
lib.finalize_staged_data(sym)
- received = lib.read(sym, output_format=OutputFormat.EXPERIMENTAL_ARROW).data
+ received = lib.read(sym, output_format=OutputFormat.PYARROW).data
expected = pa.table(
{
"col": pa.array([1, 2, 3, 4], pa.int64()),
@@ -312,9 +312,7 @@ def test_stage(lmdb_library, allow_arrow_input):
def test_read_arctic_strings(
lmdb_storage, lib_name, arctic_str_format, library_str_format, read_str_format_default, read_str_format_per_column
):
- ac = lmdb_storage.create_arctic(
- output_format=OutputFormat.EXPERIMENTAL_ARROW, arrow_string_format_default=arctic_str_format
- )
+ ac = lmdb_storage.create_arctic(output_format=OutputFormat.PYARROW, arrow_string_format_default=arctic_str_format)
lib = ac.create_library(lib_name, arrow_string_format_default=library_str_format)
sym = "sym"
df = pd.DataFrame({"col": ["some", "strings", "in", "this", "column"]})
@@ -345,7 +343,7 @@ def test_read_arctic_strings(
@pytest.mark.parametrize("lazy", [True, False])
@pytest.mark.parametrize("batch_default", [ArrowOutputStringFormat.SMALL_STRING, None])
def test_read_batch_strings(lmdb_storage, lib_name, lazy, batch_default):
- ac = lmdb_storage.create_arctic(output_format=OutputFormat.EXPERIMENTAL_ARROW)
+ ac = lmdb_storage.create_arctic(output_format=OutputFormat.PYARROW)
lib = ac.create_library(lib_name)
sym_1, sym_2 = "sym_1", "sym_2"
df_1 = pd.DataFrame({"col_1": ["a", "a", "bb"], "col_2": ["x", "y", "z"]})
@@ -380,7 +378,7 @@ def test_read_batch_strings(lmdb_storage, lib_name, lazy, batch_default):
@pytest.mark.parametrize("default", [None, ArrowOutputStringFormat.SMALL_STRING])
@pytest.mark.parametrize("per_column", [None, ArrowOutputStringFormat.CATEGORICAL])
def test_read_batch_and_join_strings(lmdb_storage, lib_name, default, per_column):
- ac = lmdb_storage.create_arctic(output_format=OutputFormat.EXPERIMENTAL_ARROW)
+ ac = lmdb_storage.create_arctic(output_format=OutputFormat.PYARROW)
lib = ac.create_library(lib_name, library_options=LibraryOptions(dynamic_schema=True))
sym_1, sym_2 = "sym_1", "sym_2"
df_1 = pd.DataFrame({"col_1": ["a", "a", "bb"], "col_2": ["x", "y", "z"]})
diff --git a/python/tests/unit/arcticdb/version_store/test_arrow_read.py b/python/tests/unit/arcticdb/version_store/test_arrow_read.py
index 3be3276752..ea5c47af5f 100644
--- a/python/tests/unit/arcticdb/version_store/test_arrow_read.py
+++ b/python/tests/unit/arcticdb/version_store/test_arrow_read.py
@@ -45,7 +45,7 @@ def test_basic_with_index(lmdb_version_store_arrow):
def test_basic_small_slices(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
df = pd.DataFrame({"x": np.arange(10)})
lib.write("arrow", df)
table = lib.read("arrow").data
@@ -54,7 +54,7 @@ def test_basic_small_slices(lmdb_version_store_tiny_segment):
def test_basic_small_slices_with_index(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
df = pd.DataFrame({"x": np.arange(10)}, index=pd.date_range(pd.Timestamp(0), periods=10))
lib.write("arrow", df)
table = lib.read("arrow").data
@@ -137,7 +137,7 @@ def test_strings_basic(lmdb_version_store_arrow, dynamic_strings, any_arrow_stri
@pytest.mark.parametrize("row_range", [None, (2, 3), (2, 4), (2, 5), (2, 6), (3, 4), (3, 5), (3, 6)])
def test_strings_with_nones_and_nans(lmdb_version_store_tiny_segment, row_range, any_arrow_string_format):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
lib.set_arrow_string_format_default(any_arrow_string_format)
# lmdb_version_store_tiny_segment has 2 rows per segment
# This column is constructed so that every 2-element permutation of strings, Nones, and NaNs are tested
@@ -214,7 +214,7 @@ def test_strings_multiple_segments_and_columns(
lmdb_version_store_tiny_segment, dynamic_strings, any_arrow_string_format
):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
lib.set_arrow_string_format_default(any_arrow_string_format)
df = pd.DataFrame(
{
@@ -245,7 +245,7 @@ def test_date_range_corner_cases(version_store_factory, date_range_start, date_r
lib = version_store_factory(
segment_row_size=2, column_group_size=2, dynamic_schema=dynamic_schema, dynamic_strings=True
)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
df = pd.DataFrame(
data={
"col1": np.arange(7),
@@ -281,7 +281,7 @@ def test_date_range_corner_cases(version_store_factory, date_range_start, date_r
)
def test_date_range_between_index_values(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
df = pd.DataFrame(
data={
"col1": np.arange(2),
@@ -316,7 +316,7 @@ def test_date_range_empty_result(version_store_factory, date_range_start, dynami
lib = version_store_factory(
segment_row_size=2, column_group_size=2, dynamic_schema=dynamic_schema, dynamic_strings=True
)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
df = pd.DataFrame(
data={
"col1": np.arange(7),
@@ -349,7 +349,7 @@ def test_date_range_empty_result(version_store_factory, date_range_start, dynami
@pytest.mark.parametrize(
"output_format",
[
- OutputFormat.EXPERIMENTAL_ARROW,
+ OutputFormat.PYARROW,
pytest.param(
OutputFormat.PANDAS,
marks=pytest.mark.skipif(IS_PANDAS_ONE, reason="Monday ref: 18013444785"),
@@ -473,7 +473,7 @@ def test_row_range_corner_cases(version_store_factory, row_range_start, row_rang
lib = version_store_factory(
segment_row_size=2, column_group_size=2, dynamic_schema=dynamic_schema, dynamic_strings=True
)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
df = pd.DataFrame(
data={
"col1": np.arange(7),
@@ -511,7 +511,7 @@ def test_row_range_empty_result(version_store_factory, row_range_start, dynamic_
lib = version_store_factory(
segment_row_size=2, column_group_size=2, dynamic_schema=dynamic_schema, dynamic_strings=True
)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
df = pd.DataFrame(
data={
"col1": np.arange(7),
@@ -541,7 +541,7 @@ def test_row_range_empty_result(version_store_factory, row_range_start, dynamic_
@pytest.mark.parametrize(
"output_format",
[
- OutputFormat.EXPERIMENTAL_ARROW,
+ OutputFormat.PYARROW,
pytest.param(
OutputFormat.PANDAS,
marks=pytest.mark.skipif(IS_PANDAS_ONE, reason="Monday ref: 18013444785"),
@@ -620,7 +620,7 @@ def test_with_querybuilder(lmdb_version_store_arrow):
def test_arrow_layout(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
lib_tool = lib.library_tool()
num_rows = 100
df = pd.DataFrame(
@@ -706,7 +706,7 @@ def test_arrow_dynamic_schema_missing_columns_numeric(version_store_factory, row
if rows_per_column == 100_000 and segment_row_size != 100_000:
pytest.skip("Slow to write and doesn't tell us anything the other variants do not")
lib = version_store_factory(segment_row_size=segment_row_size, dynamic_schema=True)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
sym = "test_arrow_dynamic_schema_missing_columns_numeric"
write_table = pa.table({"col1": pa.array([1] * rows_per_column, pa.int64())})
append_table = pa.table({"col2": pa.array([2] * rows_per_column, pa.int32())})
@@ -814,7 +814,7 @@ def test_arrow_dynamic_schema_missing_columns_hypothesis(
@pytest.mark.parametrize("dynamic_schema", [True, False])
def test_arrow_sparse_floats_basic(version_store_factory, type, dynamic_schema):
lib = version_store_factory(dynamic_schema=dynamic_schema)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
sym = "test_arrow_sparse_floats_basic"
table = pa.table({"col": pa.array([1, None, None, 2, None], type)})
assert table.column("col").null_count == 3
@@ -829,7 +829,7 @@ def test_arrow_sparse_floats_basic(version_store_factory, type, dynamic_schema):
@pytest.mark.parametrize("dynamic_schema", [True, False])
def test_arrow_sparse_floats_row_sliced(version_store_factory, type, dynamic_schema):
lib = version_store_factory(segment_row_size=2, dynamic_schema=dynamic_schema)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
sym = "test_arrow_sparse_floats_row_sliced"
table_0 = pa.table({"col": pa.array([1, None], type)})
table_1 = pa.table({"col": pa.array([2, 3], type)})
@@ -850,7 +850,7 @@ def test_arrow_sparse_floats_row_sliced(version_store_factory, type, dynamic_sch
@pytest.mark.parametrize("date_range_width", list(range(0, 15)))
def test_arrow_sparse_floats_date_range(version_store_factory, dynamic_schema, date_range_start, date_range_width):
lib = version_store_factory(segment_row_size=5, dynamic_schema=dynamic_schema)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
sym = "test_arrow_sparse_floats_date_range"
table_0 = pa.table({"col": pa.array([1, None, 2, None, 3], pa.float64())})
table_1 = pa.table({"col": pa.array([4, None, None, None, None], pa.float64())})
@@ -874,7 +874,7 @@ def test_arrow_sparse_floats_date_range(version_store_factory, dynamic_schema, d
@pytest.mark.parametrize("row_range_width", list(range(0, 15)))
def test_arrow_sparse_floats_row_range(version_store_factory, dynamic_schema, row_range_start, row_range_width):
lib = version_store_factory(segment_row_size=5, dynamic_schema=dynamic_schema)
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
sym = "test_arrow_sparse_floats_row_range"
table_0 = pa.table({"col": pa.array([1, None, 2, None, 3], pa.float64())})
table_1 = pa.table({"col": pa.array([4, None, None, None, None], pa.float64())})
@@ -955,7 +955,7 @@ def test_arrow_dynamic_schema_filtered_column(lmdb_version_store_dynamic_schema_
def test_project_dynamic_schema(lmdb_version_store_dynamic_schema_arrow):
lib = lmdb_version_store_dynamic_schema_arrow
- lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW)
+ lib.set_output_format(OutputFormat.PYARROW)
sym = "sym"
table_1 = pa.table({"a": pa.array([1, 2])})
table_2 = pa.table({"a": pa.array([3, 4]), "b": pa.array([1, 2])})
@@ -1161,7 +1161,7 @@ def test_arrow_read_batch_with_strings(lmdb_version_store_arrow):
def test_polars_basic(lmdb_version_store_arrow):
lib = lmdb_version_store_arrow
- lib.set_output_format(OutputFormat.EXPERIMENTAL_POLARS)
+ lib.set_output_format(OutputFormat.POLARS)
sym = "polars"
df = pd.DataFrame(
{
@@ -1182,3 +1182,17 @@ def test_polars_basic(lmdb_version_store_arrow):
assert result.columns == ["__index__", "col_int", "col_float", "col_bool", "col_str", "col_cat"]
assert result.dtypes == [pl.Datetime("ns"), pl.Int64, pl.Float32, pl.Boolean, pl.String, pl.Categorical]
assert result.equals(expected)
+
+
+def test_polars_unsupported_small_string(lmdb_version_store_arrow):
+ lib = lmdb_version_store_arrow
+ lib.set_output_format(OutputFormat.POLARS)
+ sym = "polars"
+ df = pd.DataFrame({"col": ["a", "bb"]})
+ lib.write(sym, df)
+
+ with pytest.raises(ValueError):
+ lib.read(sym, arrow_string_format_default=pa.string())
+
+ with pytest.raises(ValueError):
+ lib.read(sym, arrow_string_format_per_column={"col": ArrowOutputStringFormat.SMALL_STRING}).data
diff --git a/python/tests/unit/arcticdb/version_store/test_arrow_writes.py b/python/tests/unit/arcticdb/version_store/test_arrow_writes.py
index c12babe4c0..36232884b6 100644
--- a/python/tests/unit/arcticdb/version_store/test_arrow_writes.py
+++ b/python/tests/unit/arcticdb/version_store/test_arrow_writes.py
@@ -168,7 +168,7 @@ def test_write_multiple_record_batches_indexed(lmdb_version_store_arrow):
@pytest.mark.parametrize("num_cols", [1, 2, 3, 4, 5])
def test_write_sliced(lmdb_version_store_tiny_segment, num_rows, num_cols):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
sym = "test_write_sliced"
table = pa.table(
@@ -200,7 +200,7 @@ def test_write_sliced(lmdb_version_store_tiny_segment, num_rows, num_cols):
)
def test_write_bools_sliced(lmdb_version_store_tiny_segment, data):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
sym = "test_write_bools_sliced"
table = pa.table({"col": pa.array(data)})
@@ -213,7 +213,7 @@ def test_write_bools_sliced(lmdb_version_store_tiny_segment, data):
@pytest.mark.parametrize("num_cols", [1, 2, 3, 4, 5])
def test_write_sliced_with_index(lmdb_version_store_tiny_segment, num_rows, num_cols):
lib = lmdb_version_store_tiny_segment
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
lib_tool = lib.library_tool()
sym = "test_write_sliced_with_index"
@@ -253,7 +253,7 @@ def test_write_sliced_with_index(lmdb_version_store_tiny_segment, num_rows, num_
def test_many_record_batches_many_slices(version_store_factory, rows_per_slice):
rng = np.random.default_rng()
lib = version_store_factory(segment_row_size=rows_per_slice)
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
sym = "test_many_record_batches_many_slices"
record_batch_sizes = [1, 2, 1, 15, 13, 2, 1, 10]
@@ -309,7 +309,7 @@ def test_write_view_strings(lmdb_version_store_arrow, type):
def test_write_owned_and_non_owned_buffers(lmdb_version_store_tiny_segment):
# This test is about our ChunkedBuffer holding mixes of owned and non-owned blocks, not Arrow views
lib = lmdb_version_store_tiny_segment
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
sym = "test_write_owned_and_non_owned_buffers"
rb0 = pa.RecordBatch.from_arrays([pa.array([0, 1, 2], pa.int32())], names=["col"])
@@ -628,7 +628,7 @@ def test_update_with_date_range_narrower_than_data(lmdb_version_store_arrow, dat
def test_staging_without_sorting(version_store_factory, method):
lib = version_store_factory(segment_row_size=2, dynamic_schema=True)
lib_tool = lib.library_tool()
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
sym = "test_staging_without_sorting"
table_0 = pa.table(
@@ -672,7 +672,7 @@ def test_staging_without_sorting(version_store_factory, method):
def test_staging_with_sorting(version_store_factory):
lib = version_store_factory(segment_row_size=2, dynamic_schema=True)
lib_tool = lib.library_tool()
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
sym = "test_staging_with_sorting"
table_0 = pa.table(
@@ -706,7 +706,7 @@ def test_staging_with_sorting(version_store_factory):
def test_staging_with_sorting_strings(version_store_factory):
lib = version_store_factory(segment_row_size=2, dynamic_schema=True)
lib_tool = lib.library_tool()
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
sym = "test_staging_with_sorting_strings"
table_0 = pa.table(
@@ -865,7 +865,7 @@ def test_arrow_writes_hypothesis(
cols_per_slice = num_supported_types // max_col_slices
lib.lib_cfg().lib_desc.version.write_options.segment_row_size = rows_per_slice
lib.lib_cfg().lib_desc.version.write_options.column_group_size = cols_per_slice
- lib.set_output_format("experimental_arrow")
+ lib.set_output_format("pyarrow")
lib._set_allow_arrow_input()
naughty_strings = read_big_list_of_naughty_strings()
data = {}