diff --git a/docs/mkdocs/docs/notebooks/ArcticDB_demo_read_as_arrow.ipynb b/docs/mkdocs/docs/notebooks/ArcticDB_demo_read_as_arrow.ipynb new file mode 100644 index 0000000000..726df25d7d --- /dev/null +++ b/docs/mkdocs/docs/notebooks/ArcticDB_demo_read_as_arrow.ipynb @@ -0,0 +1,2321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d6c62b62-3509-4c2b-9d6f-bc3dc0187c72", + "metadata": {}, + "source": [ + "# ArcticDB Read as Arrow demo" + ] + }, + { + "cell_type": "markdown", + "id": "6a677f71-55d8-446d-a372-8b5bb0bc7e02", + "metadata": {}, + "source": [ + "This notebook demonstrates ArcticDB's Arrow-based output formats: **PyArrow** and **Polars**. These formats provide performance improvements for string-heavy workloads, and allow better integration with modern Arrow-based table processing libraries like `polars` and `duckdb`.\n", + "\n", + "**Key benefits:**\n", + "- **Better performance**: Particularly for string columns (no GIL required)\n", + "- **Zero-copy integration with Arrow**: Zero-copy passing of dataframes from `arcticdb` to third-party libraries like `duckdb`\n", + "\n", + "**Notebook structure:**\n", + "1. Setup and basic usage\n", + "2. Polars output format\n", + "3. PyArrow output format and record batch structure\n", + "4. Configurable string formats\n", + "5. Pandas interoperability and column naming\n", + "6. 
Performance benchmarks" + ] + }, + { + "cell_type": "markdown", + "id": "ec71d190-49c7-4d24-bea9-c27b72f5a2b6", + "metadata": {}, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9b93432d-a9f5-4e32-8b39-be7cdcb382eb", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", + "import pyarrow as pa\n", + "from arcticdb import Arctic, LibraryOptions, OutputFormat, ArrowOutputStringFormat, where" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8df3c551-cbaf-4401-b9e7-427b25762a5f", + "metadata": {}, + "outputs": [], + "source": [ + "ac = Arctic(\"lmdb://tmp/arrow_reads_demo\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "242dfd1e-da6c-4186-abf2-27c6ccac922f", + "metadata": {}, + "outputs": [], + "source": [ + "ac.delete_library(\"arrow\")\n", + "lib = ac.create_library(\"arrow\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "174660b5-0c46-4152-b5c3-5e49a6ec76d8", + "metadata": {}, + "outputs": [], + "source": [ + "sym = \"test\"" + ] + }, + { + "cell_type": "markdown", + "id": "bd6573bb-1bbf-42ac-9c0c-0c99fa6df328", + "metadata": {}, + "source": [ + "# Polars Output Format\n", + "\n", + "All ArcticDB operations returning dataframes (e.g. `read`, `head`, `tail`, `read_batch`, `read_batch_and_join`) accept an `output_format` which controls the format in which the data will be returned. 
The default output format is still `OutputFormat.PANDAS` which returns `pd.DataFrame`s as before.\n", + "\n", + "The Apache Arrow memory layout based output formats are `POLARS` which returns `polars.DataFrame` objects and `PYARROW` which returns `pyarrow.Table` objects.\n", + "\n", + "Let's see several examples of using `POLARS` output format" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6dd149ac-71c8-42a9-993e-10596975e161", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col_intcol_floatcol_str
0010.0value_0
1111.0value_1
2212.0value_2
3313.0value_3
4414.0value_4
5515.0value_5
6616.0value_6
7717.0value_7
8818.0value_8
9919.0value_9
\n", + "
" + ], + "text/plain": [ + " col_int col_float col_str\n", + "0 0 10.0 value_0\n", + "1 1 11.0 value_1\n", + "2 2 12.0 value_2\n", + "3 3 13.0 value_3\n", + "4 4 14.0 value_4\n", + "5 5 15.0 value_5\n", + "6 6 16.0 value_6\n", + "7 7 17.0 value_7\n", + "8 8 18.0 value_8\n", + "9 9 19.0 value_9" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## Basic Usage\n", + "\n", + "# Write some data with Pandas (writing Polars/Arrow directly is not yet supported)\n", + "df = pd.DataFrame({\n", + " \"col_int\": np.arange(10, dtype=np.int64),\n", + " \"col_float\": np.arange(10, 20, dtype=np.float64),\n", + " \"col_str\": [f\"value_{i}\" for i in range(10)]\n", + "})\n", + "lib.write(\"demo\", df)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "f0b8ef49-3b92-4805-872a-eb0875959f87", + "metadata": {}, + "source": [ + "Read back as a Polars DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "df1c3b81-9727-4a3e-8e76-31878c82a4a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type: \n", + "Dtypes: [Int64, Float64, String]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 3)
col_intcol_floatcol_str
i64f64str
010.0"value_0"
111.0"value_1"
212.0"value_2"
313.0"value_3"
414.0"value_4"
515.0"value_5"
616.0"value_6"
717.0"value_7"
818.0"value_8"
919.0"value_9"
" + ], + "text/plain": [ + "shape: (10, 3)\n", + "┌─────────┬───────────┬─────────┐\n", + "│ col_int ┆ col_float ┆ col_str │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ str │\n", + "╞═════════╪═══════════╪═════════╡\n", + "│ 0 ┆ 10.0 ┆ value_0 │\n", + "│ 1 ┆ 11.0 ┆ value_1 │\n", + "│ 2 ┆ 12.0 ┆ value_2 │\n", + "│ 3 ┆ 13.0 ┆ value_3 │\n", + "│ 4 ┆ 14.0 ┆ value_4 │\n", + "│ 5 ┆ 15.0 ┆ value_5 │\n", + "│ 6 ┆ 16.0 ┆ value_6 │\n", + "│ 7 ┆ 17.0 ┆ value_7 │\n", + "│ 8 ┆ 18.0 ┆ value_8 │\n", + "│ 9 ┆ 19.0 ┆ value_9 │\n", + "└─────────┴───────────┴─────────┘" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "polars_df = lib.read(\"demo\", output_format=OutputFormat.POLARS).data\n", + "print(f\"Type: {type(polars_df)}\")\n", + "print(f\"Dtypes: {polars_df.dtypes}\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "e8de754e-90c7-47bb-a12b-33495fad0d83", + "metadata": {}, + "source": [ + "### Index Handling\n", + "\n", + "Note that default `RangeIndex` is dropped (Polars has no concept of row indexes):" + ] + }, + { + "cell_type": "markdown", + "id": "8f7dce55-bb59-4830-a8b6-ade87d5ef755", + "metadata": {}, + "source": [ + "## Configuration Levels\n", + "\n", + "Output format can be set at three levels (Arctic instance, Library, or per-read):" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "960cc5b2-9f5c-49ba-9a40-be55c6ae0ca3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
col_intcol_floatcol_str
i64f64str
010.0"value_0"
111.0"value_1"
212.0"value_2"
313.0"value_3"
414.0"value_4"
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌─────────┬───────────┬─────────┐\n", + "│ col_int ┆ col_float ┆ col_str │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ str │\n", + "╞═════════╪═══════════╪═════════╡\n", + "│ 0 ┆ 10.0 ┆ value_0 │\n", + "│ 1 ┆ 11.0 ┆ value_1 │\n", + "│ 2 ┆ 12.0 ┆ value_2 │\n", + "│ 3 ┆ 13.0 ┆ value_3 │\n", + "│ 4 ┆ 14.0 ┆ value_4 │\n", + "└─────────┴───────────┴─────────┘" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Per-read (most granular)\n", + "polars_df = lib.read(\"demo\", output_format=OutputFormat.POLARS).data\n", + "\n", + "# Case-insensitive strings also work\n", + "polars_df = lib.head(\"demo\", output_format=\"polars\").data\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "9faf94c3-bde3-440b-9bfb-799e9f0602b8", + "metadata": {}, + "source": [ + "Set at library level:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e4c1a7cf-f223-4273-84f2-9d3c8dcce21a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
col_intcol_floatcol_str
i64f64str
010.0"value_0"
111.0"value_1"
212.0"value_2"
313.0"value_3"
414.0"value_4"
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌─────────┬───────────┬─────────┐\n", + "│ col_int ┆ col_float ┆ col_str │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ str │\n", + "╞═════════╪═══════════╪═════════╡\n", + "│ 0 ┆ 10.0 ┆ value_0 │\n", + "│ 1 ┆ 11.0 ┆ value_1 │\n", + "│ 2 ┆ 12.0 ┆ value_2 │\n", + "│ 3 ┆ 13.0 ┆ value_3 │\n", + "│ 4 ┆ 14.0 ┆ value_4 │\n", + "└─────────┴───────────┴─────────┘" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lib_polars = ac.get_library(\"arrow\", output_format=OutputFormat.POLARS)\n", + "# Now all reads default to Polars\n", + "lib_polars.head(\"demo\").data" + ] + }, + { + "cell_type": "markdown", + "id": "5bdcc95b-5dee-4bc6-9ba3-026ef02a852a", + "metadata": {}, + "source": [ + "### Or on the entire `Arctic` instance, so all libraries fetched from this instance use Arrow as the default return type" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bcb30551-b293-4d76-be14-24c9147f35f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
col_intcol_floatcol_str
i64f64str
010.0"value_0"
111.0"value_1"
212.0"value_2"
313.0"value_3"
414.0"value_4"
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌─────────┬───────────┬─────────┐\n", + "│ col_int ┆ col_float ┆ col_str │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ str │\n", + "╞═════════╪═══════════╪═════════╡\n", + "│ 0 ┆ 10.0 ┆ value_0 │\n", + "│ 1 ┆ 11.0 ┆ value_1 │\n", + "│ 2 ┆ 12.0 ┆ value_2 │\n", + "│ 3 ┆ 13.0 ┆ value_3 │\n", + "│ 4 ┆ 14.0 ┆ value_4 │\n", + "└─────────┴───────────┴─────────┘" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "polars_ac = Arctic(\"lmdb://tmp/arrow_reads_demo_2\", output_format=OutputFormat.POLARS)\n", + "\n", + "# Now all libraries will have a default polars output format\n", + "polars_ac.delete_library(\"arrow\")\n", + "lib = polars_ac.create_library(\"arrow\")\n", + "lib.write(\"demo\", df)\n", + "lib.head(\"demo\").data" + ] + }, + { + "cell_type": "markdown", + "id": "5e5d9e23-1827-4010-94fa-b5464447eefb", + "metadata": {}, + "source": [ + "You can always override back to Pandas for individual reads:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e99eebb4-d9ae-4872-a161-abff22be81f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.frame.DataFrame" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pandas_df = lib.read(\"demo\", output_format=OutputFormat.PANDAS).data\n", + "type(pandas_df)" + ] + }, + { + "cell_type": "markdown", + "id": "8b7b21b0-17ac-41ab-b22e-dcc7bec13eda", + "metadata": {}, + "source": [ + "## Working with timeseries indices\n", + "\n", + "Timeseries indices appear as the first column:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4f714e04-f40e-4035-b8f8-4ef1602aae0b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col_intcol_float
timestamp
2025-01-01010.0
2025-01-02111.0
2025-01-03212.0
2025-01-04313.0
2025-01-05414.0
2025-01-06515.0
2025-01-07616.0
2025-01-08717.0
2025-01-09818.0
2025-01-10919.0
\n", + "
" + ], + "text/plain": [ + " col_int col_float\n", + "timestamp \n", + "2025-01-01 0 10.0\n", + "2025-01-02 1 11.0\n", + "2025-01-03 2 12.0\n", + "2025-01-04 3 13.0\n", + "2025-01-05 4 14.0\n", + "2025-01-06 5 15.0\n", + "2025-01-07 6 16.0\n", + "2025-01-08 7 17.0\n", + "2025-01-09 8 18.0\n", + "2025-01-10 9 19.0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({\n", + " \"col_int\": np.arange(10, dtype=np.int64),\n", + " \"col_float\": np.arange(10, 20, dtype=np.float64)\n", + "}, index=pd.date_range(\"2025-01-01\", periods=10))\n", + "df.index.name = \"timestamp\"\n", + "lib.write(\"timeseries\", df)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a04c8ea1-0a44-44c6-bc27-32ae8b3d5872", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Columns: ['timestamp', 'col_int', 'col_float']\n", + "Dtypes: [Datetime(time_unit='ns', time_zone=None), Int64, Float64]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 3)
timestampcol_intcol_float
datetime[ns]i64f64
2025-01-01 00:00:00010.0
2025-01-02 00:00:00111.0
2025-01-03 00:00:00212.0
2025-01-04 00:00:00313.0
2025-01-05 00:00:00414.0
2025-01-06 00:00:00515.0
2025-01-07 00:00:00616.0
2025-01-08 00:00:00717.0
2025-01-09 00:00:00818.0
2025-01-10 00:00:00919.0
" + ], + "text/plain": [ + "shape: (10, 3)\n", + "┌─────────────────────┬─────────┬───────────┐\n", + "│ timestamp ┆ col_int ┆ col_float │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ i64 ┆ f64 │\n", + "╞═════════════════════╪═════════╪═══════════╡\n", + "│ 2025-01-01 00:00:00 ┆ 0 ┆ 10.0 │\n", + "│ 2025-01-02 00:00:00 ┆ 1 ┆ 11.0 │\n", + "│ 2025-01-03 00:00:00 ┆ 2 ┆ 12.0 │\n", + "│ 2025-01-04 00:00:00 ┆ 3 ┆ 13.0 │\n", + "│ 2025-01-05 00:00:00 ┆ 4 ┆ 14.0 │\n", + "│ 2025-01-06 00:00:00 ┆ 5 ┆ 15.0 │\n", + "│ 2025-01-07 00:00:00 ┆ 6 ┆ 16.0 │\n", + "│ 2025-01-08 00:00:00 ┆ 7 ┆ 17.0 │\n", + "│ 2025-01-09 00:00:00 ┆ 8 ┆ 18.0 │\n", + "│ 2025-01-10 00:00:00 ┆ 9 ┆ 19.0 │\n", + "└─────────────────────┴─────────┴───────────┘" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "polars_df = lib.read(\"timeseries\").data\n", + "print(f\"Columns: {polars_df.columns}\")\n", + "print(f\"Dtypes: {polars_df.dtypes}\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "5ede9724-bcd1-4502-8880-f2a7c9f1159f", + "metadata": {}, + "source": [ + "Timezones are preserved:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "cfc90885-1ae3-4df6-8c0c-d8a90e1ada0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
timestamp
2025-01-01 00:00:00-05:000
2025-01-02 00:00:00-05:001
2025-01-03 00:00:00-05:002
2025-01-04 00:00:00-05:003
2025-01-05 00:00:00-05:004
\n", + "
" + ], + "text/plain": [ + " value\n", + "timestamp \n", + "2025-01-01 00:00:00-05:00 0\n", + "2025-01-02 00:00:00-05:00 1\n", + "2025-01-03 00:00:00-05:00 2\n", + "2025-01-04 00:00:00-05:00 3\n", + "2025-01-05 00:00:00-05:00 4" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_tz = pd.DataFrame({\n", + " \"value\": np.arange(5)\n", + "}, index=pd.date_range(\"2025-01-01\", periods=5, tz=\"America/New_York\"))\n", + "df_tz.index.name = \"timestamp\"\n", + "lib.write(\"tz_data\", df_tz)\n", + "df_tz" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "bfc4d467-8c42-4a60-b84b-b57a8a4ace1b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timezone preserved: Datetime(time_unit='ns', time_zone='America/New_York')\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
timestampvalue
datetime[ns, America/New_York]i64
2025-01-01 00:00:00 EST0
2025-01-02 00:00:00 EST1
2025-01-03 00:00:00 EST2
2025-01-04 00:00:00 EST3
2025-01-05 00:00:00 EST4
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌────────────────────────────────┬───────┐\n", + "│ timestamp ┆ value │\n", + "│ --- ┆ --- │\n", + "│ datetime[ns, America/New_York] ┆ i64 │\n", + "╞════════════════════════════════╪═══════╡\n", + "│ 2025-01-01 00:00:00 EST ┆ 0 │\n", + "│ 2025-01-02 00:00:00 EST ┆ 1 │\n", + "│ 2025-01-03 00:00:00 EST ┆ 2 │\n", + "│ 2025-01-04 00:00:00 EST ┆ 3 │\n", + "│ 2025-01-05 00:00:00 EST ┆ 4 │\n", + "└────────────────────────────────┴───────┘" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "polars_df = lib.read(\"tz_data\").data\n", + "print(f\"Timezone preserved: {polars_df['timestamp'].dtype}\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "9c21d95e-d57e-4e1c-8b67-fb7060a5f39b", + "metadata": {}, + "source": [ + "## Filtering and Column Selection\n", + "\n", + "Standard ArcticDB operations work with Arrow output formats:\n", + "\n", + "*Note that index column is fetched even if not specified in the `columns` list.*" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e978dfd1-9164-4875-9c11-6b5d258fa81c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (4, 2)
timestampcol_int
datetime[ns]i64
2025-01-03 00:00:002
2025-01-04 00:00:003
2025-01-05 00:00:004
2025-01-06 00:00:005
" + ], + "text/plain": [ + "shape: (4, 2)\n", + "┌─────────────────────┬─────────┐\n", + "│ timestamp ┆ col_int │\n", + "│ --- ┆ --- │\n", + "│ datetime[ns] ┆ i64 │\n", + "╞═════════════════════╪═════════╡\n", + "│ 2025-01-03 00:00:00 ┆ 2 │\n", + "│ 2025-01-04 00:00:00 ┆ 3 │\n", + "│ 2025-01-05 00:00:00 ┆ 4 │\n", + "│ 2025-01-06 00:00:00 ┆ 5 │\n", + "└─────────────────────┴─────────┘" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "polars_df = lib.read(\n", + " \"timeseries\",\n", + " date_range=(pd.Timestamp(\"2025-01-03\"), pd.Timestamp(\"2025-01-06\")),\n", + " columns=[\"col_int\"]\n", + ").data\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "5c15e500", + "metadata": {}, + "source": [ + "# PyArrow Output Format\n", + "\n", + "The PyArrow output format returns `pyarrow.Table` objects. While we recommend Polars for most arrow use cases, PyArrow is useful when you want to integrate with other Arrow-based tools which expect a `pyarrow.Table`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "894b94d0", + "metadata": {}, + "source": [ + "## Basic PyArrow Usage" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "19f5934d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type: \n", + "Schema: col_int: int64\n", + "col_float: double\n", + "col_str: large_string\n", + "-- schema metadata --\n", + "pandas: '{\"index_columns\": [{\"name\": null, \"start\": 0, \"step\": 1, \"stop\":' + 500\n" + ] + }, + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "col_int: int64\n", + "col_float: double\n", + "col_str: large_string\n", + "----\n", + "col_int: [[0,1,2,3,4,5,6,7,8,9]]\n", + "col_float: [[10,11,12,13,14,15,16,17,18,19]]\n", + "col_str: [[\"value_0\",\"value_1\",\"value_2\",\"value_3\",\"value_4\",\"value_5\",\"value_6\",\"value_7\",\"value_8\",\"value_9\"]]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "arrow_table = lib.read(\"demo\", output_format=OutputFormat.PYARROW).data\n", + "print(f\"Type: {type(arrow_table)}\")\n", + "print(f\"Schema: {arrow_table.schema}\")\n", + "arrow_table" + ] + }, + { + "cell_type": "markdown", + "id": "c08b82c0", + "metadata": {}, + "source": [ + "## Zero-Copy Conversion Between Arrow based libraries\n", + "\n", + "Converting between PyArrow and Polars is zero-copy since they share the same memory layout:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "64d70f8f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Converted to Polars: \n", + "Converted back to Arrow: \n", + "Tables equal: True\n" + ] + } + ], + "source": [ + "# PyArrow to Polars (zero-copy)\n", + "polars_from_arrow = pl.from_arrow(arrow_table)\n", + "print(f\"Converted to Polars: {type(polars_from_arrow)}\")\n", + "\n", + "# Polars to PyArrow (zero-copy)\n", + "arrow_from_polars = 
polars_from_arrow.to_arrow()\n", + "print(f\"Converted back to Arrow: {type(arrow_from_polars)}\")\n", + "print(f\"Tables equal: {arrow_table.equals(arrow_from_polars)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7a9e1551", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result type: \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (4, 4)
timestampcol_intcol_floatproduct
datetime[ns]i64f64f64
2025-01-07 00:00:00616.096.0
2025-01-08 00:00:00717.0119.0
2025-01-09 00:00:00818.0144.0
2025-01-10 00:00:00919.0171.0
" + ], + "text/plain": [ + "shape: (4, 4)\n", + "┌─────────────────────┬─────────┬───────────┬─────────┐\n", + "│ timestamp ┆ col_int ┆ col_float ┆ product │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ i64 ┆ f64 ┆ f64 │\n", + "╞═════════════════════╪═════════╪═══════════╪═════════╡\n", + "│ 2025-01-07 00:00:00 ┆ 6 ┆ 16.0 ┆ 96.0 │\n", + "│ 2025-01-08 00:00:00 ┆ 7 ┆ 17.0 ┆ 119.0 │\n", + "│ 2025-01-09 00:00:00 ┆ 8 ┆ 18.0 ┆ 144.0 │\n", + "│ 2025-01-10 00:00:00 ┆ 9 ┆ 19.0 ┆ 171.0 │\n", + "└─────────────────────┴─────────┴───────────┴─────────┘" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import duckdb\n", + "\n", + "# Load data from ArcticDB as PyArrow table\n", + "arrow_table = lib.read(\"timeseries\", output_format=OutputFormat.PYARROW).data\n", + "\n", + "# Query directly with DuckDB (zero-copy)\n", + "result = duckdb.query(\"\"\"\n", + " SELECT \n", + " timestamp,\n", + " col_int,\n", + " col_float,\n", + " col_int * col_float AS product\n", + " FROM arrow_table\n", + " WHERE col_int > 5\n", + " ORDER BY timestamp\n", + "\"\"\").to_arrow_table()\n", + "\n", + "print(f\"Result type: {type(result)}\")\n", + "# Converting to polars for easier viewing\n", + "pl.from_arrow(result)" + ] + }, + { + "cell_type": "markdown", + "id": "ab9bc734-89ed-4683-b9e5-6ce0d5583dfd", + "metadata": {}, + "source": [ + "## Record Batch Structure\n", + "\n", + "Large writes, appends, and updates result in Arrow based output formats composed of multiple record batches. Each record batch corresponds to a row-slice in ArcticDB's storage.\n", + "This applies to both Pyarrow and Polars. With pyarrow we can see the record batch separation directly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "7e57da3b-9156-4433-9188-80a85ae07955", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of record batches: 2\n", + "Table structure: pyarrow.Table\n", + "col1: int64\n", + "----\n", + "col1: [[0,1],[2,3]]\n" + ] + }, + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "col1: int64\n", + "----\n", + "col1: [[0,1],[2,3]]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_0 = pd.DataFrame({\"col1\": np.arange(2, dtype=np.int64)})\n", + "df_1 = pd.DataFrame({\"col1\": np.arange(2, 4, dtype=np.int64)})\n", + "lib.write(\"multi_batch\", df_0)\n", + "lib.append(\"multi_batch\", df_1)\n", + "\n", + "arrow_table = lib.read(\"multi_batch\", output_format=OutputFormat.PYARROW).data\n", + "print(f\"Number of record batches: {len(arrow_table.to_batches())}\")\n", + "print(f\"Table structure: {arrow_table}\")\n", + "arrow_table" + ] + }, + { + "cell_type": "markdown", + "id": "5c2e0cce-9c0a-4bc9-b9f8-0f2e56c2ab56", + "metadata": {}, + "source": [ + "You can combine chunks into a single contiguous table (involves memory allocation and copying):" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5ca9a985-e9fd-4b23-a330-4ec2d41e7c22", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Combined equals original: True\n" + ] + }, + { + "data": { + "text/plain": [ + "pyarrow.Table\n", + "col1: int64\n", + "----\n", + "col1: [[0,1,2,3]]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "contiguous_table = arrow_table.combine_chunks()\n", + "print(f\"Combined equals original: {contiguous_table.equals(arrow_table)}\")\n", + "contiguous_table" + ] + }, + { + "cell_type": "markdown", + "id": "6886794a", + "metadata": {}, + "source": [ + "Chunks can also be combined with 
polars output format with `rechunk`.\n", + "\n", + "After rechunking, some computations on the polars frame will be faster." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "bcaf366c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of chunks in Polars DataFrame: 2\n", + "Number of chunks after rechunking: 1\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (4, 1)
col1
i64
0
1
2
3
" + ], + "text/plain": [ + "shape: (4, 1)\n", + "┌──────┐\n", + "│ col1 │\n", + "│ --- │\n", + "│ i64 │\n", + "╞══════╡\n", + "│ 0 │\n", + "│ 1 │\n", + "│ 2 │\n", + "│ 3 │\n", + "└──────┘" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "polars_df = lib.read(\"multi_batch\", output_format=OutputFormat.POLARS).data\n", + "print(\"Number of chunks in Polars DataFrame:\", polars_df.n_chunks())\n", + "polars_df = polars_df.rechunk()\n", + "print(\"Number of chunks after rechunking:\", polars_df.n_chunks())\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "36424fda", + "metadata": {}, + "source": [ + "# Configurable String Formats\n", + "\n", + "Arrow-based output formats support configurable string encoding to optimize for your data characteristics. Three formats are available: `LARGE_STRING` (default), `SMALL_STRING` (only for PyArrow), and `CATEGORICAL`." + ] + }, + { + "cell_type": "markdown", + "id": "f886c2d3", + "metadata": {}, + "source": [ + "## String Format Options\n", + "\n", + "- **`LARGE_STRING`** (default): 64-bit variable-size encoding, best for general use\n", + " - PyArrow: `pa.large_string()`, Polars: `pl.String`\n", + " \n", + "- **`SMALL_STRING`**: 32-bit variable-size encoding, slightly more memory efficient for smaller data\n", + " - PyArrow: `pa.string()`, Polars: Not supported\n", + " - Polars can only use large_strings or categoricals\n", + " \n", + "- **`CATEGORICAL`**: Dictionary-encoded, best for low cardinality (few unique values)\n", + " - PyArrow: `pa.dictionary(pa.int32(), pa.large_string())`, Polars: `pl.Categorical`" + ] + }, + { + "cell_type": "markdown", + "id": "5695dd5c", + "metadata": {}, + "source": [ + "## Example: Default LARGE_STRING Format" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "dbf7608d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dtypes: [String, String]\n" + 
] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 2)
categorydescription
strstr
"A""Long description text 0"
"B""Long description text 1"
"A""Long description text 2"
"B""Long description text 3"
"A""Long description text 4"
"A""Long description text 5"
"B""Long description text 6"
"A""Long description text 7"
"B""Long description text 8"
"A""Long description text 9"
" + ], + "text/plain": [ + "shape: (10, 2)\n", + "┌──────────┬─────────────────────────┐\n", + "│ category ┆ description │\n", + "│ --- ┆ --- │\n", + "│ str ┆ str │\n", + "╞══════════╪═════════════════════════╡\n", + "│ A ┆ Long description text 0 │\n", + "│ B ┆ Long description text 1 │\n", + "│ A ┆ Long description text 2 │\n", + "│ B ┆ Long description text 3 │\n", + "│ A ┆ Long description text 4 │\n", + "│ A ┆ Long description text 5 │\n", + "│ B ┆ Long description text 6 │\n", + "│ A ┆ Long description text 7 │\n", + "│ B ┆ Long description text 8 │\n", + "│ A ┆ Long description text 9 │\n", + "└──────────┴─────────────────────────┘" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_strings = pd.DataFrame({\n", + " \"category\": [\"A\", \"B\", \"A\", \"B\", \"A\"] * 2, # Low cardinality\n", + " \"description\": [f\"Long description text {i}\" for i in range(10)] # High cardinality\n", + "})\n", + "lib.write(\"strings\", df_strings)\n", + "\n", + "# Default behavior (LARGE_STRING)\n", + "polars_df = lib.read(\"strings\", output_format=OutputFormat.POLARS).data\n", + "print(f\"Dtypes: {polars_df.dtypes}\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "44fae83a", + "metadata": {}, + "source": [ + "## Example: CATEGORICAL Format for Low Cardinality\n", + "\n", + "Use `CATEGORICAL` when you have few unique values repeated many times:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c8015891", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dtypes: [Categorical, Categorical]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 2)
categorydescription
catcat
"A""Long description text 0"
"B""Long description text 1"
"A""Long description text 2"
"B""Long description text 3"
"A""Long description text 4"
"A""Long description text 5"
"B""Long description text 6"
"A""Long description text 7"
"B""Long description text 8"
"A""Long description text 9"
" + ], + "text/plain": [ + "shape: (10, 2)\n", + "┌──────────┬─────────────────────────┐\n", + "│ category ┆ description │\n", + "│ --- ┆ --- │\n", + "│ cat ┆ cat │\n", + "╞══════════╪═════════════════════════╡\n", + "│ A ┆ Long description text 0 │\n", + "│ B ┆ Long description text 1 │\n", + "│ A ┆ Long description text 2 │\n", + "│ B ┆ Long description text 3 │\n", + "│ A ┆ Long description text 4 │\n", + "│ A ┆ Long description text 5 │\n", + "│ B ┆ Long description text 6 │\n", + "│ A ┆ Long description text 7 │\n", + "│ B ┆ Long description text 8 │\n", + "│ A ┆ Long description text 9 │\n", + "└──────────┴─────────────────────────┘" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set categorical for all string columns\n", + "polars_df_cat = lib.read(\n", + " \"strings\",\n", + " output_format=OutputFormat.POLARS,\n", + " arrow_string_format_default=ArrowOutputStringFormat.CATEGORICAL\n", + ").data\n", + "print(f\"Dtypes: {polars_df_cat.dtypes}\")\n", + "polars_df_cat" + ] + }, + { + "cell_type": "markdown", + "id": "4663a9e3", + "metadata": {}, + "source": [ + "## Per-Column String Format Configuration\n", + "\n", + "Optimize each column individually:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "ab0633b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dtypes: [Categorical, String]\n", + "Category is Categorical: True\n", + "Description is String: True\n" + ] + } + ], + "source": [ + "# Categorical for low cardinality, LARGE_STRING for high cardinality\n", + "polars_df_mixed = lib.read(\n", + " \"strings\",\n", + " output_format=OutputFormat.POLARS,\n", + " arrow_string_format_per_column={\n", + " \"category\": ArrowOutputStringFormat.CATEGORICAL,\n", + " \"description\": ArrowOutputStringFormat.LARGE_STRING\n", + " }\n", + ").data\n", + "print(f\"Dtypes: {polars_df_mixed.dtypes}\")\n", + "print(f\"Category is 
Categorical: {polars_df_mixed['category'].dtype == pl.Categorical}\")\n", + "print(f\"Description is String: {polars_df_mixed['description'].dtype == pl.String}\")" + ] + }, + { + "cell_type": "markdown", + "id": "eef6090c", + "metadata": {}, + "source": [ + "# Pandas Interoperability and Column Naming\n", + "\n", + "Since ArcticDB currently only supports writing Pandas DataFrames, understanding how Pandas metadata translates to Arrow/Polars is important. ArcticDB attaches normalization metadata to enable seamless round-trip conversion." + ] + }, + { + "cell_type": "markdown", + "id": "e1178eae", + "metadata": {}, + "source": [ + "## Index Column Naming: `__index__`\n", + "\n", + "When Pandas DataFrames have unnamed indexes, PyArrow/Polars output formats use the special column name `__index__`:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fde6f9fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Columns: ['__index__', 'value']\n", + "First column name: '__index__'\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 2)
__index__value
datetime[ns]i64
2025-01-01 00:00:001
2025-01-02 00:00:002
2025-01-03 00:00:003
" + ], + "text/plain": [ + "shape: (3, 2)\n", + "┌─────────────────────┬───────┐\n", + "│ __index__ ┆ value │\n", + "│ --- ┆ --- │\n", + "│ datetime[ns] ┆ i64 │\n", + "╞═════════════════════╪═══════╡\n", + "│ 2025-01-01 00:00:00 ┆ 1 │\n", + "│ 2025-01-02 00:00:00 ┆ 2 │\n", + "│ 2025-01-03 00:00:00 ┆ 3 │\n", + "└─────────────────────┴───────┘" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# DataFrame with unnamed DatetimeIndex\n", + "df_unnamed_idx = pd.DataFrame(\n", + " {\"value\": [1, 2, 3]},\n", + " index=pd.date_range(\"2025-01-01\", periods=3)\n", + ")\n", + "lib.write(\"unnamed_idx\", df_unnamed_idx)\n", + "\n", + "polars_df = lib.read(\"unnamed_idx\", output_format=OutputFormat.POLARS).data\n", + "print(f\"Columns: {polars_df.columns}\")\n", + "print(f\"First column name: '{polars_df.columns[0]}'\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "201076be", + "metadata": {}, + "source": [ + "## Named Indexes Preserved\n", + "\n", + "Named indexes retain their names as column names:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d01b5e8f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First column: 'timestamp' (the named index)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (3, 3)
timestampcol_intcol_float
datetime[ns]i64f64
2025-01-01 00:00:00010.0
2025-01-02 00:00:00111.0
2025-01-03 00:00:00212.0
" + ], + "text/plain": [ + "shape: (3, 3)\n", + "┌─────────────────────┬─────────┬───────────┐\n", + "│ timestamp ┆ col_int ┆ col_float │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ datetime[ns] ┆ i64 ┆ f64 │\n", + "╞═════════════════════╪═════════╪═══════════╡\n", + "│ 2025-01-01 00:00:00 ┆ 0 ┆ 10.0 │\n", + "│ 2025-01-02 00:00:00 ┆ 1 ┆ 11.0 │\n", + "│ 2025-01-03 00:00:00 ┆ 2 ┆ 12.0 │\n", + "└─────────────────────┴─────────┴───────────┘" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We already wrote \"timeseries\" with a named index \"timestamp\"\n", + "polars_df = lib.read(\"timeseries\", output_format=OutputFormat.POLARS).data\n", + "print(f\"First column: '{polars_df.columns[0]}' (the named index)\")\n", + "polars_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "7a6aa61d", + "metadata": {}, + "source": [ + "## MultiIndex Handling\n", + "\n", + "MultiIndex levels become separate columns:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "b4d9c5db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Columns: ['date', 'ticker', 'price']\n", + "MultiIndex levels became regular columns\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (4, 3)
datetickerprice
strstri64
"2025-01-01""AAPL"100
"2025-01-01""GOOGL"101
"2025-01-02""AAPL"102
"2025-01-02""GOOGL"103
" + ], + "text/plain": [ + "shape: (4, 3)\n", + "┌────────────┬────────┬───────┐\n", + "│ date ┆ ticker ┆ price │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ i64 │\n", + "╞════════════╪════════╪═══════╡\n", + "│ 2025-01-01 ┆ AAPL ┆ 100 │\n", + "│ 2025-01-01 ┆ GOOGL ┆ 101 │\n", + "│ 2025-01-02 ┆ AAPL ┆ 102 │\n", + "│ 2025-01-02 ┆ GOOGL ┆ 103 │\n", + "└────────────┴────────┴───────┘" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_multi = pd.DataFrame({\n", + " \"price\": [100, 101, 102, 103]\n", + "}, index=pd.MultiIndex.from_product([[\"2025-01-01\", \"2025-01-02\"], [\"AAPL\", \"GOOGL\"]], names=[\"date\", \"ticker\"]))\n", + "lib.write(\"multiindex\", df_multi)\n", + "\n", + "polars_df = lib.read(\"multiindex\", output_format=OutputFormat.POLARS).data\n", + "print(f\"Columns: {polars_df.columns}\")\n", + "print(\"MultiIndex levels became regular columns\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "53a8dfa3", + "metadata": {}, + "source": [ + "Unnamed MultiIndex columns get displayed as `\"__index_level_0__\"`, `\"__index_level_1__\"`, etc." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d48ff8df", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Columns: ['__index_level_0__', '__index_level_1__', 'price']\n", + "MultiIndex levels became regular columns and use special names when unnamed\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (4, 3)
__index_level_0____index_level_1__price
strstri64
"2025-01-01""AAPL"100
"2025-01-01""GOOGL"101
"2025-01-02""AAPL"102
"2025-01-02""GOOGL"103
" + ], + "text/plain": [ + "shape: (4, 3)\n", + "┌───────────────────┬───────────────────┬───────┐\n", + "│ __index_level_0__ ┆ __index_level_1__ ┆ price │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ i64 │\n", + "╞═══════════════════╪═══════════════════╪═══════╡\n", + "│ 2025-01-01 ┆ AAPL ┆ 100 │\n", + "│ 2025-01-01 ┆ GOOGL ┆ 101 │\n", + "│ 2025-01-02 ┆ AAPL ┆ 102 │\n", + "│ 2025-01-02 ┆ GOOGL ┆ 103 │\n", + "└───────────────────┴───────────────────┴───────┘" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_multi = pd.DataFrame({\n", + " \"price\": [100, 101, 102, 103]\n", + "}, index=pd.MultiIndex.from_product([[\"2025-01-01\", \"2025-01-02\"], [\"AAPL\", \"GOOGL\"]]))\n", + "lib.write(\"multiindex\", df_multi)\n", + "\n", + "polars_df = lib.read(\"multiindex\", output_format=OutputFormat.POLARS).data\n", + "print(f\"Columns: {polars_df.columns}\")\n", + "print(\"MultiIndex levels became regular columns and use special names when unnamed\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "9157ac73", + "metadata": {}, + "source": [ + "## Round-Trip Conversion\n", + "\n", + "Converting to PyArrow and back preserves Pandas metadata:\n", + "\n", + "*Note that Polars does not have a concept of Pandas metadata and can't be round tripped to pandas without loosing index metadata.*" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "6552652d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index restored: \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col1
00
11
22
33
44
\n", + "
" + ], + "text/plain": [ + " col1\n", + "0 0\n", + "1 1\n", + "2 2\n", + "3 3\n", + "4 4" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Default RangeIndex is restored\n", + "df_simple = pd.DataFrame({\"col1\": np.arange(5)})\n", + "lib.write(\"simple_pandas\", df_simple)\n", + "\n", + "arrow_table = lib.read(\"simple_pandas\", output_format=OutputFormat.PYARROW).data\n", + "pandas_restored = arrow_table.to_pandas()\n", + "print(f\"Index restored: {type(pandas_restored.index)}\")\n", + "pandas_restored" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "af571178", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MultiIndex restored: \n", + "Index names: [None, None]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
2025-01-01AAPL100
GOOGL101
2025-01-02AAPL102
GOOGL103
\n", + "
" + ], + "text/plain": [ + " price\n", + "2025-01-01 AAPL 100\n", + " GOOGL 101\n", + "2025-01-02 AAPL 102\n", + " GOOGL 103" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# MultiIndex is also restored correctly\n", + "arrow_table = lib.read(\"multiindex\", output_format=OutputFormat.PYARROW).data\n", + "pandas_restored = arrow_table.to_pandas()\n", + "print(f\"MultiIndex restored: {type(pandas_restored.index)}\")\n", + "print(f\"Index names: {pandas_restored.index.names}\")\n", + "pandas_restored" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "1b08909f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type: \n", + "Note: Index information is lost when converting from Polars to Pandas\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampcol_intcol_float
02025-01-01010.0
12025-01-02111.0
22025-01-03212.0
32025-01-04313.0
42025-01-05414.0
\n", + "
" + ], + "text/plain": [ + " timestamp col_int col_float\n", + "0 2025-01-01 0 10.0\n", + "1 2025-01-02 1 11.0\n", + "2 2025-01-03 2 12.0\n", + "3 2025-01-04 3 13.0\n", + "4 2025-01-05 4 14.0" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "polars_df = lib.read(\"timeseries\", output_format=OutputFormat.POLARS).data\n", + "pandas_from_polars = polars_df.to_pandas()\n", + "print(f\"Type: {type(pandas_from_polars)}\")\n", + "print(\"Note: Index information is lost when converting from Polars to Pandas\")\n", + "pandas_from_polars.head()" + ] + }, + { + "cell_type": "markdown", + "id": "26e50f31-7545-4a85-a15f-1b81548ebb7f", + "metadata": {}, + "source": [ + "# Working with Dynamic Schema\n", + "\n", + "Dynamic schema libraries work seamlessly with Arrow output formats, including processing operations." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "572d3934-bc87-4b27-9a2c-1abd739ab4bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Promoted dtype: Int16\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (6, 1)
col1
i16
0
1
2
3
4
5
" + ], + "text/plain": [ + "shape: (6, 1)\n", + "┌──────┐\n", + "│ col1 │\n", + "│ --- │\n", + "│ i16 │\n", + "╞══════╡\n", + "│ 0 │\n", + "│ 1 │\n", + "│ 2 │\n", + "│ 3 │\n", + "│ 4 │\n", + "│ 5 │\n", + "└──────┘" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a dynamic schema library\n", + "ac.delete_library(\"arrow_dynamic\")\n", + "lib_dyn = ac.create_library(\"arrow_dynamic\", LibraryOptions(dynamic_schema=True))\n", + "\n", + "## Type Promotion\n", + "\n", + "df_0 = pd.DataFrame({\"col1\": np.arange(3, dtype=np.uint8)})\n", + "df_1 = pd.DataFrame({\"col1\": np.arange(3, 6, dtype=np.int16)})\n", + "lib_dyn.write(\"type_promo\", df_0)\n", + "lib_dyn.append(\"type_promo\", df_1)\n", + "\n", + "polars_df = lib_dyn.read(\"type_promo\", output_format=OutputFormat.POLARS).data\n", + "print(f\"Promoted dtype: {polars_df['col1'].dtype}\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "33bd8187-7598-4ae1-aa5a-4f635a5d83d4", + "metadata": {}, + "source": [ + "## Missing Columns\n", + "\n", + "Columns missing from some segments are backfilled with nulls:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4a80d248-549c-4cd8-8b36-c0a082fa8055", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing columns filled with nulls:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (4, 2)
col_acol_b
i64str
1null
2null
null"x"
null"y"
" + ], + "text/plain": [ + "shape: (4, 2)\n", + "┌───────┬───────┐\n", + "│ col_a ┆ col_b │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════╪═══════╡\n", + "│ 1 ┆ null │\n", + "│ 2 ┆ null │\n", + "│ null ┆ x │\n", + "│ null ┆ y │\n", + "└───────┴───────┘" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_0 = pd.DataFrame({\"col_a\": [1, 2]})\n", + "df_1 = pd.DataFrame({\"col_b\": [\"x\", \"y\"]})\n", + "lib_dyn.write(\"missing_cols\", df_0)\n", + "lib_dyn.append(\"missing_cols\", df_1)\n", + "\n", + "polars_df = lib_dyn.read(\"missing_cols\", output_format=OutputFormat.POLARS).data\n", + "print(\"Missing columns filled with nulls:\")\n", + "polars_df" + ] + }, + { + "cell_type": "markdown", + "id": "69e0d041", + "metadata": {}, + "source": [ + "## Processing with Dynamic Schema\n", + "\n", + "Processing operations work with both static and dynamic schema libraries.\n", + "All missing values from processing are filled with nulls:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "74c6cb24", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result type: \n", + "Filtered rows: 12\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (12, 4)
valuefloat_1float_2combined_float
i64f64f64f64
-10.0null0.0
21.0nullnull
-13.0null3.0
24.0nullnull
-16.0null6.0
8null1.01.0
-5null3.0null
8null4.04.0
-5null6.0null
8null7.07.0
" + ], + "text/plain": [ + "shape: (12, 4)\n", + "┌───────┬─────────┬─────────┬────────────────┐\n", + "│ value ┆ float_1 ┆ float_2 ┆ combined_float │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════╪═════════╪═════════╪════════════════╡\n", + "│ -1 ┆ 0.0 ┆ null ┆ 0.0 │\n", + "│ 2 ┆ 1.0 ┆ null ┆ null │\n", + "│ -1 ┆ 3.0 ┆ null ┆ 3.0 │\n", + "│ 2 ┆ 4.0 ┆ null ┆ null │\n", + "│ -1 ┆ 6.0 ┆ null ┆ 6.0 │\n", + "│ … ┆ … ┆ … ┆ … │\n", + "│ 8 ┆ null ┆ 1.0 ┆ 1.0 │\n", + "│ -5 ┆ null ┆ 3.0 ┆ null │\n", + "│ 8 ┆ null ┆ 4.0 ┆ 4.0 │\n", + "│ -5 ┆ null ┆ 6.0 ┆ null │\n", + "│ 8 ┆ null ┆ 7.0 ┆ 7.0 │\n", + "└───────┴─────────┴─────────┴────────────────┘" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_proc_1 = pd.DataFrame({\"value\": [-1, 2, 10] * 3, \"float_1\": np.arange(9, dtype=np.float64)})\n", + "df_proc_2 = pd.DataFrame({\"value\": [-5, 8, 20] * 3, \"float_2\": np.arange(9, dtype=np.float64)})\n", + "lib_dyn.write(\"missing_cols\", df_proc_1)\n", + "lib_dyn.append(\"missing_cols\", df_proc_2)\n", + "\n", + "# Use lazy processing with Polars output\n", + "lazy_df = lib_dyn.read(\"missing_cols\", lazy=True, output_format=OutputFormat.POLARS)\n", + "lazy_df = lazy_df[lazy_df[\"value\"] < 10] # Filter based on column values\n", + "lazy_df[\"combined_float\"] = where(lazy_df[\"value\"] < 0, lazy_df[\"float_1\"], lazy_df[\"float_2\"]) # Project a column with interleaved nulls\n", + "result = lazy_df.collect().data\n", + "\n", + "print(f\"Result type: {type(result)}\")\n", + "print(f\"Filtered rows: {len(result)}\")\n", + "result" + ] + }, + { + "cell_type": "markdown", + "id": "bfd7ed36", + "metadata": {}, + "source": [ + "# Performance Benchmarks\n", + "\n", + "Arrow-based formats provide performance improvements for string-heavy data (because with Arrow we don't need to take the GIL).\n", + "\n", + "For numeric data arrow based formats are in line with pandas performance." 
+ ] + }, + { + "cell_type": "markdown", + "id": "83751c3c", + "metadata": {}, + "source": [ + "## Benchmark: Numeric Data" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "1bcb7858", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pandas: 466.19 ms\n", + "Polars: 441.10 ms\n", + "Speedup: 1.06x\n" + ] + } + ], + "source": [ + "import timeit\n", + "\n", + "# Create numeric dataset\n", + "df_numeric = pd.DataFrame({\n", + " f\"col_{i}\": np.random.randn(10_000_000) for i in range(10)\n", + "})\n", + "lib.write(\"bench_numeric\", df_numeric)\n", + "\n", + "# Benchmark\n", + "pandas_time = timeit.timeit(\n", + " lambda: lib.read(\"bench_numeric\", output_format=OutputFormat.PANDAS).data,\n", + " number=10\n", + ") / 10\n", + "\n", + "polars_time = timeit.timeit(\n", + " lambda: lib.read(\"bench_numeric\", output_format=OutputFormat.POLARS).data,\n", + " number=10\n", + ") / 10\n", + "\n", + "print(f\"Pandas: {pandas_time*1000:.2f} ms\")\n", + "print(f\"Polars: {polars_time*1000:.2f} ms\")\n", + "print(f\"Speedup: {pandas_time/polars_time:.2f}x\")" + ] + }, + { + "cell_type": "markdown", + "id": "90c77511", + "metadata": {}, + "source": [ + "## Benchmark: String Data\n", + "\n", + "String performance shows big improvements (no GIL required for Arrow):" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "dece965c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pandas: 1662.48 ms\n", + "Polars: 694.08 ms\n", + "Speedup: 2.40x\n" + ] + } + ], + "source": [ + "\n", + "df_strings = pd.DataFrame({\n", + " f\"col_{i}\": np.random.randn(1_000_000).astype(str) for i in range(10)\n", + "})\n", + "lib.write(\"bench_strings\", df_strings)\n", + "\n", + "# Benchmark\n", + "pandas_time = timeit.timeit(\n", + " lambda: lib.read(\"bench_strings\", output_format=OutputFormat.PANDAS).data,\n", + " number=10\n", + ") / 10\n", + "\n", + "polars_time 
= timeit.timeit(\n", + " lambda: lib.read(\"bench_strings\", output_format=OutputFormat.POLARS).data,\n", + " number=10\n", + ") / 10\n", + "\n", + "print(f\"Pandas: {pandas_time*1000:.2f} ms\")\n", + "print(f\"Polars: {polars_time*1000:.2f} ms\")\n", + "print(f\"Speedup: {pandas_time/polars_time:.2f}x\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "311-adb-editable", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/.asv/results/benchmarks.json b/python/.asv/results/benchmarks.json index 0c58f1e39b..1409507a27 100644 --- a/python/.asv/results/benchmarks.json +++ b/python/.asv/results/benchmarks.json @@ -1,6 +1,6 @@ { "arrow.ArrowNumeric.peakmem_read": { - "code": "class ArrowNumeric:\n def peakmem_read(self, rows, date_range):\n self.lib.read(self.symbol_name(rows), date_range=self.date_range)\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", + "code": "class ArrowNumeric:\n def peakmem_read(self, rows, date_range):\n self.lib.read(self.symbol_name(rows), 
date_range=self.date_range)\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", "name": "arrow.ArrowNumeric.peakmem_read", "param_names": [ "rows", @@ -20,10 +20,10 @@ "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "67ab80e297890f213706b5160255c969b0aee853adfd6ae53655c45b7e845f85" + "version": "aee945c1358b8a4c2dd274815ceb80a3cfaa085451149386f6674020b904feed" }, "arrow.ArrowNumeric.peakmem_write": { - "code": "class ArrowNumeric:\n def peakmem_write(self, rows, date_range):\n self.fresh_lib.write(f\"sym_{rows}\", self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", + "code": "class 
ArrowNumeric:\n def peakmem_write(self, rows, date_range):\n self.fresh_lib.write(f\"sym_{rows}\", self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", "name": "arrow.ArrowNumeric.peakmem_write", "param_names": [ "rows", @@ -43,10 +43,10 @@ "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "b28a54936bacb902a56cad0b9d235bd3e13ac04ac37687cf265c5a987b7ea2d1" + "version": "87b2ccbc5cf2ef33d45f934f653aa03f853a30fbadfa4710abb2018124b53386" }, "arrow.ArrowNumeric.time_read": { - "code": "class ArrowNumeric:\n def time_read(self, rows, date_range):\n self.lib.read(self.symbol_name(rows), date_range=self.date_range)\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n 
self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", + "code": "class ArrowNumeric:\n def time_read(self, rows, date_range):\n self.lib.read(self.symbol_name(rows), date_range=self.date_range)\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", "min_run_count": 2, "name": "arrow.ArrowNumeric.time_read", "number": 20, @@ -71,11 +71,11 @@ "timeout": 6000, "type": "time", "unit": "seconds", - "version": "ecb51f067ac7d40c4b9c9df11fa1a0b1f3c682a14f1ee974cf8c3eb9fcfd86d7", + "version": "aaf529494b84152ccaca0f605f0f2a9bfad037367e50e96ae5d906ce6b26741c", "warmup_time": 0 }, "arrow.ArrowNumeric.time_write": { - "code": "class ArrowNumeric:\n def time_write(self, rows, date_range):\n self.fresh_lib.write(f\"sym_{rows}\", self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = 
pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", + "code": "class ArrowNumeric:\n def time_write(self, rows, date_range):\n self.fresh_lib.write(f\"sym_{rows}\", self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = pa.Table.from_pandas(generate_pseudo_random_dataframe(rows))\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", "min_run_count": 2, "name": "arrow.ArrowNumeric.time_write", "number": 20, @@ -100,11 +100,11 @@ "timeout": 6000, "type": "time", "unit": "seconds", - "version": "b0a49f584f44c8451a2fda903df76351a10368b81d639239cfa3023e9dee737b", + "version": "3511ddd99480aff7cc0c4c13d2809e213c523fdbe74137ce79f7483297e5686f", "warmup_time": 0 }, "arrow.ArrowStrings.peakmem_read": { - "code": "class ArrowStrings:\n def peakmem_read(self, rows, date_range, unique_string_count, arrow_string_format):\n self.lib.read(\n self.symbol_name(rows, unique_string_count),\n date_range=self.date_range,\n arrow_string_format_default=arrow_string_format,\n )\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n 
self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", + "code": "class ArrowStrings:\n def peakmem_read(self, rows, date_range, unique_string_count, arrow_string_format):\n self.lib.read(\n self.symbol_name(rows, unique_string_count),\n date_range=self.date_range,\n arrow_string_format_default=arrow_string_format,\n )\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", "name": "arrow.ArrowStrings.peakmem_read", "param_names": [ "rows", @@ -137,10 +137,10 @@ "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "a273a5f3619b20f832ece06f1ee4650c4c4bb76e6de07702968a2098d39a1bab" + "version": "5bd5060492fc565ecb99ad5215bf454d71ca4cfedcd6f970efd61e04bec1fb7d" }, "arrow.ArrowStrings.peakmem_write": { - "code": "class ArrowStrings:\n def peakmem_write(self, rows, date_range, unique_string_count, arrow_string_format):\n # No point in 
running with all read time options\n if date_range is None and arrow_string_format == ArrowOutputStringFormat.CATEGORICAL:\n self.fresh_lib.write(self.symbol_name(rows, unique_string_count), self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", + "code": "class ArrowStrings:\n def peakmem_write(self, rows, date_range, unique_string_count, arrow_string_format):\n # No point in running with all read time options\n if date_range is None and arrow_string_format == ArrowOutputStringFormat.CATEGORICAL:\n self.fresh_lib.write(self.symbol_name(rows, unique_string_count), self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def 
setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", "name": "arrow.ArrowStrings.peakmem_write", "param_names": [ "rows", @@ -173,10 +173,10 @@ "timeout": 6000, "type": "peakmemory", "unit": "bytes", - "version": "d1fa54f3f898dac7bbb27a41353877938711c2470afc8d5ad170bd8583f352d4" + "version": "d473ffa788d420e51973cdce360a1537aee61660dd9e06937a494ac93cc88597" }, "arrow.ArrowStrings.time_read": { - "code": "class ArrowStrings:\n def time_read(self, rows, date_range, unique_string_count, arrow_string_format):\n self.lib.read(\n self.symbol_name(rows, unique_string_count),\n date_range=self.date_range,\n arrow_string_format_default=arrow_string_format,\n )\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", + "code": "class ArrowStrings:\n def time_read(self, rows, date_range, unique_string_count, arrow_string_format):\n self.lib.read(\n self.symbol_name(rows, unique_string_count),\n date_range=self.date_range,\n arrow_string_format_default=arrow_string_format,\n )\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n 
self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", "min_run_count": 2, "name": "arrow.ArrowStrings.time_read", "number": 20, @@ -214,11 +214,11 @@ "timeout": 6000, "type": "time", "unit": "seconds", - "version": "8b5591c39110f8a625f84894f4f49f6e757449018c652c5644b63f94a4ea95ea", + "version": "b1977c6215c99d47d950dc30637270fadbf49c504f388ef9360eba4ea558a0ab", "warmup_time": 0 }, "arrow.ArrowStrings.time_write": { - "code": "class ArrowStrings:\n def time_write(self, rows, date_range, unique_string_count, arrow_string_format):\n # No point in running with all read time options\n if date_range is None and arrow_string_format == ArrowOutputStringFormat.CATEGORICAL:\n self.fresh_lib.write(self.symbol_name(rows, unique_string_count), self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n 
self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", + "code": "class ArrowStrings:\n def time_write(self, rows, date_range, unique_string_count, arrow_string_format):\n # No point in running with all read time options\n if date_range is None and arrow_string_format == ArrowOutputStringFormat.CATEGORICAL:\n self.fresh_lib.write(self.symbol_name(rows, unique_string_count), self.table, index_column=\"ts\")\n\n def setup(self, rows, date_range, unique_string_count, arrow_string_format):\n self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW)\n self.lib = self.ac.get_library(self.lib_name_prewritten)\n self.lib._nvs._set_allow_arrow_input()\n if date_range is None:\n self.date_range = None\n else:\n # Create a date range that excludes the first and last 10 rows of the data only\n self.date_range = (pd.Timestamp(10), pd.Timestamp(rows - 10))\n self.fresh_lib = self.get_fresh_lib()\n self.fresh_lib._nvs._set_allow_arrow_input()\n self.table = self._generate_table(rows, self.num_cols, unique_string_count)\n\n def setup_cache(self):\n start = time.time()\n self._setup_cache()\n self.logger.info(f\"SETUP_CACHE TIME: {time.time() - start}\")", "min_run_count": 2, "name": "arrow.ArrowStrings.time_write", "number": 20, @@ -256,7 +256,7 @@ "timeout": 6000, "type": "time", "unit": "seconds", - "version": "05391bc18c869bb06480f5117f87ac7cc6a90d50acd8405162c9e935f0de14b0", + "version": "13495c9c867964a946faa9f0877d374bafcae7e1a524eacefb708a7f8fa10e92", "warmup_time": 0 }, "basic_functions.BasicFunctions.peakmem_read": { diff --git a/python/arcticdb/arctic.py b/python/arcticdb/arctic.py index 053dff77c1..2849b41e31 100644 --- a/python/arcticdb/arctic.py +++ b/python/arcticdb/arctic.py @@ -81,17 +81,14 @@ def __init__( Can be overridden by specifying the encoding version in the LibraryOptions argument to create_library. 
output_format: Union[OutputFormat, str], default = OutputFormat.PANDAS - Controls the default output format of all operations returning a dataframe. - The default behavior (OutputFormat.PANDAS) is to return `pandas.DataFrame`s or `pandas.Series` backed by - numpy arrays. - OutputFormat.EXPERIMENTAL_ARROW will return all dataframes as `pyarrow.Table`s. The arrow API is still - experimental and the arrow layout might change in a minor release. - Accepts the OutputFormat as either OutputFormat enum values or as case-insensitive strings like "pandas" - and "experimental_arrow". - - arrow_string_format_default: Union[ArrowOutputStringFormat, "pa.DataType"] = ArrowOutputStringFormat.LARGE_STRING - Controls the default string format used for `OutputFormat.EXPERIMENTAL_ARROW`. - See documentation of `ArrowOutputStringFormat` for more information on the different options. + Default output format for all read operations on libraries created from this `Arctic` instance. + Can be overridden per library or per read operation. + See `OutputFormat` documentation for details on available formats. + + arrow_string_format_default: Union[ArrowOutputStringFormat, "pa.DataType"], default = ArrowOutputStringFormat.LARGE_STRING + Default string column format when using `PYARROW` or `POLARS` output formats. + Can be overridden per library or per read operation. + See `ArrowOutputStringFormat` documentation for details on available string formats. Examples -------- @@ -196,14 +193,16 @@ def get_library( Unused if create_if_missing is False. output_format: Optional[Union[OutputFormat, str]], default = None - Controls the default output format of all operations on the library returning a dataframe. - For more information see documentation of `Arctic.__init__`. - If `None` uses the output format from the Arctic instance. + Default output format for all read operations on this library. + If `None`, uses the output format from the `Arctic` instance. + Can be overridden per read operation. 
+ See `OutputFormat` documentation for details on available formats. arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None - If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow. - See documentation of `ArrowOutputStringFormat` for more information on the different options. - If `None` uses the default arrow_string_format from the `Library` instance. + Default string column format when using `PYARROW` or `POLARS` output formats on this library. + If `None`, uses the `arrow_string_format_default` from the `Arctic` instance. + Can be overridden per read operation. + See `ArrowOutputStringFormat` documentation for details on available string formats. Examples -------- @@ -268,14 +267,17 @@ def create_library( EnterpriseLibraryOptions. These options are only relevant to ArcticDB enterprise users. output_format: Optional[Union[OutputFormat, str]], default = None - Controls the default output format of all operations on the library returning a dataframe. - For more information see documentation of `Arctic.__init__`. - If `None` uses the output format from the Arctic instance. + Default output format for all read operations on this library. + If `None`, uses the output format from the `Arctic` instance. + Can be overridden per read operation. + See `OutputFormat` documentation for details on available formats. arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None - If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow. - See documentation of `ArrowOutputStringFormat` for more information on the different options. - If `None` uses the default arrow_string_format from the `Library` instance. + Default string column format when using `PYARROW` or `POLARS` output formats on this library. + If `None`, uses the `arrow_string_format_default` from the `Arctic` instance. 
+ Can be overridden per read operation. + See `ArrowOutputStringFormat` documentation for details on available string formats. + Note that this setting is only applied to the runtime `Library` instance and is not stored as part of the library configuration. Examples -------- diff --git a/python/arcticdb/options.py b/python/arcticdb/options.py index 0b22077c80..66899aac0c 100644 --- a/python/arcticdb/options.py +++ b/python/arcticdb/options.py @@ -169,21 +169,40 @@ def __repr__(self): # TODO: Use enum.StrEnum when we no longer need to support python 3.9 class OutputFormat(str, Enum): + """ + Controls the output format of operations which return dataframes. All APIs which take an `output_format` argument + accept the enum values and case-insensitive strings. E.g. all of `OutputFormat.PYARROW`, `"PYARROW"`, `"pyarrow"` + will be interpreted as `OutputFormat.PYARROW`. + + PANDAS (default): + Dataframes are returned as `pandas.DataFrame` or `pandas.Series` objects backed by numpy arrays. + + PYARROW: + Dataframes are returned as `pyarrow.Table` objects using Apache Arrow's columnar memory format. + Provides better performance than `PANDAS`, especially for dataframes containing many string columns. + String format can be customized via `ArrowOutputStringFormat`. + + POLARS: + Dataframes are returned as `polars.DataFrame` objects using Apache Arrow's columnar memory format. + Provides better performance than `PANDAS`, especially for dataframes containing many string columns. + String format can be customized via `ArrowOutputStringFormat`. 
+ """ + PANDAS = "PANDAS" - EXPERIMENTAL_ARROW = "EXPERIMENTAL_ARROW" - EXPERIMENTAL_POLARS = "EXPERIMENTAL_POLARS" + PYARROW = "PYARROW" + POLARS = "POLARS" def output_format_to_internal(output_format: Union[OutputFormat, str]) -> InternalOutputFormat: if output_format.lower() == OutputFormat.PANDAS.lower(): return InternalOutputFormat.PANDAS - elif output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower(): + elif output_format.lower() == OutputFormat.PYARROW.lower(): if not _PYARROW_AVAILABLE: raise ModuleNotFoundError( "ArcticDB's pyarrow optional dependency missing but is required to use arrow output format." ) return InternalOutputFormat.ARROW - elif output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower(): + elif output_format.lower() == OutputFormat.POLARS.lower(): if not _PYARROW_AVAILABLE or not _POLARS_AVAILABLE: raise ModuleNotFoundError( "ArcticDB's pyarrow or polars optional dependencies are missing but are required to use polars output format." @@ -195,24 +214,31 @@ def output_format_to_internal(output_format: Union[OutputFormat, str]) -> Intern class ArrowOutputStringFormat(str, Enum): """ - Used to specify string format when output_format=OutputFormat.EXPERIMENTAL_ARROW. - Arguments allow specifying either the enum value or the corresponding pyarrow.DataType + Controls the string column format when using `PYARROW` or `POLARS` output formats. + Accepts either the enum value or the corresponding `pyarrow.DataType`. LARGE_STRING (default): - Produces string columns with type `pa.large_string()`. Total length of strings must fit in a 64-bit integer. - Does not deduplicate strings, so has better performance for columns with many unique strings. + Uses 64-bit variable-size encoding. + PyArrow: `pa.large_string()`, Polars: `pl.String` + Supports up to 2⁶³-1 bytes total string length per Arrow array. + Best for general-purpose use and when working with large string data. SMALL_STRING: - Produces string columns with type `pa.string()`. 
Total length of strings must fit in a 32-bit integer. - Does not deduplicate strings, so has better performance for columns with many unique strings. - Slightly faster than `LARGE_STRING` but does not work with very long strings. + Uses 32-bit variable-size encoding. + PyArrow: `pa.string()`, Polars: Not supported + Supports up to 2³¹-1 bytes total string length per Arrow array. + Only supported with PyArrow because Polars does not support small strings. + Slightly more memory efficient than `LARGE_STRING` when string data is known to be small. CATEGORICAL and DICTIONARY_ENCODED: - Both are different aliases for the same string format. Produces string columns with type - `pa.dictionary(pa.int32(), pa.large_string())`. Total length of strings must fit in a 64-bit integer. Splitting in - record batches guarantees that 32-bit dictionary keys are sufficient. - Does deduplicate strings, so has better performance for columns with few unique strings. - + Both are aliases for dictionary-encoded strings with int32 indices. + PyArrow: `pa.dictionary(pa.int32(), pa.large_string())`, Polars: `pl.Categorical` + Best for columns with low cardinality (few unique values repeated many times). + Deduplicates strings, reducing memory usage and improving performance when the number of + unique values is much smaller than the total number of rows. 
+ + For more details on physical layouts, see the Apache Arrow specification: + https://arrow.apache.org/docs/format/Columnar.html """ CATEGORICAL = "CATEGORICAL" @@ -222,7 +248,7 @@ class ArrowOutputStringFormat(str, Enum): def arrow_output_string_format_to_internal( - arrow_string_format: Union[ArrowOutputStringFormat, "pa.DataType"], + arrow_string_format: Union[ArrowOutputStringFormat, "pa.DataType"], output_format: Union[OutputFormat, str] ) -> InternalArrowOutputStringFormat: if ( arrow_string_format == ArrowOutputStringFormat.CATEGORICAL @@ -242,6 +268,10 @@ def arrow_output_string_format_to_internal( or _PYARROW_AVAILABLE and arrow_string_format == pa.string() ): + if output_format.lower() == OutputFormat.POLARS.lower(): + raise ValueError( + "SMALL_STRING is not supported with POLARS output format. Please use LARGE_STRING instead." + ) return InternalArrowOutputStringFormat.SMALL_STRING else: raise ValueError(f"Unkown ArrowOutputStringFormat: {arrow_string_format}") diff --git a/python/arcticdb/util/arrow.py b/python/arcticdb/util/arrow.py index ce6b9d2964..d0f1375161 100644 --- a/python/arcticdb/util/arrow.py +++ b/python/arcticdb/util/arrow.py @@ -25,7 +25,7 @@ def stringify_dictionary_encoded_columns(table, string_type=None): def convert_arrow_to_pandas_for_tests(table): """ - Converts `pa.Table` outputted via `output_format=OutputFormat.EXPERIMENTAL_ARROW` to a `pd.DataFrame` so it would + Converts `pa.Table` outputted via `output_format=OutputFormat.PYARROW` to a `pd.DataFrame` so it would be identical to the one outputted via `output_format=OutputFormat.PANDAS`. This requires the following changes: - Replaces dictionary encoded string columns with regular string columns. - Fills null values in int columns with zeros. 
diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index ad8ffe7c88..a926291fff 100644 --- a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -792,7 +792,6 @@ def generate_original_column_names(): pandas_meta = norm_meta.df.common elif input_type == "series": # For pandas series we always return a dataframe (to not lose the index information). - # TODO: Return a `pyarrow.Array` if index is not physically stored (Monday ref: 9360502457) pandas_meta = norm_meta.series.common elif input_type == "experimental_arrow": if norm_meta.experimental_arrow.has_index: diff --git a/python/arcticdb/version_store/_store.py b/python/arcticdb/version_store/_store.py index f1776517b5..95507b59c5 100644 --- a/python/arcticdb/version_store/_store.py +++ b/python/arcticdb/version_store/_store.py @@ -385,7 +385,7 @@ def _set_allow_arrow_input(self, allow_arrow_input: bool = True): def _set_output_format_for_pipeline_tests(self, output_format): self.set_output_format(output_format) - if output_format == OutputFormat.EXPERIMENTAL_ARROW: + if output_format == OutputFormat.PYARROW: self._test_convert_arrow_back_to_pandas = True @classmethod @@ -1284,21 +1284,17 @@ def batch_read( For more information see the documentation for the QueryBuilder class. i-th entry corresponds to i-th element of `symbols`. arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None - Controls the default string format used for `ARROW` or `POLARS` output format. - See documentation of `ArrowOutputStringFormat` for more information on the different options. - It serves as the default for the entire batch. - arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None, - Controls the string format per column used for `ARROW` or `POLARS` output format. 
- See documentation of `ArrowOutputStringFormat` for more information on the different options. - It is applied to all symbols which don't have a `per_symbol_arrow_string_format_per_column` set. - per_symbol_arrow_string_format_default: Optional[List[Optional[Union[ArrowOutputStringFormat, "pa.DataType"]]]], default=None, - Controls the string format per column used for `ARROW` or `POLARS` output format. - See documentation of `ArrowOutputStringFormat` for more information on the different options. - It serves as the default per symbol. It overrides the global `arrow_string_format_default` setting - per_symbol_arrow_string_format_per_column: Optional[List[Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]]]], default=None, - Controls the string format per column used for `ARROW` or `POLARS` output format. - See documentation of `ArrowOutputStringFormat` for more information on the different options. - It defines the setting per symbol and per column. It overrides all other string format settings. + String column format when using `PYARROW` or `POLARS` output formats. Serves as the default for the entire batch. + See `ArrowOutputStringFormat` documentation for details on available string formats. + arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None + Per-column overrides for `arrow_string_format_default`. Keys are column names. + Applied to all symbols without a `per_symbol_arrow_string_format_per_column` set. + per_symbol_arrow_string_format_default: Optional[List[Optional[Union[ArrowOutputStringFormat, "pa.DataType"]]]], default=None + Per-symbol override for `arrow_string_format_default`. Overrides the batch-level default. + i-th entry corresponds to i-th element of `symbols`. + per_symbol_arrow_string_format_per_column: Optional[List[Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]]]], default=None + Per-symbol, per-column overrides. 
Takes precedence over all other string format settings. + i-th entry corresponds to i-th element of `symbols`. Examples -------- @@ -2159,12 +2155,13 @@ def _get_read_options_and_output_format( proto_cfg, global_default=ArrowOutputStringFormat.LARGE_STRING, **kwargs, - ) + ), + output_format, ) ) read_options.set_arrow_output_per_column_string_format( { - key: arrow_output_string_format_to_internal(value) + key: arrow_output_string_format_to_internal(value, output_format) for key, value in resolve_defaults( "arrow_string_format_per_column", proto_cfg, global_default={}, **kwargs ).items() @@ -2728,8 +2725,8 @@ def _adapt_frame_data(self, frame_data, norm, output_format): ) if self._test_convert_arrow_back_to_pandas: data = convert_arrow_to_pandas_for_tests(data) - if output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower(): - data = pl.from_arrow(data) + if output_format.lower() == OutputFormat.POLARS.lower(): + data = pl.from_arrow(data, rechunk=False) else: data = self._normalizer.denormalize(frame_data, norm) if norm.HasField("custom"): diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index c60fddcd42..5b8ff09b8c 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -2026,17 +2026,17 @@ def read( on `LazyDataFrame` for more details. output_format: Optional[Union[OutputFormat, str]], default=None - Controls the output format of the result dataframe. - For more information see documentation of `Arctic.__init__`. - If `None` uses the default output format from the `Library` instance. + Output format for the returned dataframe. + If `None`, uses the output format from the `Library` instance. + See `OutputFormat` documentation for details on available formats. arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None - If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow. 
- See documentation of `ArrowOutputStringFormat` for more information on the different options. - If `None` uses the default arrow_string_format from the `Library` instance. + String column format when using `PYARROW` or `POLARS` output formats. + If `None`, uses the `arrow_string_format_default` from the `Library` instance. + See `ArrowOutputStringFormat` documentation for details on available string formats. arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None - Provides per column name overrides for `arrow_string_format_default` + Per-column overrides for `arrow_string_format_default`. Keys are column names. Returns ------- @@ -2063,10 +2063,9 @@ def read( 1 6 2 7 - Passing an output_format can change the resulting dataframe type. E.g. we can use the experimental arrow output - format: + Passing an output_format can change the resulting dataframe type. For example, to return a PyArrow table: - >>> lib.read("symbol", output_format="EXPERIMENTAL_ARROW").data + >>> lib.read("symbol", output_format="PYARROW").data pyarrow.Table column: int64 ---- @@ -2128,19 +2127,20 @@ def read_batch( documentation on `LazyDataFrameCollection` for more details. output_format: Optional[Union[OutputFormat, str]], default=None - Controls the output format of the result dataframes. - For more information see documentation of `Arctic.__init__`. - If `None` uses the default output format from the `Library` instance. + Output format for the returned dataframes. + If `None`, uses the output format from the `Library` instance. + See `OutputFormat` documentation for details on available formats. arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None - If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow. - See documentation of `ArrowOutputStringFormat` for more information on the different options. 
- It serves as the default for the entire batch. The string format settings inside the `ReadRequest`s will - override this batch level setting. + String column format when using `PYARROW` or `POLARS` output formats. + Serves as the default for the entire batch. String format settings in individual `ReadRequest` objects + override this batch-level setting. + If `None`, uses the `arrow_string_format_default` from the `Library` instance. + See `ArrowOutputStringFormat` documentation for details on available string formats. - arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None, - Provides per column name overrides for `arrow_string_format_default`. It is only applied to symbols which - don't have a `arrow_string_format_per_column` set in their `ReadRequest`. + arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None + Per-column overrides for `arrow_string_format_default`. Keys are column names. + Only applied to symbols that don't have `arrow_string_format_per_column` set in their `ReadRequest`. Returns ------- @@ -2309,17 +2309,17 @@ def read_batch_and_join( individual dataframes, and will be applied to the joined data. output_format: Optional[Union[OutputFormat, str]], default=None - Controls the output format of the result dataframe. - For more information see documentation of `Arctic.__init__`. - If `None` uses the default output format from the `Library` instance. + Output format for the returned joined dataframe. + If `None`, uses the output format from the `Library` instance. + See `OutputFormat` documentation for details on available formats. arrow_string_format_default: Optional[Union[ArrowOutputStringFormat, "pa.DataType"]], default=None - If using `output_format=EXPERIMENTAL_ARROW` it sets the output format of string columns for arrow. - See documentation of `ArrowOutputStringFormat` for more information on the different options. 
- If `None` uses the default arrow_string_format from the `Library` instance. + String column format when using `PYARROW` or `POLARS` output formats. + If `None`, uses the `arrow_string_format_default` from the `Library` instance. + See `ArrowOutputStringFormat` documentation for details on available string formats. arrow_string_format_per_column: Optional[Dict[str, Union[ArrowOutputStringFormat, "pa.DataType"]]], default=None - Provides per column name overrides for `arrow_string_format_default` + Per-column overrides for `arrow_string_format_default`. Keys are column names. Returns ------- diff --git a/python/benchmarks/arrow.py b/python/benchmarks/arrow.py index 3476722a3f..1c2371c29b 100644 --- a/python/benchmarks/arrow.py +++ b/python/benchmarks/arrow.py @@ -41,7 +41,7 @@ def setup_cache(self): self.logger.info(f"SETUP_CACHE TIME: {time.time() - start}") def _setup_cache(self): - self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW) + self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW) num_rows, date_ranges = self.params num_cols = 9 # 10 including the index column self.ac.delete_library(self.lib_name_prewritten) @@ -63,7 +63,7 @@ def teardown(self, rows, date_range): del self.ac def setup(self, rows, date_range): - self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW) + self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW) self.lib = self.ac.get_library(self.lib_name_prewritten) self.lib._nvs._set_allow_arrow_input() if date_range is None: @@ -126,7 +126,7 @@ def _generate_table(self, num_rows, num_cols, unique_string_count): return pa.Table.from_pandas(df) def _setup_cache(self): - self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW) + self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW) num_rows, date_ranges, unique_string_counts, arrow_string_format = self.params 
self.ac.delete_library(self.lib_name_prewritten) self.ac.create_library(self.lib_name_prewritten) @@ -145,7 +145,7 @@ def teardown(self, rows, date_range, unique_string_count, arrow_string_format): del self.ac def setup(self, rows, date_range, unique_string_count, arrow_string_format): - self.ac = Arctic(self.connection_string, output_format=OutputFormat.EXPERIMENTAL_ARROW) + self.ac = Arctic(self.connection_string, output_format=OutputFormat.PYARROW) self.lib = self.ac.get_library(self.lib_name_prewritten) self.lib._nvs._set_allow_arrow_input() if date_range is None: diff --git a/python/tests/conftest.py b/python/tests/conftest.py index a795212cb6..4218c9ac06 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -1101,7 +1101,7 @@ def lmdb_version_store_v2(version_store_factory, lib_name) -> NativeVersionStore @pytest.fixture def lmdb_version_store_arrow(lmdb_version_store_v1) -> NativeVersionStore: store = lmdb_version_store_v1 - store.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + store.set_output_format(OutputFormat.PYARROW) store._set_allow_arrow_input() return store @@ -1109,9 +1109,7 @@ def lmdb_version_store_arrow(lmdb_version_store_v1) -> NativeVersionStore: # Explicitly not including `OutputFormat.EXPERIMENTAL_POLARS` as `polars.to_pandas()` is not index aware, so all # `assert_frame_equal_with_arrow` would not work. Also POLARS is just a thin wrapper on top of pyarrow, so testing # just one is sufficent. 
-@pytest.fixture( - params=[OutputFormat.PANDAS, pytest.param(OutputFormat.EXPERIMENTAL_ARROW, marks=PYARROW_POST_PROCESSING)] -) +@pytest.fixture(params=[OutputFormat.PANDAS, pytest.param(OutputFormat.PYARROW, marks=PYARROW_POST_PROCESSING)]) def any_output_format(request) -> OutputFormat: return request.param @@ -1186,7 +1184,7 @@ def lmdb_version_store_dynamic_schema( @pytest.fixture def lmdb_version_store_dynamic_schema_arrow(lmdb_version_store_dynamic_schema_v1) -> NativeVersionStore: store = lmdb_version_store_dynamic_schema_v1 - store.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + store.set_output_format(OutputFormat.PYARROW) store._set_allow_arrow_input() return store diff --git a/python/tests/integration/arcticdb/test_unicode_strings.py b/python/tests/integration/arcticdb/test_unicode_strings.py index ba3b78d35f..a2f1af505e 100644 --- a/python/tests/integration/arcticdb/test_unicode_strings.py +++ b/python/tests/integration/arcticdb/test_unicode_strings.py @@ -33,9 +33,7 @@ def test_fixed_width_blns(lmdb_version_store, any_arrow_string_format): vit = lib.read(symbol) assert_frame_equal(df, vit.data) # Arrow - received = cast_string_columns( - lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data, pa.string() - ).to_pandas() + received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.PYARROW).data, pa.string()).to_pandas() assert_frame_equal(df, received) @@ -53,7 +51,7 @@ def test_write_blns(lmdb_version_store, any_arrow_string_format): lib._set_allow_arrow_input() table = pa.Table.from_pandas(df) lib.write(symbol, table, index_column="ts") - received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data, pa.string()) + received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.PYARROW).data, pa.string()) assert table.equals(received) @@ -77,7 +75,7 @@ def test_append_blns(lmdb_version_store, any_arrow_string_format): table_second_half = 
pa.Table.from_pandas(df_second_half) lib.write(symbol, table_first_half, index_column="ts") lib.append(symbol, table_second_half, index_column="ts") - received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data, pa.string()) + received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.PYARROW).data, pa.string()) expected = pa.Table.from_pandas(df) assert expected.equals(received) @@ -105,7 +103,7 @@ def test_update_blns(lmdb_version_store, any_arrow_string_format): table_middle_half = pa.Table.from_pandas(df_middle_half) lib.write(symbol, table_removed_middle, index_column="ts") lib.update(symbol, table_middle_half, index_column="ts") - received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data, pa.string()) + received = cast_string_columns(lib.read(symbol, output_format=OutputFormat.PYARROW).data, pa.string()) expected = pa.Table.from_pandas(df) assert expected.equals(received) @@ -132,7 +130,7 @@ def test_batch_read_blns(lmdb_version_store, any_arrow_string_format): lib._set_allow_arrow_input() tables = [pa.Table.from_pandas(df) for df in dfs] lib.batch_write(symbols, tables, index_column_vector=["ts"] * num_symbols) - res = lib.batch_read(symbols, query_builder=qbs, output_format=OutputFormat.EXPERIMENTAL_ARROW) + res = lib.batch_read(symbols, query_builder=qbs, output_format=OutputFormat.PYARROW) expr = pa.compute.field("ints") > 50 for idx, sym in enumerate(symbols): expected = tables[idx] @@ -165,7 +163,7 @@ def test_recursive_normalizers_blns(lmdb_version_store_v1, any_arrow_string_form table = pa.Table.from_pandas(df) dict_data = {s: table for s in keys} lib.write(symbol, dict_data, recursive_normalizers=True) - received = lib.read(symbol, output_format=OutputFormat.EXPERIMENTAL_ARROW).data + received = lib.read(symbol, output_format=OutputFormat.PYARROW).data for key in keys: assert key in received.keys() assert table.equals(cast_string_columns(received[key], 
pa.string())) diff --git a/python/tests/unit/arcticdb/test_arrow_api.py b/python/tests/unit/arcticdb/test_arrow_api.py index a74c2fb0e8..135fbfb0f8 100644 --- a/python/tests/unit/arcticdb/test_arrow_api.py +++ b/python/tests/unit/arcticdb/test_arrow_api.py @@ -16,18 +16,18 @@ OutputFormat.PANDAS, "PANDAS", "pandas", - OutputFormat.EXPERIMENTAL_ARROW, - "EXPERIMENTAL_ARROW", - "experimental_arrow", - OutputFormat.EXPERIMENTAL_POLARS, - "EXPERIMENTAL_POLARS", - "experimental_polars", + OutputFormat.PYARROW, + "PYARROW", + "pyarrow", + OutputFormat.POLARS, + "POLARS", + "polars", ] no_str_output_format_args = [ None, OutputFormat.PANDAS, - OutputFormat.EXPERIMENTAL_ARROW, - OutputFormat.EXPERIMENTAL_POLARS, + OutputFormat.PYARROW, + OutputFormat.POLARS, ] @@ -37,9 +37,9 @@ def expected_output_type(arctic_output_format, library_output_format, output_for ) if expected_output_format.lower() == OutputFormat.PANDAS.lower(): return pd.DataFrame - if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_ARROW.lower(): + if expected_output_format.lower() == OutputFormat.PYARROW.lower(): return pa.Table - if expected_output_format.lower() == OutputFormat.EXPERIMENTAL_POLARS.lower(): + if expected_output_format.lower() == OutputFormat.POLARS.lower(): return pl.DataFrame raise ValueError("Unexpected format") @@ -177,7 +177,7 @@ def test_basic_modifications(lmdb_library, allow_arrow_input): lib.write(sym, write_table, index_column="ts") lib.append(sym, append_table, index_column="ts") lib.update(sym, update_table, index_column="ts") - received = lib.read(sym, output_format=OutputFormat.EXPERIMENTAL_ARROW).data + received = lib.read(sym, output_format=OutputFormat.PYARROW).data expected = pa.table( { "col": pa.array([1, 5, 6, 4], pa.int64()), @@ -222,7 +222,7 @@ def test_batch_modifications(lmdb_library, allow_arrow_input): lib.write_batch([WritePayload(sym, write_table, index_column="ts")]) lib.append_batch([WritePayload(sym, append_table, index_column="ts")]) 
lib.update_batch([UpdatePayload(sym, update_table, index_column="ts")]) - received = lib.read(sym, output_format=OutputFormat.EXPERIMENTAL_ARROW).data + received = lib.read(sym, output_format=OutputFormat.PYARROW).data expected = pa.table( { "col": pa.array([1, 5, 6, 4], pa.int64()), @@ -258,7 +258,7 @@ def test_write_pickle(lmdb_library, batch, allow_arrow_input): lib.write_pickle(sym, table) if allow_arrow_input: assert not lib._nvs.is_symbol_pickled(sym) - received = lib.read(sym, output_format=OutputFormat.EXPERIMENTAL_ARROW).data + received = lib.read(sym, output_format=OutputFormat.PYARROW).data assert table.equals(received) else: assert lib._nvs.is_symbol_pickled(sym) @@ -288,7 +288,7 @@ def test_stage(lmdb_library, allow_arrow_input): lib.stage(sym, table_0, index_column="ts") lib.stage(sym, table_1, index_column="ts") lib.finalize_staged_data(sym) - received = lib.read(sym, output_format=OutputFormat.EXPERIMENTAL_ARROW).data + received = lib.read(sym, output_format=OutputFormat.PYARROW).data expected = pa.table( { "col": pa.array([1, 2, 3, 4], pa.int64()), @@ -312,9 +312,7 @@ def test_stage(lmdb_library, allow_arrow_input): def test_read_arctic_strings( lmdb_storage, lib_name, arctic_str_format, library_str_format, read_str_format_default, read_str_format_per_column ): - ac = lmdb_storage.create_arctic( - output_format=OutputFormat.EXPERIMENTAL_ARROW, arrow_string_format_default=arctic_str_format - ) + ac = lmdb_storage.create_arctic(output_format=OutputFormat.PYARROW, arrow_string_format_default=arctic_str_format) lib = ac.create_library(lib_name, arrow_string_format_default=library_str_format) sym = "sym" df = pd.DataFrame({"col": ["some", "strings", "in", "this", "column"]}) @@ -345,7 +343,7 @@ def test_read_arctic_strings( @pytest.mark.parametrize("lazy", [True, False]) @pytest.mark.parametrize("batch_default", [ArrowOutputStringFormat.SMALL_STRING, None]) def test_read_batch_strings(lmdb_storage, lib_name, lazy, batch_default): - ac = 
lmdb_storage.create_arctic(output_format=OutputFormat.EXPERIMENTAL_ARROW) + ac = lmdb_storage.create_arctic(output_format=OutputFormat.PYARROW) lib = ac.create_library(lib_name) sym_1, sym_2 = "sym_1", "sym_2" df_1 = pd.DataFrame({"col_1": ["a", "a", "bb"], "col_2": ["x", "y", "z"]}) @@ -380,7 +378,7 @@ def test_read_batch_strings(lmdb_storage, lib_name, lazy, batch_default): @pytest.mark.parametrize("default", [None, ArrowOutputStringFormat.SMALL_STRING]) @pytest.mark.parametrize("per_column", [None, ArrowOutputStringFormat.CATEGORICAL]) def test_read_batch_and_join_strings(lmdb_storage, lib_name, default, per_column): - ac = lmdb_storage.create_arctic(output_format=OutputFormat.EXPERIMENTAL_ARROW) + ac = lmdb_storage.create_arctic(output_format=OutputFormat.PYARROW) lib = ac.create_library(lib_name, library_options=LibraryOptions(dynamic_schema=True)) sym_1, sym_2 = "sym_1", "sym_2" df_1 = pd.DataFrame({"col_1": ["a", "a", "bb"], "col_2": ["x", "y", "z"]}) diff --git a/python/tests/unit/arcticdb/version_store/test_arrow_read.py b/python/tests/unit/arcticdb/version_store/test_arrow_read.py index 3be3276752..ea5c47af5f 100644 --- a/python/tests/unit/arcticdb/version_store/test_arrow_read.py +++ b/python/tests/unit/arcticdb/version_store/test_arrow_read.py @@ -45,7 +45,7 @@ def test_basic_with_index(lmdb_version_store_arrow): def test_basic_small_slices(lmdb_version_store_tiny_segment): lib = lmdb_version_store_tiny_segment - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) df = pd.DataFrame({"x": np.arange(10)}) lib.write("arrow", df) table = lib.read("arrow").data @@ -54,7 +54,7 @@ def test_basic_small_slices(lmdb_version_store_tiny_segment): def test_basic_small_slices_with_index(lmdb_version_store_tiny_segment): lib = lmdb_version_store_tiny_segment - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) df = pd.DataFrame({"x": np.arange(10)}, 
index=pd.date_range(pd.Timestamp(0), periods=10)) lib.write("arrow", df) table = lib.read("arrow").data @@ -137,7 +137,7 @@ def test_strings_basic(lmdb_version_store_arrow, dynamic_strings, any_arrow_stri @pytest.mark.parametrize("row_range", [None, (2, 3), (2, 4), (2, 5), (2, 6), (3, 4), (3, 5), (3, 6)]) def test_strings_with_nones_and_nans(lmdb_version_store_tiny_segment, row_range, any_arrow_string_format): lib = lmdb_version_store_tiny_segment - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) lib.set_arrow_string_format_default(any_arrow_string_format) # lmdb_version_store_tiny_segment has 2 rows per segment # This column is constructed so that every 2-element permutation of strings, Nones, and NaNs are tested @@ -214,7 +214,7 @@ def test_strings_multiple_segments_and_columns( lmdb_version_store_tiny_segment, dynamic_strings, any_arrow_string_format ): lib = lmdb_version_store_tiny_segment - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) lib.set_arrow_string_format_default(any_arrow_string_format) df = pd.DataFrame( { @@ -245,7 +245,7 @@ def test_date_range_corner_cases(version_store_factory, date_range_start, date_r lib = version_store_factory( segment_row_size=2, column_group_size=2, dynamic_schema=dynamic_schema, dynamic_strings=True ) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) df = pd.DataFrame( data={ "col1": np.arange(7), @@ -281,7 +281,7 @@ def test_date_range_corner_cases(version_store_factory, date_range_start, date_r ) def test_date_range_between_index_values(lmdb_version_store_tiny_segment): lib = lmdb_version_store_tiny_segment - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) df = pd.DataFrame( data={ "col1": np.arange(2), @@ -316,7 +316,7 @@ def test_date_range_empty_result(version_store_factory, date_range_start, dynami lib = 
version_store_factory( segment_row_size=2, column_group_size=2, dynamic_schema=dynamic_schema, dynamic_strings=True ) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) df = pd.DataFrame( data={ "col1": np.arange(7), @@ -349,7 +349,7 @@ def test_date_range_empty_result(version_store_factory, date_range_start, dynami @pytest.mark.parametrize( "output_format", [ - OutputFormat.EXPERIMENTAL_ARROW, + OutputFormat.PYARROW, pytest.param( OutputFormat.PANDAS, marks=pytest.mark.skipif(IS_PANDAS_ONE, reason="Monday ref: 18013444785"), @@ -473,7 +473,7 @@ def test_row_range_corner_cases(version_store_factory, row_range_start, row_rang lib = version_store_factory( segment_row_size=2, column_group_size=2, dynamic_schema=dynamic_schema, dynamic_strings=True ) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) df = pd.DataFrame( data={ "col1": np.arange(7), @@ -511,7 +511,7 @@ def test_row_range_empty_result(version_store_factory, row_range_start, dynamic_ lib = version_store_factory( segment_row_size=2, column_group_size=2, dynamic_schema=dynamic_schema, dynamic_strings=True ) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) df = pd.DataFrame( data={ "col1": np.arange(7), @@ -541,7 +541,7 @@ def test_row_range_empty_result(version_store_factory, row_range_start, dynamic_ @pytest.mark.parametrize( "output_format", [ - OutputFormat.EXPERIMENTAL_ARROW, + OutputFormat.PYARROW, pytest.param( OutputFormat.PANDAS, marks=pytest.mark.skipif(IS_PANDAS_ONE, reason="Monday ref: 18013444785"), @@ -620,7 +620,7 @@ def test_with_querybuilder(lmdb_version_store_arrow): def test_arrow_layout(lmdb_version_store_tiny_segment): lib = lmdb_version_store_tiny_segment - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) lib_tool = lib.library_tool() num_rows = 100 df = pd.DataFrame( @@ -706,7 
+706,7 @@ def test_arrow_dynamic_schema_missing_columns_numeric(version_store_factory, row if rows_per_column == 100_000 and segment_row_size != 100_000: pytest.skip("Slow to write and doesn't tell us anything the other variants do not") lib = version_store_factory(segment_row_size=segment_row_size, dynamic_schema=True) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) sym = "test_arrow_dynamic_schema_missing_columns_numeric" write_table = pa.table({"col1": pa.array([1] * rows_per_column, pa.int64())}) append_table = pa.table({"col2": pa.array([2] * rows_per_column, pa.int32())}) @@ -814,7 +814,7 @@ def test_arrow_dynamic_schema_missing_columns_hypothesis( @pytest.mark.parametrize("dynamic_schema", [True, False]) def test_arrow_sparse_floats_basic(version_store_factory, type, dynamic_schema): lib = version_store_factory(dynamic_schema=dynamic_schema) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) sym = "test_arrow_sparse_floats_basic" table = pa.table({"col": pa.array([1, None, None, 2, None], type)}) assert table.column("col").null_count == 3 @@ -829,7 +829,7 @@ def test_arrow_sparse_floats_basic(version_store_factory, type, dynamic_schema): @pytest.mark.parametrize("dynamic_schema", [True, False]) def test_arrow_sparse_floats_row_sliced(version_store_factory, type, dynamic_schema): lib = version_store_factory(segment_row_size=2, dynamic_schema=dynamic_schema) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) sym = "test_arrow_sparse_floats_row_sliced" table_0 = pa.table({"col": pa.array([1, None], type)}) table_1 = pa.table({"col": pa.array([2, 3], type)}) @@ -850,7 +850,7 @@ def test_arrow_sparse_floats_row_sliced(version_store_factory, type, dynamic_sch @pytest.mark.parametrize("date_range_width", list(range(0, 15))) def test_arrow_sparse_floats_date_range(version_store_factory, dynamic_schema, 
date_range_start, date_range_width): lib = version_store_factory(segment_row_size=5, dynamic_schema=dynamic_schema) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) sym = "test_arrow_sparse_floats_date_range" table_0 = pa.table({"col": pa.array([1, None, 2, None, 3], pa.float64())}) table_1 = pa.table({"col": pa.array([4, None, None, None, None], pa.float64())}) @@ -874,7 +874,7 @@ def test_arrow_sparse_floats_date_range(version_store_factory, dynamic_schema, d @pytest.mark.parametrize("row_range_width", list(range(0, 15))) def test_arrow_sparse_floats_row_range(version_store_factory, dynamic_schema, row_range_start, row_range_width): lib = version_store_factory(segment_row_size=5, dynamic_schema=dynamic_schema) - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) sym = "test_arrow_sparse_floats_row_range" table_0 = pa.table({"col": pa.array([1, None, 2, None, 3], pa.float64())}) table_1 = pa.table({"col": pa.array([4, None, None, None, None], pa.float64())}) @@ -955,7 +955,7 @@ def test_arrow_dynamic_schema_filtered_column(lmdb_version_store_dynamic_schema_ def test_project_dynamic_schema(lmdb_version_store_dynamic_schema_arrow): lib = lmdb_version_store_dynamic_schema_arrow - lib.set_output_format(OutputFormat.EXPERIMENTAL_ARROW) + lib.set_output_format(OutputFormat.PYARROW) sym = "sym" table_1 = pa.table({"a": pa.array([1, 2])}) table_2 = pa.table({"a": pa.array([3, 4]), "b": pa.array([1, 2])}) @@ -1161,7 +1161,7 @@ def test_arrow_read_batch_with_strings(lmdb_version_store_arrow): def test_polars_basic(lmdb_version_store_arrow): lib = lmdb_version_store_arrow - lib.set_output_format(OutputFormat.EXPERIMENTAL_POLARS) + lib.set_output_format(OutputFormat.POLARS) sym = "polars" df = pd.DataFrame( { @@ -1182,3 +1182,17 @@ def test_polars_basic(lmdb_version_store_arrow): assert result.columns == ["__index__", "col_int", "col_float", "col_bool", "col_str", 
"col_cat"] assert result.dtypes == [pl.Datetime("ns"), pl.Int64, pl.Float32, pl.Boolean, pl.String, pl.Categorical] assert result.equals(expected) + + +def test_polars_unsupported_small_string(lmdb_version_store_arrow): + lib = lmdb_version_store_arrow + lib.set_output_format(OutputFormat.POLARS) + sym = "polars" + df = pd.DataFrame({"col": ["a", "bb"]}) + lib.write(sym, df) + + with pytest.raises(ValueError): + lib.read(sym, arrow_string_format_default=pa.string()) + + with pytest.raises(ValueError): + lib.read(sym, arrow_string_format_per_column={"col": ArrowOutputStringFormat.SMALL_STRING}).data diff --git a/python/tests/unit/arcticdb/version_store/test_arrow_writes.py b/python/tests/unit/arcticdb/version_store/test_arrow_writes.py index c12babe4c0..36232884b6 100644 --- a/python/tests/unit/arcticdb/version_store/test_arrow_writes.py +++ b/python/tests/unit/arcticdb/version_store/test_arrow_writes.py @@ -168,7 +168,7 @@ def test_write_multiple_record_batches_indexed(lmdb_version_store_arrow): @pytest.mark.parametrize("num_cols", [1, 2, 3, 4, 5]) def test_write_sliced(lmdb_version_store_tiny_segment, num_rows, num_cols): lib = lmdb_version_store_tiny_segment - lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() sym = "test_write_sliced" table = pa.table( @@ -200,7 +200,7 @@ def test_write_sliced(lmdb_version_store_tiny_segment, num_rows, num_cols): ) def test_write_bools_sliced(lmdb_version_store_tiny_segment, data): lib = lmdb_version_store_tiny_segment - lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() sym = "test_write_bools_sliced" table = pa.table({"col": pa.array(data)}) @@ -213,7 +213,7 @@ def test_write_bools_sliced(lmdb_version_store_tiny_segment, data): @pytest.mark.parametrize("num_cols", [1, 2, 3, 4, 5]) def test_write_sliced_with_index(lmdb_version_store_tiny_segment, num_rows, num_cols): lib = lmdb_version_store_tiny_segment - 
lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() lib_tool = lib.library_tool() sym = "test_write_sliced_with_index" @@ -253,7 +253,7 @@ def test_write_sliced_with_index(lmdb_version_store_tiny_segment, num_rows, num_ def test_many_record_batches_many_slices(version_store_factory, rows_per_slice): rng = np.random.default_rng() lib = version_store_factory(segment_row_size=rows_per_slice) - lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() sym = "test_many_record_batches_many_slices" record_batch_sizes = [1, 2, 1, 15, 13, 2, 1, 10] @@ -309,7 +309,7 @@ def test_write_view_strings(lmdb_version_store_arrow, type): def test_write_owned_and_non_owned_buffers(lmdb_version_store_tiny_segment): # This test is about our ChunkedBuffer holding mixes of owned and non-owned blocks, not Arrow views lib = lmdb_version_store_tiny_segment - lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() sym = "test_write_owned_and_non_owned_buffers" rb0 = pa.RecordBatch.from_arrays([pa.array([0, 1, 2], pa.int32())], names=["col"]) @@ -628,7 +628,7 @@ def test_update_with_date_range_narrower_than_data(lmdb_version_store_arrow, dat def test_staging_without_sorting(version_store_factory, method): lib = version_store_factory(segment_row_size=2, dynamic_schema=True) lib_tool = lib.library_tool() - lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() sym = "test_staging_without_sorting" table_0 = pa.table( @@ -672,7 +672,7 @@ def test_staging_without_sorting(version_store_factory, method): def test_staging_with_sorting(version_store_factory): lib = version_store_factory(segment_row_size=2, dynamic_schema=True) lib_tool = lib.library_tool() - lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() sym = 
"test_staging_with_sorting" table_0 = pa.table( @@ -706,7 +706,7 @@ def test_staging_with_sorting(version_store_factory): def test_staging_with_sorting_strings(version_store_factory): lib = version_store_factory(segment_row_size=2, dynamic_schema=True) lib_tool = lib.library_tool() - lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() sym = "test_staging_with_sorting_strings" table_0 = pa.table( @@ -865,7 +865,7 @@ def test_arrow_writes_hypothesis( cols_per_slice = num_supported_types // max_col_slices lib.lib_cfg().lib_desc.version.write_options.segment_row_size = rows_per_slice lib.lib_cfg().lib_desc.version.write_options.column_group_size = cols_per_slice - lib.set_output_format("experimental_arrow") + lib.set_output_format("pyarrow") lib._set_allow_arrow_input() naughty_strings = read_big_list_of_naughty_strings() data = {}