Testing for unicode handling across our API #2224

Merged
merged 1 commit into from Mar 13, 2025
20 changes: 18 additions & 2 deletions python/arcticdb/util/test.py
@@ -244,8 +244,16 @@ def assert_frame_equal_rebuild_index_first(expected: pd.DataFrame, actual: pd.Da
assert_frame_equal(left=expected, right=actual)


unicode_symbol = "\u00A0" # first code point of the Latin-1 Supplement block
unicode_symbols = "".join([chr(ord(unicode_symbol) + i) for i in range(100)])


def random_string(length: int):
return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
if random.randint(0, 3) == 0:
# (probably) Give a unicode string about one time in four, since we have special handling in C++ for unicode
return "".join(random.choice(string.ascii_uppercase + unicode_symbols) for _ in range(length))
else:
return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))


def get_sample_dataframe(size=1000, seed=0, str_size=10):
@@ -433,7 +441,15 @@ def get_pickle():
)[np.random.randint(0, 2)]


def random_strings_of_length(num, length, unique):
def random_ascii_strings(count, max_length):
result = []
for _ in range(count):
length = random.randrange(max_length + 1)
result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
return result


def random_strings_of_length(num, length, unique=False):
out = []
for i in range(num):
out.append(random_string(length))
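For context, a standalone sketch of how the two generators above now differ: random_string mixes in unicode roughly a quarter of the time (randint(0, 3) draws uniformly from four values), while random_ascii_strings stays ASCII-only for tests that need plain symbol names. Constants are inlined from the diff; the frequency check at the end is illustrative only.

import random
import string

unicode_symbols = "".join(chr(0x00A0 + i) for i in range(100))  # Latin-1 Supplement range, as above

def random_string(length: int) -> str:
    # randint(0, 3) == 0 hits one of four equally likely values,
    # so about a quarter of generated strings draw from the unicode pool
    if random.randint(0, 3) == 0:
        return "".join(random.choice(string.ascii_uppercase + unicode_symbols) for _ in range(length))
    return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))

samples = [random_string(10) for _ in range(10_000)]
frac = sum(any(ord(c) > 127 for c in s) for s in samples) / len(samples)
print(round(frac, 2))  # ~0.25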
(diff for another file; file name not shown in this capture)
@@ -458,47 +458,6 @@ def test_prune_previous_versions_append_batch(basic_store):
assert len(lib_tool.find_keys(KeyType.SYMBOL_LIST)) == 4


def test_batch_append_unicode(basic_store):
symbol = "test_append_unicode"
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"

df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)
basic_store.batch_write(symbols=[symbol], data_vector=[df1])
vit = basic_store.batch_read([symbol])[symbol]
assert_equal(vit.data, df1)

df2 = pd.DataFrame(
index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
data={"a": ["123", uc]},
)
basic_store.batch_append(symbols=[symbol], data_vector=[df2])
vit = basic_store.batch_read([symbol])[symbol]
expected = pd.concat([df1, df2])
assert_equal(vit.data, expected)


def test_batch_write_metadata_unicode(basic_store):
symbol = "test_append_unicode"
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)

basic_store.batch_write(symbols=[symbol], data_vector=[df1])
vit = basic_store.batch_read([symbol])[symbol]
assert_equal(vit.data, df1)

meta = {"a": 1, "b": uc}
basic_store.batch_write_metadata(symbols=[symbol], metadata_vector=[meta])
vits = basic_store.batch_read_metadata([symbol])
metadata = vits[symbol].metadata
assert metadata == meta


def test_deleting_unknown_symbol(basic_store, symbol):
df = sample_dataframe()

(diff for another file; file name not shown in this capture)
@@ -10,7 +10,7 @@
import pytest

from arcticdb.config import Defaults
from arcticdb.util.test import sample_dataframe
from arcticdb.util.test import sample_dataframe, random_ascii_strings
from arcticdb.version_store._store import NativeVersionStore
from arcticdb.toolbox.library_tool import (
VariantKey,
@@ -25,10 +25,9 @@

from multiprocessing import Pool
from arcticdb_ext import set_config_int
import random
import string
from tests.util.mark import MACOS_CONDA_BUILD


@pytest.fixture
def small_max_delta():
set_config_int("SymbolList.MaxDelta", 2)
@@ -278,16 +277,6 @@ def test_lock_contention(small_max_delta, basic_store, mode):
assert lt.find_keys(KeyType.SYMBOL_LIST) != orig_sl


def random_strings(count, max_length):
result = []
for _ in range(count):
length = random.randrange(max_length) + 2
result.append(
"".join(random.choice(string.ascii_letters) for _ in range(length))
)
return result


def _tiny_df(idx):
return pd.DataFrame(
{"x": np.arange(idx % 10, idx % 10 + 10)},
@@ -346,16 +335,16 @@ def test_symbol_list_parallel_stress_with_delete(
num_cycles = 1
symbol_length = 6

pre_existing_symbols = random_strings(num_pre_existing_symbols, symbol_length)
pre_existing_symbols = random_ascii_strings(num_pre_existing_symbols, symbol_length)
for idx, existing in enumerate(pre_existing_symbols):
lib.write(existing, _tiny_df(idx))

if same_symbols:
frozen_symbols = random_strings(num_symbols, symbol_length)
frozen_symbols = random_ascii_strings(num_symbols, symbol_length)
symbols = [frozen_symbols for _ in range(num_workers)]
else:
symbols = [
random_strings(num_symbols, symbol_length) for _ in range(num_workers)
random_ascii_strings(num_symbols, symbol_length) for _ in range(num_workers)
]

with Pool(num_workers) as p:
(diff for another file; file name not shown in this capture)
@@ -361,7 +361,7 @@ def proc_to_examine():
run the test from the command line again to ensure it runs ok before commit
"""
max_mem_bytes = 295_623_040
max_mem_bytes = 350_000_000

check_process_memory_leaks(proc_to_examine, 20, max_mem_bytes, 80.0)

@@ -408,7 +408,7 @@ def proc_to_examine():
del queries
gc.collect()

max_mem_bytes = 550_623_040
max_mem_bytes = 650_000_000

check_process_memory_leaks(proc_to_examine, 10, max_mem_bytes, 80.0)

@@ -705,7 +705,7 @@ def test_mem_leak_querybuilder_read_batch_memray(library_with_symbol):
mem_query(lib, df, read_batch=True)

@MEMRAY_TESTS_MARK
@pytest.mark.limit_memory("490 MB")
@pytest.mark.limit_memory("600 MB")
@pytest.mark.skipif(MACOS, reason="Mac OS mem usage is harder to predict than Windows")
def test_mem_limit_querybuilder_read_memray(library_with_symbol):
"""
@@ -719,7 +719,7 @@ def test_mem_limit_querybuilder_read_memray(library_with_symbol):
mem_query(lib, df)

@MEMRAY_TESTS_MARK
@pytest.mark.limit_memory("490 MB")
@pytest.mark.limit_memory("600 MB")
@pytest.mark.skipif(MACOS, reason="Mac OS mem usage is harder to predict than Windows")
def test_mem_limit_querybuilder_read_batch_memray(library_with_symbol):
"""
22 changes: 0 additions & 22 deletions python/tests/unit/arcticdb/version_store/test_append.py
@@ -32,28 +32,6 @@ def test_append_simple(lmdb_version_store):
assert_frame_equal(vit.data, expected)


def test_append_unicode(lmdb_version_store):
symbol = "test_append_unicode"
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"

df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)
lmdb_version_store.write(symbol, df1)
vit = lmdb_version_store.read(symbol)
assert_frame_equal(vit.data, df1)

df2 = pd.DataFrame(
index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
data={"a": ["123", uc]},
)
lmdb_version_store.append(symbol, df2)
vit = lmdb_version_store.read(symbol)
expected = pd.concat([df1, df2])
assert_frame_equal(vit.data, expected)


@pytest.mark.parametrize("empty_types", (True, False))
@pytest.mark.parametrize("dynamic_schema", (True, False))
def test_append_range_index(version_store_factory, empty_types, dynamic_schema):
23 changes: 6 additions & 17 deletions python/tests/unit/arcticdb/version_store/test_sort.py
@@ -2,12 +2,11 @@
import numpy as np
import arcticdb as adb
from arcticdb.util.test import assert_frame_equal
import random
import string

from arcticdb_ext.storage import KeyType
from arcticdb_ext.version_store import SortedValue

from arcticdb.util.test import random_strings_of_length


def test_stage_finalize(arctic_library):
symbol = "AAPL"
@@ -73,16 +72,6 @@ def test_stage_finalize_dynamic(arctic_client, lib_name):
pd.testing.assert_frame_equal(result, expected)


def random_strings(count, max_length):
result = []
for _ in range(count):
length = random.randrange(max_length) + 2
result.append(
"".join(random.choice(string.ascii_letters) for _ in range(length))
)
return result


def test_stage_finalize_strings(arctic_library):
symbol = "AAPL"
sort_cols = ["timestamp", "col1"]
@@ -91,14 +80,14 @@ def test_stage_finalize_strings(arctic_library):
"timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
"col1": np.arange(1, 51),
"col2": [f"a{i:02d}" for i in range(1, 51)],
"col3": random_strings(50, 12)
"col3": random_strings_of_length(50, 12)
}).set_index("timestamp")

df2 = pd.DataFrame({
"timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
"col1": np.arange(51, 101),
"col2": [f"b{i:02d}" for i in range(1, 51)],
"col3": random_strings(50, 12)
"col3": random_strings_of_length(50, 12)
}).set_index("timestamp")

df1_shuffled = df1.sample(frac=1)
@@ -122,15 +111,15 @@ def test_stage_finalize_strings_dynamic(arctic_client, lib_name):
"timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
"col1": np.arange(1, 51),
"col2": [f"a{i:02d}" for i in range(1, 51)],
"col3": random_strings(50, 12)
"col3": random_strings_of_length(50, 12)
}).set_index("timestamp")

df2 = pd.DataFrame({
"timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
"col1": np.arange(51, 101),
"col2": [f"b{i:02d}" for i in range(1, 51)],
"col4": [f"a{i:02d}" for i in range(101, 151)],
"col5": random_strings(50, 12)
"col5": random_strings_of_length(50, 12)
}).set_index("timestamp")

df1_shuffled = df1.sample(frac=1)
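With the local helpers removed, the sort tests rely on the shared utilities; a brief usage sketch with the signatures as shown in this diff (note that random_strings_of_length produces strings of exactly the given length, and may now include unicode via random_string):

from arcticdb.util.test import random_ascii_strings, random_strings_of_length

col3 = random_strings_of_length(50, 12)  # 50 strings, each exactly 12 characters; ~1 in 4 contains unicode
pool = random_ascii_strings(100, 10)     # 100 ASCII-only strings with lengths from 0 to 10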
17 changes: 5 additions & 12 deletions python/tests/unit/arcticdb/version_store/test_string_dedup.py
@@ -7,7 +7,6 @@
"""
import gc
import random
import string
import sys

import numpy as np
@@ -16,13 +15,7 @@

from datetime import datetime as dt


def random_strings(count, max_length):
result = []
for _ in range(count):
length = random.randrange(max_length + 1)
result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
return result
from arcticdb.util.test import random_ascii_strings


def generate_dataframe(columns, number_of_rows, strings, index_start="2000-1-1"):
@@ -50,7 +43,7 @@ def getsize(df):
def test_string_dedup_basic(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
symbol = "test_string_dedup_basic"
original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_strings(100, 10))
original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_ascii_strings(100, 10))
lib.write(symbol, original_df, dynamic_strings=True)
read_df_with_dedup = lib.read(symbol, optimise_string_memory=True).data
read_df_without_dedup = lib.read(symbol, optimise_string_memory=False).data
@@ -63,7 +56,7 @@ def test_string_dedup_basic(lmdb_version_store_tiny_segment):
def test_string_dedup_dynamic_schema(lmdb_version_store_dynamic_schema):
lib = lmdb_version_store_dynamic_schema
symbol = "test_string_dedup_dynamic_schema"
unique_strings = random_strings(100, 10)
unique_strings = random_ascii_strings(100, 10)
original_df = generate_dataframe(["col1"], 1000, unique_strings, "2000-1-1")
# This will be different to original_df, as the value in each row is chosen at random from the unique string pool
append_df = generate_dataframe(["col1"], 1000, unique_strings, "2010-1-1")
@@ -91,7 +84,7 @@ def test_string_dedup_nans(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
symbol = "test_string_dedup_nans"
# Throw a nan into the unique string pool
unique_strings = random_strings(9, 10)
unique_strings = random_ascii_strings(9, 10)
unique_strings.append(np.nan)
columns = ["col1", "col2", "col3", "col4"]
original_df = generate_dataframe(columns, 1000, unique_strings)
@@ -141,7 +134,7 @@ def test_string_dedup_performance(lmdb_version_store):

for unique_string in unique_strings:
for string_length in string_lengths:
string_pool = random_strings(unique_string, string_length)
string_pool = random_ascii_strings(unique_string, string_length)
for rows in number_of_rows:
print("Unique strings: {}".format(unique_string))
print("String length: {}".format(string_length))
343 changes: 343 additions & 0 deletions python/tests/unit/arcticdb/version_store/test_unicode.py
@@ -0,0 +1,343 @@
"""
Copyright 2025 Man Group Operations Limited
Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
We have special handling in the codebase when working with unicode Python strings, since we need to take the GIL
to handle them. This file checks that our APIs work even when passed unicode strings."""
import datetime

import numpy as np
import pandas as pd
import pytest

from arcticdb.util.test import assert_frame_equal, random_strings_of_length

from arcticdb.version_store.library import Library
from arcticdb.version_store.library import UpdatePayload
from arcticdb_ext.storage import NoDataFoundException

unicode_str = "\u0420\u043e\u0441\u0441\u0438\u044f"
copyright = "My Thing Not Your's \u00A9"
trademark = "My Word Not Your's \u2122"
metadata = {copyright: trademark}
symbol = "sym"


def unicode_strs_df(start_date: pd.Timestamp, num_rows: int) -> pd.DataFrame:
index = [start_date + datetime.timedelta(days=i) for i in range(num_rows)]
df = pd.DataFrame(
index=index,
data={"a": random_strings_of_length(num_rows, 10), trademark: np.arange(num_rows), copyright: [unicode_str] * num_rows},
)
return df


@pytest.mark.parametrize("parallel", (True, False))
@pytest.mark.parametrize("multi_index", (True, False))
def test_write(lmdb_version_store_tiny_segment, parallel, multi_index):
lib = lmdb_version_store_tiny_segment
start = pd.Timestamp("2018-01-02")
num_rows = 100
if multi_index:
index = pd.MultiIndex.from_arrays([[start + datetime.timedelta(days=i) for i in range(num_rows)], [unicode_str] * num_rows])
else:
index = pd.date_range(start=start, periods=num_rows)

df = pd.DataFrame(
index=index,
data={"a": random_strings_of_length(num_rows, 10), trademark: np.arange(num_rows), copyright: [unicode_str] * num_rows},
)

if parallel:
lib.write(symbol, df, parallel=True)
lib.compact_incomplete(symbol, append=False, convert_int_to_float=False, metadata=metadata)
else:
lib.write(symbol, df, metadata=metadata)

lib.create_column_stats(symbol, column_stats={trademark: {"MINMAX"}})
vit = lib.read(symbol)
assert_frame_equal(vit.data, df)
assert vit.metadata == metadata


def test_write_metadata(lmdb_version_store):
lmdb_version_store.write("sym", [1, 2, 3], metadata=metadata)
assert lmdb_version_store.read("sym").metadata == metadata


def test_batch_write_metadata(lmdb_version_store):
syms = [f"sym_{i}" for i in range(100)]
metadata_vector = [metadata] * 100
lmdb_version_store.batch_write_metadata(symbols=syms, metadata_vector=metadata_vector)

for s in syms:
assert lmdb_version_store.read(s).metadata == metadata


def test_batch_append(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
start = pd.Timestamp("2018-01-02")
df1 = unicode_strs_df(start, 100)
lib.batch_write(symbols=[symbol], data_vector=[df1])
vit = lib.batch_read([symbol])[symbol]
assert_frame_equal(vit.data, df1)

df2 = unicode_strs_df(start + datetime.timedelta(days=100), 100)
lib.batch_append(symbols=[symbol], data_vector=[df2])
vit = lib.batch_read([symbol])[symbol]
expected = pd.concat([df1, df2])
assert_frame_equal(vit.data, expected)


def test_batch_write_with_metadata(lmdb_version_store):
df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", unicode_str]},
)

lmdb_version_store.batch_write(symbols=[symbol], data_vector=[df1])
vit = lmdb_version_store.batch_read([symbol])[symbol]
assert_frame_equal(vit.data, df1)

meta = {"a": 1, "b": unicode_str}
lmdb_version_store.batch_write_metadata(symbols=[symbol], metadata_vector=[meta])
vits = lmdb_version_store.batch_read_metadata([symbol])
metadata = vits[symbol].metadata
assert metadata == meta


def test_append(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
start = pd.Timestamp("2018-01-02")
df1 = unicode_strs_df(start, 100)
lib.write(symbol, df1)
vit = lib.read(symbol)
assert_frame_equal(vit.data, df1)

df2 = unicode_strs_df(start + datetime.timedelta(days=100), 100)
lib.append(symbol, df2)
vit = lib.read(symbol)
expected = pd.concat([df1, df2])
assert_frame_equal(vit.data, expected)


@pytest.mark.parametrize("api_method", ("write", "append"))
def test_staged_append(lmdb_version_store_tiny_segment, api_method):
lib = lmdb_version_store_tiny_segment
df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={copyright: ["123", unicode_str]},
)
lib.write(symbol, df1)
vit = lib.read(symbol)
assert_frame_equal(vit.data, df1)

expected = [df1]
for i in range(20):
start = pd.Timestamp("2018-01-02") + datetime.timedelta(days=(i + 1) * 2)
index = pd.date_range(start=start, periods=2)
df = pd.DataFrame(
index=index,
data={copyright: ["123", unicode_str]},
)
if api_method == "write":
lib.write(symbol, df, parallel=True)
elif api_method == "append":
lib.append(symbol, df, incomplete=True)
else:
raise RuntimeError("Unexpected api_method")
expected.append(df)

lib.compact_incomplete(symbol, append=True, convert_int_to_float=False)
vit = lib.read(symbol)
expected = pd.concat(expected)
assert_frame_equal(vit.data, expected)


def test_update(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
start = pd.Timestamp("2018-01-02")
df1 = unicode_strs_df(start, 100)
lib.update(symbol, df1, upsert=True)
vit = lib.read(symbol)
assert_frame_equal(vit.data, df1)

df2 = unicode_strs_df(start + datetime.timedelta(days=100), 100)
lib.update(symbol, df2)
vit = lib.read(symbol)
expected = pd.concat([df1, df2])
assert_frame_equal(vit.data, expected)

df1_new = unicode_strs_df(start + datetime.timedelta(days=1), 100)
lib.update(symbol, df1_new)
vit = lib.read(symbol)
expected = pd.concat([df1, df2])
expected.update(df1_new)
assert_frame_equal(vit.data, expected, check_dtype=False) # disable check_dtype to pass with older Pandas versions


def test_batch_update(lmdb_version_store):
lib = lmdb_version_store
adb_lib = Library("desc", lib)
sym_1 = "sym_1"
sym_2 = "sym_2"

df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={copyright: ["123", unicode_str]},
)
lmdb_version_store.write(sym_1, df1)
lmdb_version_store.write(sym_2, df1)

df2 = pd.DataFrame(
index=[pd.Timestamp("2018-01-03"), pd.Timestamp("2018-01-05")],
data={copyright: ["456", trademark]},
)
df3 = pd.DataFrame(
index=[pd.Timestamp("2018-02-04"), pd.Timestamp("2018-02-05")],
data={copyright: ["789", trademark]},
)

update_payloads = [UpdatePayload(sym_1, df2), UpdatePayload(sym_2, df3)]
adb_lib.update_batch(update_payloads)

vit = adb_lib.read(sym_1)
expected = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03"), pd.Timestamp("2018-01-05")],
data={copyright: ["123", "456", trademark]}
)
assert_frame_equal(vit.data, expected)

vit = adb_lib.read(sym_2)
expected = pd.concat([df1, df3])
assert_frame_equal(vit.data, expected)


def test_snapshots(lmdb_version_store):
"""We should probably validate against snapshots with unicode names like we do for symbols, but these tests check
the status quo.
Monday: 8667974441 tracks adding that validation.
"""
start = pd.Timestamp("2018-01-02")
index = pd.date_range(start=start, periods=4)

df = pd.DataFrame(
index=index,
data={"a": ["123", unicode_str, copyright, trademark], trademark: [1, 2, 3, 4], copyright: [unicode_str] * 4},
)

# Test snapshots with unicode names
lmdb_version_store.write(symbol, df, metadata=metadata)
lmdb_version_store.snapshot(copyright)
lmdb_version_store.snapshot(unicode_str, metadata=metadata)
lmdb_version_store.write(symbol, [1, 2, 3])

vit = lmdb_version_store.read(symbol, as_of=copyright)
assert_frame_equal(vit.data, df)

snapshots = lmdb_version_store.list_snapshots()
assert snapshots == {copyright: None, unicode_str: metadata}

# Test deleting a snapshot with unicode name
lmdb_version_store.delete_snapshot(copyright)
snapshots = lmdb_version_store.list_snapshots()
assert snapshots == {unicode_str: metadata}
with pytest.raises(NoDataFoundException):
lmdb_version_store.read(symbol, as_of=copyright)
vit = lmdb_version_store.read(symbol, as_of=unicode_str)
assert_frame_equal(vit.data, df)

# Test adding to a snapshot with unicode name
lmdb_version_store.write("new_sym", df, metadata=metadata)
lmdb_version_store.add_to_snapshot(unicode_str, ["new_sym"])
lmdb_version_store.delete("new_sym")
vit = lmdb_version_store.read("new_sym", as_of=unicode_str)
assert_frame_equal(vit.data, df)

# Test list_versions
vers = lmdb_version_store.list_versions("new_sym")
assert len(vers) == 1
assert vers[0]["snapshots"] == [unicode_str]

# Test removing from a snapshot with unicode name
lmdb_version_store.remove_from_snapshot(unicode_str, ["new_sym"], versions=[0])
assert lmdb_version_store.list_versions("new_sym") == []
with pytest.raises(NoDataFoundException):
lmdb_version_store.read("new_sym", as_of=copyright)


@pytest.mark.parametrize("batch", (True, False))
def test_get_info(lmdb_version_store, batch):
start = pd.Timestamp("2018-01-02")
index = pd.date_range(start=start, periods=4)
unicode_str = "ab"

df_1 = pd.DataFrame(
index=index,
data={"a": ["123", unicode_str, copyright, trademark], trademark: [1, 2, 3, 4], copyright: [unicode_str] * 4},
)
df_1.index = df_1.index.set_names([unicode_str])  # set_names returns a new index, so assign it back

df_2 = pd.DataFrame(
index=index,
data={unicode_str: [1, 2, 3, 4], trademark: [1, 2, 3, 4], copyright: [unicode_str] * 4},
)
df_2.index = df_2.index.set_names([unicode_str])
lmdb_version_store.write("sym_1", df_1, metadata=metadata)
lmdb_version_store.write("sym_2", df_2, metadata=metadata)

if batch:
res = lmdb_version_store.batch_get_info(symbols=["sym_1", "sym_2"])
assert len(res) == 2
assert list(df_1.columns) == res[0]["col_names"]["columns"]
assert list(df_2.columns) == res[1]["col_names"]["columns"]
else:
for sym, df in [("sym_1", df_1), ("sym_2", df_2)]:
res = lmdb_version_store.get_info(sym)
assert list(df.columns) == res["col_names"]["columns"]
# assert res["col_names"]["index"] == [unicode_str] # index names are not exposed by get_info, seems to be a bug 8667920777


def sample_nested_structures():
return [
{"a": ["abc", "def", copyright, trademark, unicode_str], "b": random_strings_of_length(num=8, length=5, unique=False)},
(random_strings_of_length(num=10, length=6, unique=True), random_strings_of_length(num=10, length=9, unique=True)),
]


@pytest.mark.parametrize("batch_read", (True, False))
def test_recursively_written_data_with_metadata(lmdb_version_store, batch_read):
samples = sample_nested_structures()

for idx, sample in enumerate(samples):
sym = "sym_recursive" + str(idx)
metadata = {unicode_str: 1}
lmdb_version_store.write(sym, sample, metadata=metadata, recursive_normalizers=True)
if batch_read:
vit = lmdb_version_store.batch_read([sym])[sym]
else:
vit = lmdb_version_store.read(sym)
assert sample == vit.data
assert vit.symbol == sym
assert vit.metadata == metadata


def test_recursively_written_data_with_metadata_batch_write(lmdb_version_store):
samples = sample_nested_structures()
syms = [f"sym_{i}" for i in range(len(samples))]
metadata = [{unicode_str: i} for i in range(len(samples))]

lmdb_version_store.batch_write(symbols=syms, data_vector=samples, metadata_vector=metadata)

res = lmdb_version_store.batch_read(syms)
assert len(res) == len(syms)

for i, sym in enumerate(syms):
assert sym in res
assert res[sym].symbol == sym
assert res[sym].data == samples[i]
assert res[sym].metadata == metadata[i]
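A footnote on the GIL point in this file's docstring: in CPython, ASCII-only strings can be detected and handled on a fast path, while general unicode needs encoding work that, in an extension like ArcticDB's C++ layer, means acquiring the GIL. The exact C++ mechanism is not shown in this diff, so treat this as background; the distinction is visible from pure Python:

unicode_str = "\u0420\u043e\u0441\u0441\u0438\u044f"  # "Россия", as used throughout these tests

print("test_append".isascii())  # True: eligible for an ASCII fast path
print(unicode_str.isascii())    # False: takes the general unicode path
print(len(unicode_str), len(unicode_str.encode("utf-8")))  # 6 characters, 12 UTF-8 bytes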
32 changes: 0 additions & 32 deletions python/tests/unit/arcticdb/version_store/test_update.py
@@ -69,38 +69,6 @@ def test_update(version_store_factory):
assert_frame_equal(vit.data, df)


def test_update_unicode(lmdb_version_store):
symbol = "test_append_unicode"
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"

df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)
lmdb_version_store.update(symbol, df1, upsert=True)
vit = lmdb_version_store.read(symbol)
assert_frame_equal(vit.data, df1)

df2 = pd.DataFrame(
index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
data={"a": ["123", uc]},
)
lmdb_version_store.update(symbol, df2)
vit = lmdb_version_store.read(symbol)
expected = pd.concat([df1, df2])
assert_frame_equal(vit.data, expected)

uc_new = "\u0420\u043e\u0441\u0441\u0438\u044f_new"
df1_new = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc_new]},
)
lmdb_version_store.update(symbol, df1_new)
vit = lmdb_version_store.read(symbol)
expected = pd.concat([df1_new, df2])
assert_frame_equal(vit.data, expected)


def test_update_long_strides(s3_version_store):
lib = s3_version_store
symbol = "test_update_long_strides"
26 changes: 0 additions & 26 deletions python/tests/unit/arcticdb/version_store/test_write.py
@@ -158,29 +158,3 @@ def test_write_only_nan_column(self, lmdb_version_store, dtype):
lib.write(sym, pd.DataFrame({"a": [np.nan]}, dtype=dtype))
data = lib.read(sym).data
assert_frame_equal(data, pd.DataFrame({"a": [np.nan]}, dtype=dtype))

def test_write_unicode(lmdb_version_store):
symbol = "test_write_unicode"
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"

df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)
lmdb_version_store.write(symbol, df1)
vit = lmdb_version_store.read(symbol)
assert_frame_equal(vit.data, df1)


def test_write_parallel_unicode(lmdb_version_store):
symbol = "test_write_parallel_unicode"
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"

df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)
lmdb_version_store.write(symbol, df1, parallel=True)
lmdb_version_store.compact_incomplete(symbol, append=False, convert_int_to_float=False)
vit = lmdb_version_store.read(symbol)
assert_frame_equal(vit.data, df1)
