Testing for unicode handling across our API #2224

Merged · 1 commit · Mar 13, 2025
20 changes: 18 additions & 2 deletions python/arcticdb/util/test.py
@@ -244,8 +244,16 @@ def assert_frame_equal_rebuild_index_first(expected: pd.DataFrame, actual: pd.Da
assert_frame_equal(left=expected, right=actual)


unicode_symbol = "\u00A0"  # U+00A0, start of the Latin-1 Supplement block
unicode_symbols = "".join([chr(ord(unicode_symbol) + i) for i in range(100)])


def random_string(length: int):
return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
    if random.randint(0, 3) == 0:
        # Give a unicode string one time in four (on average); we have special handling in C++ for unicode
        return "".join(random.choice(string.ascii_uppercase + unicode_symbols) for _ in range(length))
    else:
        return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))


def get_sample_dataframe(size=1000, seed=0, str_size=10):
@@ -433,7 +441,15 @@ def get_pickle():
)[np.random.randint(0, 2)]


def random_strings_of_length(num, length, unique):
def random_ascii_strings(count, max_length):
result = []
for _ in range(count):
length = random.randrange(max_length + 1)
result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
return result
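
One behavioural note: random.randrange(max_length + 1) draws lengths from 0 through max_length inclusive, so the shared helper can return empty strings, whereas two of the three per-test random_strings helpers it replaces (in the symbol list and sort tests, deleted further down) had a minimum length of two. A hypothetical usage sketch:

from arcticdb.util.test import random_ascii_strings

pool = random_ascii_strings(5, 8)  # five strings of ASCII letters
assert all(0 <= len(s) <= 8 for s in pool)  # zero-length strings are possible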


def random_strings_of_length(num, length, unique=False):
out = []
for i in range(num):
out.append(random_string(length))
@@ -458,47 +458,6 @@ def test_prune_previous_versions_append_batch(basic_store):
assert len(lib_tool.find_keys(KeyType.SYMBOL_LIST)) == 4


def test_batch_append_unicode(basic_store):
symbol = "test_append_unicode"
    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"  # "Россия" ("Russia") in Cyrillic

df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)
basic_store.batch_write(symbols=[symbol], data_vector=[df1])
vit = basic_store.batch_read([symbol])[symbol]
assert_equal(vit.data, df1)

df2 = pd.DataFrame(
index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
data={"a": ["123", uc]},
)
basic_store.batch_append(symbols=[symbol], data_vector=[df2])
vit = basic_store.batch_read([symbol])[symbol]
expected = pd.concat([df1, df2])
assert_equal(vit.data, expected)


def test_batch_write_metadata_unicode(basic_store):
symbol = "test_append_unicode"
    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"  # "Россия" ("Russia") in Cyrillic
df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)

basic_store.batch_write(symbols=[symbol], data_vector=[df1])
vit = basic_store.batch_read([symbol])[symbol]
assert_equal(vit.data, df1)

meta = {"a": 1, "b": uc}
basic_store.batch_write_metadata(symbols=[symbol], metadata_vector=[meta])
vits = basic_store.batch_read_metadata([symbol])
metadata = vits[symbol].metadata
assert metadata == meta
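
With the randomized unicode coverage added to random_string above, dataframes from get_sample_dataframe should exercise the same unicode round-trip path these hand-written tests covered — assuming get_sample_dataframe draws its string columns via random_string, as the diff context suggests. A hypothetical spot-check of that premise:

from arcticdb.util.test import get_sample_dataframe

df = get_sample_dataframe(size=1000, seed=0)
# Scan object columns for any string containing a non-ASCII code point
has_unicode = any(
    isinstance(v, str) and any(ord(c) > 0x7F for c in v)
    for col in df.select_dtypes(include=object)
    for v in df[col]
)
print("unicode strings present:", has_unicode)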


def test_deleting_unknown_symbol(basic_store, symbol):
df = sample_dataframe()

@@ -10,7 +10,7 @@
import pytest

from arcticdb.config import Defaults
from arcticdb.util.test import sample_dataframe
from arcticdb.util.test import sample_dataframe, random_ascii_strings
from arcticdb.version_store._store import NativeVersionStore
from arcticdb.toolbox.library_tool import (
VariantKey,
@@ -25,10 +25,9 @@

from multiprocessing import Pool
from arcticdb_ext import set_config_int
import random
import string
from tests.util.mark import MACOS_CONDA_BUILD


@pytest.fixture
def small_max_delta():
set_config_int("SymbolList.MaxDelta", 2)
@@ -278,16 +277,6 @@ def test_lock_contention(small_max_delta, basic_store, mode):
assert lt.find_keys(KeyType.SYMBOL_LIST) != orig_sl


def random_strings(count, max_length):
result = []
for _ in range(count):
length = random.randrange(max_length) + 2
result.append(
"".join(random.choice(string.ascii_letters) for _ in range(length))
)
return result


def _tiny_df(idx):
return pd.DataFrame(
{"x": np.arange(idx % 10, idx % 10 + 10)},
@@ -346,16 +335,16 @@ def test_symbol_list_parallel_stress_with_delete(
num_cycles = 1
symbol_length = 6

pre_existing_symbols = random_strings(num_pre_existing_symbols, symbol_length)
pre_existing_symbols = random_ascii_strings(num_pre_existing_symbols, symbol_length)
for idx, existing in enumerate(pre_existing_symbols):
lib.write(existing, _tiny_df(idx))

if same_symbols:
frozen_symbols = random_strings(num_symbols, symbol_length)
frozen_symbols = random_ascii_strings(num_symbols, symbol_length)
symbols = [frozen_symbols for _ in range(num_workers)]
else:
symbols = [
random_strings(num_symbols, symbol_length) for _ in range(num_workers)
random_ascii_strings(num_symbols, symbol_length) for _ in range(num_workers)
]

with Pool(num_workers) as p:
@@ -361,7 +361,7 @@ def proc_to_examine():
run the test from command line again to assure it runs ok before commit

"""
max_mem_bytes = 295_623_040
max_mem_bytes = 350_000_000

check_process_memory_leaks(proc_to_examine, 20, max_mem_bytes, 80.0)

@@ -408,7 +408,7 @@ def proc_to_examine():
del queries
gc.collect()

max_mem_bytes = 550_623_040
max_mem_bytes = 650_000_000

check_process_memory_leaks(proc_to_examine, 10, max_mem_bytes, 80.0)

@@ -705,7 +705,7 @@ def test_mem_leak_querybuilder_read_batch_memray(library_with_symbol):
mem_query(lib, df, read_batch=True)

@MEMRAY_TESTS_MARK
@pytest.mark.limit_memory("490 MB")
@pytest.mark.limit_memory("600 MB")
@pytest.mark.skipif(MACOS, reason="Mac OS mem usage is harder to predict than on Windows")
def test_mem_limit_querybuilder_read_memray(library_with_symbol):
"""
@@ -719,7 +719,7 @@ def test_mem_limit_querybuilder_read_batch_memray(library_with_symbol):
mem_query(lib, df)

@MEMRAY_TESTS_MARK
@pytest.mark.limit_memory("490 MB")
@pytest.mark.limit_memory("600 MB")
@pytest.mark.skipif(MACOS, reason="Mac OS mem usage is harder to predict than on Windows")
def test_mem_limit_querybuilder_read_batch_memray(library_with_symbol):
"""
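
For context on the relaxed budgets in this file: pytest-memray's limit_memory marker fails a test whose allocations during the run exceed the stated amount, so raising "490 MB" to "600 MB" widens the allowed envelope without changing what the tests do. A minimal standalone sketch, assuming the pytest-memray plugin is installed (not code from this repo):

import pytest

@pytest.mark.limit_memory("10 MB")
def test_stays_under_budget():
    # memray tracks allocations made while the test body runs; exceeding
    # the 10 MB budget declared above would fail the test
    buf = bytearray(1_000_000)  # ~1 MB, comfortably under budget
    assert len(buf) == 1_000_000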
22 changes: 0 additions & 22 deletions python/tests/unit/arcticdb/version_store/test_append.py
@@ -32,28 +32,6 @@ def test_append_simple(lmdb_version_store):
assert_frame_equal(vit.data, expected)


def test_append_unicode(lmdb_version_store):
symbol = "test_append_unicode"
    uc = "\u0420\u043e\u0441\u0441\u0438\u044f"  # "Россия" ("Russia") in Cyrillic

df1 = pd.DataFrame(
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
data={"a": ["123", uc]},
)
lmdb_version_store.write(symbol, df1)
vit = lmdb_version_store.read(symbol)
assert_frame_equal(vit.data, df1)

df2 = pd.DataFrame(
index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
data={"a": ["123", uc]},
)
lmdb_version_store.append(symbol, df2)
vit = lmdb_version_store.read(symbol)
expected = pd.concat([df1, df2])
assert_frame_equal(vit.data, expected)


@pytest.mark.parametrize("empty_types", (True, False))
@pytest.mark.parametrize("dynamic_schema", (True, False))
def test_append_range_index(version_store_factory, empty_types, dynamic_schema):
23 changes: 6 additions & 17 deletions python/tests/unit/arcticdb/version_store/test_sort.py
@@ -2,12 +2,11 @@
import numpy as np
import arcticdb as adb
from arcticdb.util.test import assert_frame_equal
import random
import string

from arcticdb_ext.storage import KeyType
from arcticdb_ext.version_store import SortedValue

from arcticdb.util.test import random_strings_of_length


def test_stage_finalize(arctic_library):
symbol = "AAPL"
@@ -73,16 +72,6 @@ def test_stage_finalize_dynamic(arctic_client, lib_name):
pd.testing.assert_frame_equal(result, expected)


def random_strings(count, max_length):
result = []
for _ in range(count):
length = random.randrange(max_length) + 2
result.append(
"".join(random.choice(string.ascii_letters) for _ in range(length))
)
return result


def test_stage_finalize_strings(arctic_library):
symbol = "AAPL"
sort_cols = ["timestamp", "col1"]
@@ -91,14 +80,14 @@ def test_stage_finalize_strings(arctic_library):
"timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
"col1": np.arange(1, 51),
"col2": [f"a{i:02d}" for i in range(1, 51)],
"col3": random_strings(50, 12)
"col3": random_strings_of_length(50, 12)
}).set_index("timestamp")

df2 = pd.DataFrame({
"timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
"col1": np.arange(51, 101),
"col2": [f"b{i:02d}" for i in range(1, 51)],
"col3": random_strings(50, 12)
"col3": random_strings_of_length(50, 12)
}).set_index("timestamp")

df1_shuffled = df1.sample(frac=1)
@@ -122,15 +111,15 @@ def test_stage_finalize_strings_dynamic(arctic_client, lib_name):
"timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
"col1": np.arange(1, 51),
"col2": [f"a{i:02d}" for i in range(1, 51)],
"col3": random_strings(50, 12)
"col3": random_strings_of_length(50, 12)
}).set_index("timestamp")

df2 = pd.DataFrame({
"timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
"col1": np.arange(51, 101),
"col2": [f"b{i:02d}" for i in range(1, 51)],
"col4": [f"a{i:02d}" for i in range(101, 151)],
"col5": random_strings(50, 12)
"col5": random_strings_of_length(50, 12)
}).set_index("timestamp")

df1_shuffled = df1.sample(frac=1)
17 changes: 5 additions & 12 deletions python/tests/unit/arcticdb/version_store/test_string_dedup.py
@@ -7,7 +7,6 @@
"""
import gc
import random
import string
import sys

import numpy as np
@@ -16,13 +15,7 @@

from datetime import datetime as dt


def random_strings(count, max_length):
result = []
for _ in range(count):
length = random.randrange(max_length + 1)
result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
return result
from arcticdb.util.test import random_ascii_strings


def generate_dataframe(columns, number_of_rows, strings, index_start="2000-1-1"):
@@ -50,7 +43,7 @@ def getsize(df):
def test_string_dedup_basic(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
symbol = "test_string_dedup_basic"
original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_strings(100, 10))
original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_ascii_strings(100, 10))
lib.write(symbol, original_df, dynamic_strings=True)
read_df_with_dedup = lib.read(symbol, optimise_string_memory=True).data
read_df_without_dedup = lib.read(symbol, optimise_string_memory=False).data
@@ -63,7 +56,7 @@ def test_string_dedup_dynamic_schema(lmdb_version_store_dynamic_schema):
def test_string_dedup_dynamic_schema(lmdb_version_store_dynamic_schema):
lib = lmdb_version_store_dynamic_schema
symbol = "test_string_dedup_dynamic_schema"
unique_strings = random_strings(100, 10)
unique_strings = random_ascii_strings(100, 10)
original_df = generate_dataframe(["col1"], 1000, unique_strings, "2000-1-1")
# This will be different to original_df, as the value in each row is chosen at random from the unique string pool
append_df = generate_dataframe(["col1"], 1000, unique_strings, "2010-1-1")
@@ -91,7 +84,7 @@ def test_string_dedup_nans(lmdb_version_store_tiny_segment):
lib = lmdb_version_store_tiny_segment
symbol = "test_string_dedup_nans"
# Throw a nan into the unique string pool
unique_strings = random_strings(9, 10)
unique_strings = random_ascii_strings(9, 10)
unique_strings.append(np.nan)
columns = ["col1", "col2", "col3", "col4"]
original_df = generate_dataframe(columns, 1000, unique_strings)
@@ -141,7 +134,7 @@ def test_string_dedup_performance(lmdb_version_store):

for unique_string in unique_strings:
for string_length in string_lengths:
string_pool = random_strings(unique_string, string_length)
string_pool = random_ascii_strings(unique_string, string_length)
for rows in number_of_rows:
print("Unique strings: {}".format(unique_string))
print("String length: {}".format(string_length))