Skip to content

Commit e96ec38

Browse files
author
Alex Seaton
committed
Extended testing of how we handle unicode strings
1 parent 710bde1 commit e96ec38

File tree

9 files changed

+268
-169
lines changed

9 files changed

+268
-169
lines changed

python/arcticdb/util/test.py

+18-2
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,16 @@ def assert_frame_equal_rebuild_index_first(expected: pd.DataFrame, actual: pd.Da
244244
assert_frame_equal(left=expected, right=actual)
245245

246246

247+
unicode_symbol = "\u00A0" # no-break space: first code point of the Latin-1 Supplement block
248+
unicode_symbols = "".join([chr(ord(unicode_symbol) + i) for i in range(100)])
249+
250+
247251
def random_string(length: int):
248-
return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
252+
if random.randint(0, 3) == 0:
253+
# Give a unicode string one time in four (randint(0, 3) is inclusive); we have special handling in C++ for unicode
254+
return "".join(random.choice(string.ascii_uppercase + unicode_symbols) for _ in range(length))
255+
else:
256+
return "".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
249257

250258

251259
def get_sample_dataframe(size=1000, seed=0, str_size=10):
@@ -433,7 +441,15 @@ def get_pickle():
433441
)[np.random.randint(0, 2)]
434442

435443

436-
def random_strings_of_length(num, length, unique):
444+
def random_ascii_strings(count, max_length):
445+
result = []
446+
for _ in range(count):
447+
length = random.randrange(max_length + 1)
448+
result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
449+
return result
450+
451+
452+
def random_strings_of_length(num, length, unique=False):
437453
out = []
438454
for i in range(num):
439455
out.append(random_string(length))

python/tests/integration/arcticdb/version_store/test_basic_version_store.py

-41
Original file line numberDiff line numberDiff line change
@@ -459,47 +459,6 @@ def test_prune_previous_versions_append_batch(basic_store):
459459
assert len(lib_tool.find_keys(KeyType.SYMBOL_LIST)) == 4
460460

461461

462-
def test_batch_append_unicode(basic_store):
463-
symbol = "test_append_unicode"
464-
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
465-
466-
df1 = pd.DataFrame(
467-
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
468-
data={"a": ["123", uc]},
469-
)
470-
basic_store.batch_write(symbols=[symbol], data_vector=[df1])
471-
vit = basic_store.batch_read([symbol])[symbol]
472-
assert_equal(vit.data, df1)
473-
474-
df2 = pd.DataFrame(
475-
index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
476-
data={"a": ["123", uc]},
477-
)
478-
basic_store.batch_append(symbols=[symbol], data_vector=[df2])
479-
vit = basic_store.batch_read([symbol])[symbol]
480-
expected = pd.concat([df1, df2])
481-
assert_equal(vit.data, expected)
482-
483-
484-
def test_batch_write_metadata_unicode(basic_store):
485-
symbol = "test_append_unicode"
486-
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
487-
df1 = pd.DataFrame(
488-
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
489-
data={"a": ["123", uc]},
490-
)
491-
492-
basic_store.batch_write(symbols=[symbol], data_vector=[df1])
493-
vit = basic_store.batch_read([symbol])[symbol]
494-
assert_equal(vit.data, df1)
495-
496-
meta = {"a": 1, "b": uc}
497-
basic_store.batch_write_metadata(symbols=[symbol], metadata_vector=[meta])
498-
vits = basic_store.batch_read_metadata([symbol])
499-
metadata = vits[symbol].metadata
500-
assert metadata == meta
501-
502-
503462
def test_deleting_unknown_symbol(basic_store, symbol):
504463
df = sample_dataframe()
505464

python/tests/integration/arcticdb/version_store/test_symbol_list.py

+5-16
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import pytest
1111

1212
from arcticdb.config import Defaults
13-
from arcticdb.util.test import sample_dataframe
13+
from arcticdb.util.test import sample_dataframe, random_ascii_strings
1414
from arcticdb.version_store._store import NativeVersionStore
1515
from arcticdb.toolbox.library_tool import (
1616
VariantKey,
@@ -25,10 +25,9 @@
2525

2626
from multiprocessing import Pool
2727
from arcticdb_ext import set_config_int
28-
import random
29-
import string
3028
from tests.util.mark import MACOS_CONDA_BUILD
3129

30+
3231
@pytest.fixture
3332
def small_max_delta():
3433
set_config_int("SymbolList.MaxDelta", 2)
@@ -278,16 +277,6 @@ def test_lock_contention(small_max_delta, basic_store, mode):
278277
assert lt.find_keys(KeyType.SYMBOL_LIST) != orig_sl
279278

280279

281-
def random_strings(count, max_length):
282-
result = []
283-
for _ in range(count):
284-
length = random.randrange(max_length) + 2
285-
result.append(
286-
"".join(random.choice(string.ascii_letters) for _ in range(length))
287-
)
288-
return result
289-
290-
291280
def _tiny_df(idx):
292281
return pd.DataFrame(
293282
{"x": np.arange(idx % 10, idx % 10 + 10)},
@@ -346,16 +335,16 @@ def test_symbol_list_parallel_stress_with_delete(
346335
num_cycles = 1
347336
symbol_length = 6
348337

349-
pre_existing_symbols = random_strings(num_pre_existing_symbols, symbol_length)
338+
pre_existing_symbols = random_ascii_strings(num_pre_existing_symbols, symbol_length)
350339
for idx, existing in enumerate(pre_existing_symbols):
351340
lib.write(existing, _tiny_df(idx))
352341

353342
if same_symbols:
354-
frozen_symbols = random_strings(num_symbols, symbol_length)
343+
frozen_symbols = random_ascii_strings(num_symbols, symbol_length)
355344
symbols = [frozen_symbols for _ in range(num_workers)]
356345
else:
357346
symbols = [
358-
random_strings(num_symbols, symbol_length) for _ in range(num_workers)
347+
random_ascii_strings(num_symbols, symbol_length) for _ in range(num_workers)
359348
]
360349

361350
with Pool(num_workers) as p:

python/tests/unit/arcticdb/version_store/test_append.py

-22
Original file line numberDiff line numberDiff line change
@@ -32,28 +32,6 @@ def test_append_simple(lmdb_version_store):
3232
assert_frame_equal(vit.data, expected)
3333

3434

35-
def test_append_unicode(lmdb_version_store):
36-
symbol = "test_append_unicode"
37-
uc = "\u0420\u043e\u0441\u0441\u0438\u044f"
38-
39-
df1 = pd.DataFrame(
40-
index=[pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-03")],
41-
data={"a": ["123", uc]},
42-
)
43-
lmdb_version_store.write(symbol, df1)
44-
vit = lmdb_version_store.read(symbol)
45-
assert_frame_equal(vit.data, df1)
46-
47-
df2 = pd.DataFrame(
48-
index=[pd.Timestamp("2018-01-04"), pd.Timestamp("2018-01-05")],
49-
data={"a": ["123", uc]},
50-
)
51-
lmdb_version_store.append(symbol, df2)
52-
vit = lmdb_version_store.read(symbol)
53-
expected = pd.concat([df1, df2])
54-
assert_frame_equal(vit.data, expected)
55-
56-
5735
@pytest.mark.parametrize("empty_types", (True, False))
5836
@pytest.mark.parametrize("dynamic_schema", (True, False))
5937
def test_append_range_index(version_store_factory, empty_types, dynamic_schema):

python/tests/unit/arcticdb/version_store/test_sort.py

+6-17
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,11 @@
22
import numpy as np
33
import arcticdb as adb
44
from arcticdb.util.test import assert_frame_equal
5-
import random
6-
import string
7-
85
from arcticdb_ext.storage import KeyType
96
from arcticdb_ext.version_store import SortedValue
107

8+
from arcticdb.util.test import random_strings_of_length
9+
1110

1211
def test_stage_finalize(arctic_library):
1312
symbol = "AAPL"
@@ -73,16 +72,6 @@ def test_stage_finalize_dynamic(arctic_client, lib_name):
7372
pd.testing.assert_frame_equal(result, expected)
7473

7574

76-
def random_strings(count, max_length):
77-
result = []
78-
for _ in range(count):
79-
length = random.randrange(max_length) + 2
80-
result.append(
81-
"".join(random.choice(string.ascii_letters) for _ in range(length))
82-
)
83-
return result
84-
85-
8675
def test_stage_finalize_strings(arctic_library):
8776
symbol = "AAPL"
8877
sort_cols = ["timestamp", "col1"]
@@ -91,14 +80,14 @@ def test_stage_finalize_strings(arctic_library):
9180
"timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
9281
"col1": np.arange(1, 51),
9382
"col2": [f"a{i:02d}" for i in range(1, 51)],
94-
"col3": random_strings(50, 12)
83+
"col3": random_strings_of_length(50, 12)
9584
}).set_index("timestamp")
9685

9786
df2 = pd.DataFrame({
9887
"timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
9988
"col1": np.arange(51, 101),
10089
"col2": [f"b{i:02d}" for i in range(1, 51)],
101-
"col3": random_strings(50, 12)
90+
"col3": random_strings_of_length(50, 12)
10291
}).set_index("timestamp")
10392

10493
df1_shuffled = df1.sample(frac=1)
@@ -122,15 +111,15 @@ def test_stage_finalize_strings_dynamic(arctic_client, lib_name):
122111
"timestamp": pd.date_range("2023-01-01", periods=25, freq="H").repeat(2),
123112
"col1": np.arange(1, 51),
124113
"col2": [f"a{i:02d}" for i in range(1, 51)],
125-
"col3": random_strings(50, 12)
114+
"col3": random_strings_of_length(50, 12)
126115
}).set_index("timestamp")
127116

128117
df2 = pd.DataFrame({
129118
"timestamp": pd.date_range("2023-01-04", periods=25, freq="H").repeat(2),
130119
"col1": np.arange(51, 101),
131120
"col2": [f"b{i:02d}" for i in range(1, 51)],
132121
"col4": [f"a{i:02d}" for i in range(101, 151)],
133-
"col5": random_strings(50, 12)
122+
"col5": random_strings_of_length(50, 12)
134123
}).set_index("timestamp")
135124

136125
df1_shuffled = df1.sample(frac=1)

python/tests/unit/arcticdb/version_store/test_string_dedup.py

+5-12
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
"""
88
import gc
99
import random
10-
import string
1110
import sys
1211

1312
import numpy as np
@@ -16,13 +15,7 @@
1615

1716
from datetime import datetime as dt
1817

19-
20-
def random_strings(count, max_length):
21-
result = []
22-
for _ in range(count):
23-
length = random.randrange(max_length + 1)
24-
result.append("".join(random.choice(string.ascii_letters) for _ in range(length)))
25-
return result
18+
from arcticdb.util.test import random_ascii_strings
2619

2720

2821
def generate_dataframe(columns, number_of_rows, strings, index_start="2000-1-1"):
@@ -50,7 +43,7 @@ def getsize(df):
5043
def test_string_dedup_basic(lmdb_version_store_tiny_segment):
5144
lib = lmdb_version_store_tiny_segment
5245
symbol = "test_string_dedup_basic"
53-
original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_strings(100, 10))
46+
original_df = generate_dataframe(["col1", "col2", "col3", "col4"], 1000, random_ascii_strings(100, 10))
5447
lib.write(symbol, original_df, dynamic_strings=True)
5548
read_df_with_dedup = lib.read(symbol, optimise_string_memory=True).data
5649
read_df_without_dedup = lib.read(symbol, optimise_string_memory=False).data
@@ -63,7 +56,7 @@ def test_string_dedup_basic(lmdb_version_store_tiny_segment):
6356
def test_string_dedup_dynamic_schema(lmdb_version_store_dynamic_schema):
6457
lib = lmdb_version_store_dynamic_schema
6558
symbol = "test_string_dedup_dynamic_schema"
66-
unique_strings = random_strings(100, 10)
59+
unique_strings = random_ascii_strings(100, 10)
6760
original_df = generate_dataframe(["col1"], 1000, unique_strings, "2000-1-1")
6861
# This will be different to original_df, as the value in each row is chosen at random from the unique string pool
6962
append_df = generate_dataframe(["col1"], 1000, unique_strings, "2010-1-1")
@@ -91,7 +84,7 @@ def test_string_dedup_nans(lmdb_version_store_tiny_segment):
9184
lib = lmdb_version_store_tiny_segment
9285
symbol = "test_string_dedup_nans"
9386
# Throw a nan into the unique string pool
94-
unique_strings = random_strings(9, 10)
87+
unique_strings = random_ascii_strings(9, 10)
9588
unique_strings.append(np.nan)
9689
columns = ["col1", "col2", "col3", "col4"]
9790
original_df = generate_dataframe(columns, 1000, unique_strings)
@@ -141,7 +134,7 @@ def test_string_dedup_performance(lmdb_version_store):
141134

142135
for unique_string in unique_strings:
143136
for string_length in string_lengths:
144-
string_pool = random_strings(unique_string, string_length)
137+
string_pool = random_ascii_strings(unique_string, string_length)
145138
for rows in number_of_rows:
146139
print("Unique strings: {}".format(unique_string))
147140
print("String length: {}".format(string_length))

0 commit comments

Comments
 (0)