Skip to content

Commit

Permalink
Break: New testing suite
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Jan 4, 2024
1 parent 3e3a1fb commit c9aad69
Show file tree
Hide file tree
Showing 23 changed files with 1,110 additions and 895 deletions.
29 changes: 23 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ endif()

# Function to set compiler-specific flags
function(set_compiler_flags target)
target_include_directories(${target} PRIVATE scripts)
target_link_libraries(${target} PRIVATE ${STRINGZILLA_TARGET_NAME})
set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR})
Expand All @@ -104,14 +105,30 @@ function(set_compiler_flags target)
endfunction()

if(${STRINGZILLA_BUILD_BENCHMARK})
add_executable(stringzilla_search_bench scripts/search_bench.cpp)
set_compiler_flags(stringzilla_search_bench)
add_test(NAME stringzilla_search_bench COMMAND stringzilla_search_bench)
add_executable(stringzilla_bench_search scripts/bench_search.cpp)
set_compiler_flags(stringzilla_bench_search)
add_test(NAME stringzilla_bench_search COMMAND stringzilla_bench_search)

add_executable(stringzilla_bench_similarity scripts/bench_similarity.cpp)
set_compiler_flags(stringzilla_bench_similarity)
add_test(NAME stringzilla_bench_similarity COMMAND stringzilla_bench_similarity)

add_executable(stringzilla_bench_sort scripts/bench_sort.cpp)
set_compiler_flags(stringzilla_bench_sort)
add_test(NAME stringzilla_bench_sort COMMAND stringzilla_bench_sort)

add_executable(stringzilla_bench_token scripts/bench_token.cpp)
set_compiler_flags(stringzilla_bench_token)
add_test(NAME stringzilla_bench_token COMMAND stringzilla_bench_token)

add_executable(stringzilla_bench_container scripts/bench_container.cpp)
set_compiler_flags(stringzilla_bench_container)
add_test(NAME stringzilla_bench_container COMMAND stringzilla_bench_container)
endif()

if(${STRINGZILLA_BUILD_TEST})
# Test target
add_executable(stringzilla_search_test scripts/search_test.cpp)
set_compiler_flags(stringzilla_search_test)
add_test(NAME stringzilla_search_test COMMAND stringzilla_search_test)
add_executable(stringzilla_test scripts/test.cpp)
set_compiler_flags(stringzilla_test)
add_test(NAME stringzilla_test COMMAND stringzilla_test)
endif()
58 changes: 39 additions & 19 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,36 @@ The project is split into the following parts:

- `include/stringzilla/stringzilla.h` - single-header C implementation.
- `include/stringzilla/stringzilla.hpp` - single-header C++ wrapper.
- `python/**` - Python bindings.
- `javascript/**` - JavaScript bindings.
- `scripts/**` - Scripts for benchmarking and testing.
- `python/*` - Python bindings.
- `javascript/*` - JavaScript bindings.
- `scripts/*` - Scripts for benchmarking and testing.

The scripts name convention is as follows: `<workload>_<nature>.<language>`.
An example would be, `search_bench.cpp` or `similarity_fuzz.py`.
The nature of the script can be:
For minimal test coverage, check the following scripts:

- `bench` - bounded in time benchmarking, generally on user-provided data.
- `fuzz` - unbounded in time fuzzing, generally on randomly generated data.
- `test` - unit tests.
- `test.cpp` - tests C++ API (not underlying C) against STL.
- `test.py` - tests Python API against native strings.
- `test.js`.

At the C++ level all benchmarks also validate the results against the STL baseline, serving as tests on real-world data.
They have the broadest coverage of the library, and are the most important to keep up-to-date:

- `bench_token.cpp` - token-level ops, like hashing, ordering, equality checks.
- `bench_search.cpp` - bidirectional substring search, both exact and fuzzy.
- `bench_similarity.cpp` - benchmark all edit distance backends.
- `bench_sort.cpp` - sorting, partitioning, merging.
- `bench_container.cpp` - STL containers with different string keys.

The role of Python benchmarks is less to provide absolute numbers, and more to compare against popular tools in the Python ecosystem.

- `bench_search.py` - compares against native Python `str`.
- `bench_sort.py` - compares against `pandas`.
- `bench_similarity.py` - compares against `jellyfish`, `editdistance`, etc.

For presentation purposes, we also

## IDE Integrations

The project is developed in VS Code, and comes with debugger launchers in `.vscode/launch.json`.

## Contributing in C++ and C

Expand All @@ -40,16 +59,16 @@ Using modern syntax, this is how you build and run the test suite:
```bash
cmake -DSTRINGZILLA_BUILD_TEST=1 -B ./build_debug
cmake --build ./build_debug --config Debug # Which will produce the following targets:
./build_debug/search_test # Unit test for substring search
./build_debug/stringzilla_test # Unit test for the entire library
```

For benchmarks, you can use the following commands:

```bash
cmake -DSTRINGZILLA_BUILD_BENCHMARK=1 -B ./build_release
cmake --build ./build_release --config Release # Which will produce the following targets:
./build_release/search_bench # Benchmark for substring search
./build_release/sort_bench # Benchmark for sorting arrays of strings
./build_release/stringzilla_bench_search # Benchmark for substring search
./build_release/stringzilla_bench_sort # Benchmark for sorting arrays of strings
```

Running on modern hardware, you may want to compile the code for older generations to compare the relative performance.
Expand All @@ -67,9 +86,9 @@ cmake -DCMAKE_BUILD_TYPE=Release -DSTRINGZILLA_BUILD_BENCHMARK=1 \
-DCMAKE_CXX_FLAGS="-march=sapphirerapids" -DCMAKE_C_FLAGS="-march=sapphirerapids" \
-B ./build_release/sapphirerapids && cmake --build build_release/sapphirerapids --config Release

./build_release/sandybridge/stringzilla_search_bench
./build_release/haswell/stringzilla_search_bench
./build_release/sapphirerapids/stringzilla_search_bench
./build_release/sandybridge/stringzilla_bench_search
./build_release/haswell/stringzilla_bench_search
./build_release/sapphirerapids/stringzilla_bench_search
```

Alternatively, you may want to compare the performance of the code compiled with different compilers.
Expand All @@ -95,8 +114,8 @@ pip install -e . # To build locally from source
For testing we use PyTest, which may not be installed on your system.

```bash
pip install pytest # To install PyTest
pytest scripts/ -s -x # To run the test suite
pip install pytest # To install PyTest
pytest scripts/unit_test.py -s -x # To run the test suite
```

For fuzzing we love the ability to call the native C implementation from Python bypassing the binding layer.
Expand All @@ -110,8 +129,8 @@ python scripts/similarity_fuzz.py # To run the fuzzing script
For benchmarking, the following scripts are provided.

```sh
python scripts/search_bench.py --haystack_path "your file" --needle "your pattern" # real data
python scripts/search_bench.py --haystack_pattern "abcd" --haystack_length 1e9 --needle "abce" # synthetic data
python scripts/bench_search.py --haystack_path "your file" --needle "your pattern" # real data
python scripts/bench_search.py --haystack_pattern "abcd" --haystack_length 1e9 --needle "abce" # synthetic data
python scripts/similarity_bench.py --text_path "your file" # edit distance computations
```

Expand All @@ -132,6 +151,7 @@ Future development plans include:
- [x] [Reverse-order operations](https://github.com/ashvardanian/StringZilla/issues/12).
- [ ] [Faster string sorting algorithm](https://github.com/ashvardanian/StringZilla/issues/45).
- [ ] [Splitting with multiple separators at once](https://github.com/ashvardanian/StringZilla/issues/29).
- [ ] Add `.pyi` interface for Python.
- [ ] Arm NEON backend.
- [ ] Bindings for Rust.
- [ ] Arm SVE backend.
Expand Down
89 changes: 88 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Aside from exact search, the library also accelerates fuzzy search, edit distanc
- Code in C? Replace LibC's `<string.h>` with C 99 `<stringzilla.h>` - [_more_](#quick-start-c-🛠️)
- Code in C++? Replace STL's `<string>` with C++ 11 `<stringzilla.hpp>` - [_more_](#quick-start-cpp-🛠️)
- Code in Python? Upgrade your `str` to faster `Str` - [_more_](#quick-start-python-🐍)
- Code in other languages? Let us know!

__Features:__

Expand Down Expand Up @@ -131,7 +132,7 @@ import stringzilla as sz
contains: bool = sz.contains("haystack", "needle", start=0, end=9223372036854775807)
offset: int = sz.find("haystack", "needle", start=0, end=9223372036854775807)
count: int = sz.count("haystack", "needle", start=0, end=9223372036854775807, allowoverlap=False)
levenshtein: int = sz.levenshtein("needle", "nidl")
edit_distance: int = sz.edit_distance("needle", "nidl")
```

## Quick Start: C/C++ 🛠️
Expand Down Expand Up @@ -202,6 +203,19 @@ haystack.contains(needle) == true; // STL has this only from C++ 23 onwards
haystack.compare(needle) == 1; // Or `haystack <=> needle` in C++ 20 and beyond
```

StringZilla also provides string literals for automatic type resolution, [similar to STL][stl-literal]:

```cpp
using sz::literals::operator""_sz;
using std::literals::operator""sv;

auto a = "some string"; // char const *
auto b = "some string"sv; // std::string_view
auto c = "some string"_sz; // sz::string_view
```

[stl-literal]: https://en.cppreference.com/w/cpp/string/basic_string_view/operator%22%22sv

### Memory Ownership and Small String Optimization

Most operations in StringZilla don't assume any memory ownership.
Expand Down Expand Up @@ -334,6 +348,73 @@ Debugging pointer offsets is not a pleasant exercise, so keep the following func
- `haystack.[r]split_all(character_set(""))`

For $N$ matches the split functions will report $N+1$ matches, potentially including empty strings.
Ranges have a few convenience methods as well:

```cpp
range.size(); // -> std::size_t
range.empty(); // -> bool
range.template to<std::set<std::string>>();
range.template to<std::vector<std::string_view>>();
```

### TODO: STL Containers with String Keys

The C++ Standard Template Library provides several associative containers, often used with string keys.

```cpp
std::map<std::string, int, std::less<std::string>> sorted_words;
std::unordered_map<std::string, int, std::hash<std::string>, std::equal_to<std::string>> words;
```

The performance of those containers is often limited by the performance of the string keys, especially on reads.
StringZilla can be used to accelerate containers with `std::string` keys, by overriding the default comparator and hash functions.

```cpp
std::map<std::string, int, sz::string_view_less> sorted_words;
std::unordered_map<std::string, int, sz::string_view_hash, sz::string_view_equal_to> words;
```

Alternatively, a better approach would be to use the `sz::string` class as a key.
The right hash function and comparator would be automatically selected and the performance gains would be more noticeable if the keys are short.

```cpp
std::map<sz::string, int> sorted_words;
std::unordered_map<sz::string, int> words;
```

### TODO: Concatenating Strings

Another common string operation is concatenation.
The STL provides `std::string::operator+` and `std::string::append`, but those are not the most efficient, if multiple invocations are performed.

```cpp
std::string name, domain, tld;
auto email = name + "@" + domain + "." + tld; // 4 allocations
```

The efficient approach would be to pre-allocate the memory and copy the strings into it.

```cpp
std::string email;
email.reserve(name.size() + domain.size() + tld.size() + 2);
email.append(name), email.append("@"), email.append(domain), email.append("."), email.append(tld);
```

That's a mouthful and error-prone.
StringZilla provides a more convenient `concat` function, which takes a variadic number of arguments.

```cpp
auto email = sz::concat(name, "@", domain, ".", tld);
```

Moreover, if the first or second argument of the expression is a StringZilla string, the concatenation can be performed lazily using the same `operator+` syntax.
That behavior is disabled for compatibility by default, but can be enabled by defining `SZ_LAZY_CONCAT` macro.

```cpp
sz::string name, domain, tld;
auto email_expression = name + "@" + domain + "." + tld; // 0 allocations
sz::string email = name + "@" + domain + "." + tld; // 1 allocation
```

### Debugging

Expand All @@ -342,6 +423,12 @@ That behavior is controllable for both C and C++ interfaces via the `STRINGZILLA

[faq-sso]: https://cpp-optimizations.netlify.app/small_strings/

## Algorithms 📚

### Hashing

### Substring Search

## Contributing 👾

Please check out the [contributing guide](CONTRIBUTING.md) for more details on how to setup the development environment and contribute to this project.
Expand Down
8 changes: 4 additions & 4 deletions include/stringzilla/stringzilla.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@
* @brief Annotation for the public API symbols.
*/
#if defined(_WIN32) || defined(__CYGWIN__)
#define SZ_PUBLIC __declspec(dllexport) inline static
#define SZ_PUBLIC inline static
#elif __GNUC__ >= 4
#define SZ_PUBLIC __attribute__((visibility("default"))) inline static
#define SZ_PUBLIC inline static
#else
#define SZ_PUBLIC inline static
#endif
Expand Down Expand Up @@ -717,11 +717,11 @@ SZ_PUBLIC sz_cptr_t sz_find_last_bounded_regex(sz_cptr_t haystack, sz_size_t h_l
* @return Unsigned edit distance.
*/
SZ_PUBLIC sz_size_t sz_edit_distance(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
sz_size_t bound, sz_memory_allocator_t const *alloc);
sz_size_t bound, sz_memory_allocator_t const *alloc);

/** @copydoc sz_edit_distance */
SZ_PUBLIC sz_size_t sz_edit_distance_serial(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
sz_size_t bound, sz_memory_allocator_t const *alloc);
sz_size_t bound, sz_memory_allocator_t const *alloc);

/** @copydoc sz_edit_distance */
SZ_PUBLIC sz_size_t sz_edit_distance_avx512(sz_cptr_t a, sz_size_t a_length, sz_cptr_t b, sz_size_t b_length, //
Expand Down
3 changes: 3 additions & 0 deletions include/stringzilla/stringzilla.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include <string_view>
#endif

#include <cassert> // `assert`
#include <cstddef> // `std::size_t`

#include <stringzilla/stringzilla.h>

namespace ashvardanian {
Expand Down
8 changes: 4 additions & 4 deletions python/lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -1051,7 +1051,7 @@ static PyObject *Str_count(PyObject *self, PyObject *args, PyObject *kwargs) {
return PyLong_FromSize_t(count);
}

static PyObject *Str_levenshtein(PyObject *self, PyObject *args, PyObject *kwargs) {
static PyObject *Str_edit_distance(PyObject *self, PyObject *args, PyObject *kwargs) {
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
Py_ssize_t nargs = PyTuple_Size(args);
if (nargs < !is_member + 1 || nargs > !is_member + 2) {
Expand Down Expand Up @@ -1093,7 +1093,7 @@ static PyObject *Str_levenshtein(PyObject *self, PyObject *args, PyObject *kwarg
sz_memory_allocator_t reusing_allocator;
reusing_allocator.allocate = &temporary_memory_allocate;
reusing_allocator.free = &temporary_memory_free;
reusing_allocator.user_data = &temporary_memory;
reusing_allocator.handle = &temporary_memory;

sz_size_t distance =
sz_edit_distance(str1.start, str1.length, str2.start, str2.length, (sz_size_t)bound, &reusing_allocator);
Expand Down Expand Up @@ -1469,7 +1469,7 @@ static PyMethodDef Str_methods[] = {
{"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."},
{"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
{"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
{"levenshtein", Str_levenshtein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
{"edit_distance", Str_edit_distance, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
{NULL, NULL, 0, NULL}};

static PyTypeObject StrType = {
Expand Down Expand Up @@ -1763,7 +1763,7 @@ static PyMethodDef stringzilla_methods[] = {
{"splitlines", Str_splitlines, sz_method_flags_m, "Split a string by line breaks."},
{"startswith", Str_startswith, sz_method_flags_m, "Check if a string starts with a given prefix."},
{"endswith", Str_endswith, sz_method_flags_m, "Check if a string ends with a given suffix."},
{"levenshtein", Str_levenshtein, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
{"edit_distance", Str_edit_distance, sz_method_flags_m, "Calculate the Levenshtein distance between two strings."},
{NULL, NULL, 0, NULL}};

static PyModuleDef stringzilla_module = {
Expand Down
Loading

0 comments on commit c9aad69

Please sign in to comment.