Skip to content

Commit

Permalink
Add: Baseline C++ class
Browse files Browse the repository at this point in the history
  • Loading branch information
ashvardanian committed Dec 21, 2023
1 parent 44ef989 commit 4bec620
Show file tree
Hide file tree
Showing 11 changed files with 970 additions and 325 deletions.
30 changes: 26 additions & 4 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
"version": "0.2.0",
"configurations": [
{
"name": "Test",
"name": "Debug Unit Tests",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceFolder}/build_debug/stringzilla_bench",
"program": "${workspaceFolder}/build_debug/stringzilla_test_substring",
"cwd": "${workspaceFolder}",
"environment": [
{
Expand All @@ -18,11 +18,33 @@
],
"stopAtEntry": false,
"linux": {
"preLaunchTask": "Linux Build C++ Test Debug",
"preLaunchTask": "Build for Linux: Debug",
"MIMode": "gdb"
},
"osx": {
"preLaunchTask": "MacOS Build C++ Test Debug",
"preLaunchTask": "Build for MacOS: Debug",
"MIMode": "lldb"
}
},
{
"name": "Debug Benchmarks",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceFolder}/build_debug/stringzilla_bench_substring",
"cwd": "${workspaceFolder}",
"environment": [
{
"name": "ASAN_OPTIONS",
"value": "detect_leaks=0:atexit=1:strict_init_order=1:strict_string_checks=1"
}
],
"stopAtEntry": false,
"linux": {
"preLaunchTask": "Build for Linux: Debug",
"MIMode": "gdb"
},
"osx": {
"preLaunchTask": "Build for MacOS: Debug",
"MIMode": "lldb"
}
}
Expand Down
16 changes: 8 additions & 8 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,32 @@
"version": "2.0.0",
"tasks": [
{
"label": "Linux Build C++ Test Debug",
"command": "cmake -DSTRINGZILLA_BUILD_TEST=1 -DCMAKE_CXX_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_BUILD_TYPE=Debug -B ./build_debug && make -C ./build_debug",
"label": "Build for Linux: Debug",
"command": "cmake -DSTRINGZILLA_BUILD_TEST=1 -DSTRINGZILLA_BUILD_BENCHMARK=1 -DCMAKE_C_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_BUILD_TYPE=Debug -B ./build_debug && make -C ./build_debug",
"args": [],
"type": "shell",
"problemMatcher": [
"$gcc"
]
},
{
"label": "Linux Build C++ Test Release",
"command": "cmake -DSTRINGZILLA_BUILD_TEST=1 -DCMAKE_CXX_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_BUILD_TYPE=RelWithDebInfo -B ./build_release && make -C ./build_release",
"label": "Build for Linux: Release",
"command": "cmake -DSTRINGZILLA_BUILD_TEST=1 -DSTRINGZILLA_BUILD_BENCHMARK=1 -DCMAKE_C_COMPILER=gcc-12 -DCMAKE_CXX_COMPILER=g++-12 -DCMAKE_BUILD_TYPE=RelWithDebInfo -B ./build_release && make -C ./build_release",
"args": [],
"type": "shell",
"problemMatcher": [
"$gcc"
]
},
{
"label": "MacOS Build C++ Test Debug",
"command": "cmake -DSTRINGZILLA_BUILD_TEST=1 -DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++ -DCMAKE_BUILD_TYPE=Debug -B ./build_debug && make -C ./build_debug",
"label": "Build for MacOS: Debug",
"command": "cmake -DSTRINGZILLA_BUILD_TEST=1 -DSTRINGZILLA_BUILD_BENCHMARK=1 -DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++ -DCMAKE_BUILD_TYPE=Debug -B ./build_debug && make -C ./build_debug",
"args": [],
"type": "shell",
},
{
"label": "MacOS Build C++ Test Release",
"command": "cmake -DSTRINGZILLA_BUILD_TEST=1 -DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++ -DCMAKE_BUILD_TYPE=RelWithDebInfo -B ./build_release && make -C ./build_release",
"label": "Build for MacOS: Release",
"command": "cmake -DSTRINGZILLA_BUILD_TEST=1 -DSTRINGZILLA_BUILD_BENCHMARK=1 -DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++ -DCMAKE_BUILD_TYPE=RelWithDebInfo -B ./build_release && make -C ./build_release",
"args": [],
"type": "shell"
}
Expand Down
47 changes: 26 additions & 21 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,35 +71,40 @@ if(STRINGZILLA_INSTALL)
DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR})
endif()

if(${STRINGZILLA_BUILD_TEST} OR ${STRINGZILLA_BUILD_BENCHMARK})
add_executable(stringzilla_bench scripts/bench_substring.cpp)
target_link_libraries(stringzilla_bench PRIVATE ${STRINGZILLA_TARGET_NAME})
set_target_properties(stringzilla_bench PROPERTIES RUNTIME_OUTPUT_DIRECTORY
if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER
3.13)
include(CTest)
enable_testing()
endif()

# Function to set compiler-specific flags
function(set_compiler_flags target)
target_link_libraries(${target} PRIVATE ${STRINGZILLA_TARGET_NAME})
set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR})
target_link_options(stringzilla_bench PRIVATE
"-Wl,--unresolved-symbols=ignore-all")

# Check for compiler and set flags for stringzilla_bench
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
# Set -march=native and -fmax-errors=1 for all build types
target_compile_options(stringzilla_bench PRIVATE "-march=native")
target_compile_options(stringzilla_bench PRIVATE "-fmax-errors=1")

# Set -O3 for Release build, and -g for Debug and RelWithDebInfo
target_compile_options(stringzilla_bench PRIVATE
target_compile_options(${target} PRIVATE "-march=native")
target_compile_options(${target} PRIVATE "-fmax-errors=1")
target_compile_options(${target} PRIVATE
"$<$<CONFIG:Release>:-O3>"
"$<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:-g>")
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
# Intel specific flags
target_compile_options(stringzilla_bench PRIVATE "-xHost")
target_compile_options(${target} PRIVATE "-xHost")
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
# MSVC specific flags or other settings
endif()
endfunction()

if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER
3.13)
include(CTest)
enable_testing()
add_test(NAME stringzilla_bench COMMAND stringzilla_bench)
endif()
if(${STRINGZILLA_BUILD_BENCHMARK})
add_executable(stringzilla_bench_substring scripts/bench_substring.cpp)
set_compiler_flags(stringzilla_bench_substring)
add_test(NAME stringzilla_bench_substring COMMAND stringzilla_bench_substring)
endif()

if(${STRINGZILLA_BUILD_TEST})
# Test target
add_executable(stringzilla_test_substring scripts/test_substring.cpp)
set_compiler_flags(stringzilla_test_substring)
add_test(NAME stringzilla_test_substring COMMAND stringzilla_test_substring)
endif()
49 changes: 45 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@ StringZilla is the Godzilla of string libraries, searching, splitting, sorting,
- [Radix](https://en.wikipedia.org/wiki/Radix_sort)-like sorting faster than C++ `std::sort`
- [Memory-mapping](https://en.wikipedia.org/wiki/Memory-mapped_file) to work with larger-than-RAM datasets

Putting this into a table:

| Feature \ Library | STL | LibC | StringZilla |
| :------------------- | ---: | ---: | ---------------: |
| Substring Search | | | |
| Reverse Order Search | || |
| Fuzzy Search ||| |
| Edit Distance ||| |
| Interface | C++ | C | C , C++ , Python |


Who is this for?

- you want to process strings faster than default strings in Python, C, or C++
Expand All @@ -22,7 +33,6 @@ Limitations:
- Assumes ASCII or UTF-8 encoding
- Assumes 64-bit address space


This library saved me tens of thousands of dollars pre-processing large datasets for machine learning, even on the scale of a single experiment.
So if you want to process the 6 Billion images from [LAION](https://laion.ai/blog/laion-5b/), or the 250 Billion web pages from the [CommonCrawl](https://commoncrawl.org/), or even just a few million lines of server logs, and are haunted by Python's `open(...).readlines()` and `str().splitlines()` taking forever, this should help 😊

Expand Down Expand Up @@ -216,15 +226,15 @@ Running benchmarks:
```sh
cmake -DCMAKE_BUILD_TYPE=Release -DSTRINGZILLA_BUILD_BENCHMARK=1 -B ./build_release
cmake --build build_release --config Release
./build_release/stringzilla_bench
./build_release/stringzilla_bench_substring
```

Running tests:

```sh
cmake -DCMAKE_BUILD_TYPE=Debug -DSTRINGZILLA_BUILD_TEST=1 -B ./build_debug
cmake --build build_debug --config Debug
./build_debug/stringzilla_bench
./build_debug/stringzilla_test_substring
```

On MacOS it's recommended to use non-default toolchain:
Expand All @@ -240,7 +250,7 @@ cmake -B ./build_release \
-DSTRINGZILLA_USE_OPENMP=1 \
-DSTRINGZILLA_BUILD_BENCHMARK=1 \
&& \
make -C ./build_release -j && ./build_release/stringzilla_bench
make -C ./build_release -j && ./build_release/stringzilla_bench_substring
```

## License 📜
Expand All @@ -257,3 +267,34 @@ If you like this project, you may also enjoy [USearch][usearch], [UCall][ucall],
[ustore]: https://github.com/unum-cloud/ustore
[simsimd]: https://github.com/ashvardanian/simsimd
[tenpack]: https://github.com/ashvardanian/tenpack


# The weirdest interfaces of C++23 strings:

## Third `std::basic_string_view<CharT,Traits>::find`

constexpr size_type find( basic_string_view v, size_type pos = 0 ) const noexcept;
(1) (since C++17)
constexpr size_type find( CharT ch, size_type pos = 0 ) const noexcept;
(2) (since C++17)
constexpr size_type find( const CharT* s, size_type pos, size_type count ) const;
(3) (since C++17)
constexpr size_type find( const CharT* s, size_type pos = 0 ) const;
(4) (since C++17)


## HTML Parsing

```txt
<tag> Isolated tag start
<tag\w Tag start with attributes
<tag/> Self-closing tag
</tag> Tag end
```

In any case, the tag name is always followed by whitespace, `/` or `>`.
And is always preceded by whitespace, `/` or `<`.

Important distinctions between XML and HTML:

- XML does not truncate multiple white-spaces, while HTML does.
Loading

0 comments on commit 4bec620

Please sign in to comment.