Merge branch 'dev'

gershnik · Jan 10, 2025 · e1edb6f · e1edb6f
2 parents 1dcab97 + cf87a43
commit e1edb6f
Show file tree

Hide file tree

Showing 44 changed files with 10,887 additions and 882 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -14,7 +14,7 @@ on:
       - 'tools/**'
 
 env:
-  BUILD_TYPE: Release
+  BUILD_TYPE: MinSizeRel
   NDK_VER: 27.2.12479018
   NDK_ARCH: x86_64
   NDK_API: 29
@@ -35,7 +35,7 @@ jobs:
         - {os: ubuntu-latest, compiler: gcc, version: 13 }
         - {os: ubuntu-24.04, compiler: gcc, version: 14 }
 
-        - {os: ubuntu-latest, compiler: clang, version: 16 }
+        - {os: ubuntu-22.04, compiler: clang, version: 16 }
         - {os: ubuntu-latest, compiler: clang, version: 17 }
         - {os: ubuntu-latest, compiler: clang, version: 18 }
 
@@ -51,9 +51,10 @@ jobs:
             wget https://apt.llvm.org/llvm.sh
             chmod u+x llvm.sh
             sudo ./llvm.sh ${{ matrix.version }} 
-            sudo apt-get install -y clang-tools-${{ matrix.version }}
+            sudo apt-get install -y clang-tools-${{ matrix.version }} libc++-${{ matrix.version }}-dev libc++abi-${{ matrix.version }}-dev
             echo "CC=clang-${{ matrix.version }}" >> $GITHUB_ENV
             echo "CXX=clang++-${{ matrix.version }}" >> $GITHUB_ENV
+            echo "CXXFLAGS=-stdlib=libc++" >> $GITHUB_ENV
           fi
 
           if [[ '${{ matrix.compiler }}' == 'gcc' ]]; then

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## Unreleased
 
+### Added
+
+- `grapheme_view` and `graphemes` adapter which provide ability to iterate over grapheme clusters in `sys_string` and any UTF range.
+
+### Changed
+
+- Unicode data has been optimized for better size/speed balance
+- `sys_string_t::hash_type` has been changed from `unsigned` to `size_t` on some platforms. 
+
+### Fixed
+
+- Invalid character access in unicode mappings.
+- Crash when sys_string_builder is re-used after `build()` on Apple and Python platforms.
+- `utf_ref_view` and `utf_owning_view` now actually work with forward and input underlying ranges
+- MSVC warnings when using `std::hash<sys_string>`
+
 ## [3.0] - 2024-12-02
 
 This is a major release with some breaking changes
@@ -57,6 +73,12 @@ This is a major release with some breaking changes
   behavior applies to `wchar_t` on platform where it is UTF-16 or UTF-32.
 - `operator<<` no longer pollutes global namespace
 
+## [2.15] - 2025-01-07
+
+### Fixed
+
+- Invalid character access in unicode mappings.
+
 ## [2.14] - 2024-05-02
 
 ### Fixed
@@ -207,4 +229,5 @@ This is a major release with some breaking changes
 [2.12]: https://github.com/gershnik/sys_string/releases/v2.12
 [2.13]: https://github.com/gershnik/sys_string/releases/v2.13
 [2.14]: https://github.com/gershnik/sys_string/releases/v2.14
+[2.15]: https://github.com/gershnik/sys_string/releases/v2.15
 [3.0]: https://github.com/gershnik/sys_string/releases/v3.0
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -16,9 +16,47 @@ string(STRIP ${SYSSTR_VERSION} SYSSTR_VERSION)
 
 project(sys_string VERSION ${SYSSTR_VERSION} LANGUAGES CXX)
 
+find_package (Python3 COMPONENTS Interpreter Development)
 
 add_subdirectory(lib)
 
+# Regenerate the Unicode tables only when a Python 3 interpreter was found.
+# The variable is tested by name so if() evaluates it directly instead of re-interpreting its expanded value.
+if(Python3_Interpreter_FOUND)
+
+    file(GLOB UNICODE_DATA ${CMAKE_CURRENT_LIST_DIR}/unicode/data/*.txt)
+    file(GLOB UNICODE_SCRIPTS ${CMAKE_CURRENT_LIST_DIR}/unicode/scripts/*.py)
+
+    # Outputs of the generator; note that these paths live in the source tree.
+    set(UNICODE_GENERATED_FILES
+        ${CMAKE_CURRENT_LIST_DIR}/lib/src/unicode_mappings.cpp
+        ${CMAKE_CURRENT_LIST_DIR}/lib/inc/sys_string/impl/unicode/mappings.h
+        ${CMAKE_CURRENT_LIST_DIR}/test/test_grapheme_data.h
+    )
+
+    # Re-run genmappings.py whenever a globbed data file or generator script changes.
+    add_custom_command(
+        COMMENT "Generating Unicode mappings"
+        OUTPUT ${UNICODE_GENERATED_FILES}
+        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/unicode/scripts/genmappings.py
+                        ${CMAKE_CURRENT_LIST_DIR}/unicode/data
+                        ${UNICODE_GENERATED_FILES}
+        DEPENDS
+            ${UNICODE_DATA}
+            ${UNICODE_SCRIPTS}
+        VERBATIM
+    )
+
+    # Named target that drives the custom command above.
+    add_custom_target(generate_unicode_mappings
+        DEPENDS
+            ${UNICODE_GENERATED_FILES}
+    )
+
+    # Order the sys_string build after generation (ordering only, no linking).
+    add_dependencies(sys_string generate_unicode_mappings)
+
+endif()
+
 if (PROJECT_IS_TOP_LEVEL)
 
     include(lib/cmake/install.cmake)

diff --git a/README.md b/README.md
@@ -5,9 +5,8 @@
 [![License](https://img.shields.io/badge/license-BSD-brightgreen.svg)](https://opensource.org/licenses/BSD-3-Clause)
 [![Tests](https://github.com/gershnik/sys_string/actions/workflows/test.yml/badge.svg)](https://github.com/gershnik/sys_string/actions/workflows/test.yml)
 
-This library provides a C++ string class template `sys_string_t` that is optimized for **interoperability with external native string type**. It is **immutable**, **Unicode-first** and exposes convenient **operations similar to Python or ECMAScript strings**. It uses a separate `sys_string_builder_t` class template to construct strings. It provides fast concatenation via `+` operator that **does not allocate temporary strings**. 
-The library exposes bidirectional UTF-8/UTF-16/UTF-32 views of `sys_string_t` as well as of any C++ input ranges of chracters. 
-of characters.
+This library provides a C++ string class template `sys_string_t` that is optimized for **interoperability with external native string types**. It is **immutable**, **Unicode-first** and exposes convenient **operations similar to Python or ECMAScript strings**. It uses a separate `sys_string_builder_t` class template to construct strings. It provides fast concatenation via `+` operator that **does not allocate temporary strings**. 
+The library exposes bidirectional UTF-8/UTF-16/UTF-32 and grapheme cluster views of `sys_string_t` as well as of other C++ ranges of characters.
 
 ## What does it mean?
 
@@ -38,11 +37,16 @@ of characters.
 
 * **Concatenation does not allocate temporaries.** You can safely do things like `result = s1 + s2 + s3`. It will result in **one** memory allocation and 3 calls to `memcpy` to copy each of `s1`, `s2` and `s3` content into the final result. Not 2 allocations and 5 copies like in other languages or with `std::string`.
 
-* **Bidirectional UTF-8/UTF-16/UTF-32 views**. You can view `sys_string_t` as a sequence of UTF-8/16/32 characters and iterate forward or __backward__ equally efficiently. Consider trying to find last instance of Unicode whitespace in UTF-8 data. Doing it as fast as finding the first instance is non-trivial. The views also work on any random access containers (C array, `std::array`, `std::vector`, `std::string`) of characters. Thus you can iterate in UTF-8 over `std::vector<char16_t>` etc.
+* **Bidirectional UTF-8/UTF-16/UTF-32 views**. You can view `sys_string_t` as a sequence of UTF-8/16/32 characters and iterate forward or __backward__ equally efficiently. Consider trying to find last instance of Unicode whitespace in UTF-8 data. Doing it as fast as finding the first instance is non-trivial. The views also work on any C++ input ranges (C array, `std::array`, `std::vector`, `std::string` or even `std::ranges::istream_view`) of characters (`char`, `char8_t`, `char16_t`, `char32_t` and `wchar_t` on platforms where it is Unicode). Thus you can iterate in UTF-8 over `std::vector<char16_t>` etc.
+
+* **Bidirectional grapheme cluster views**. Similarly you can also further view any of the UTF-8/UTF-16/UTF-32 views of `sys_string_t` as a sequence of
+[grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) and iterate over them forward or __backward__ equally efficiently.
+Consider the task of erasing the last user perceived "character" from a string. To do so correctly you need to erase the last _grapheme cluster_. Doing it and doing it fast is very non-trivial. This functionality also works on any C++ range of characters (but requires a _forward_ range).
 
 ## Why bother? Doesn't `std::string` work well?
 
 An `std::string` storing UTF-8 (or `std::wstring` storing UTF-16 on Windows) works very well for some scenarios but fails miserably for others. `sys_string` class is an attempt to create something that works well in situations `std::string` would be a bad choice.
+
 Specifically, `std::basic_string` is an STL container of a character type that owns its memory and controls it via a user-supplied allocator. These design choices make it very fast for direct character access but create the following problems:
 
 * They rule out (efficient) reference-counted implementations. Which means that when you copy an `std::string` instance it must copy its content. Some of the penalty of that is alleviated by modern [small string optimization](https://akrzemi1.wordpress.com/2014/04/14/common-optimizations/) but this is, at best, a band-aid. There are workarounds, of course, such as using `std::shared_ptr<std::string>` "when it matters" but they result in even more complexity for something that is quite fundamental to any data processing.

diff --git a/doc/Usage.md b/doc/Usage.md
@@ -14,7 +14,8 @@
 - [Iterating over string content](#iterating-over-string-content)
     - [Storage iteration](#storage-iteration)
     - [UTF iteration](#utf-iteration)
-    - [Helper: UTF iteration over any C++ character range](#helper-utf-iteration-over-any-c-character-range)
+    - [UTF iteration over any C++ character range](#utf-iteration-over-any-c-character-range)
+    - [Grapheme iteration](#grapheme-iteration)
 - [Substrings](#substrings)
 - [Accessing C strings](#accessing-c-strings)
 - [Accessing storage as C array](#accessing-storage-as-c-array)
@@ -307,7 +308,7 @@ utf32_access::iterator first = access.reverse(access.rend());
 
 ```
 
-### Helper: UTF iteration over any C++ character range
+### UTF iteration over any C++ character range
 
 Since the internal facility to perform UTF iteration is quite generic this library exposes it to allow you to perform UTF iteration over any C++ input range of compatible characters (`char`, `char8_t`, `char16_t`, `char32_t`, and possibly `wchar_t` on platforms where it is encoded in UTF-16 or UTF-32). At the time of this writing there is ongoing work on adding something similar to the C++ standard library but, even if it is eventually approved, it will be a long time before it becomes available.
 
@@ -335,6 +336,50 @@ If your standard library supports user-supplied [range adapter closures](https:/
 as_utf8(u"😀😜") | std::views::take(1) | ...
 ```
 
+### Grapheme iteration
+
+Sometimes even UTF-32 iteration is not what you need. Many user-perceived "characters" are actually composed of multiple
+UTF-32 codepoints. The Unicode standard defines a [grapheme cluster](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) as
+what corresponds to a user's notion of a character. A single grapheme cluster, or grapheme for short, can contain one or more Unicode
+codepoints.
+
+This library allows you to easily iterate over grapheme clusters in `sys_string_t` content as well as in any C++ 
+[forward_range](https://en.cppreference.com/w/cpp/ranges/forward_range) of compatible character type.
+
+To iterate over graphemes you need to construct an instance of `grapheme_view` directly or use `graphemes` view adapter. In either case
+you need to supply a **view** of characters to iterate over. The view can be a reference to `sys_string_t::char_access`, `sys_string_t::utfX_access`
+or any other compatible forward view.
+
+The values produced by iterating a `grapheme_view` are `std::ranges::subrange`s of the underlying view, each one spanning a single grapheme.
+
+To put it all in context here is how you can iterate over all graphemes in a `sys_string`.
+
+```cpp
+sys_string str = S("क्त्य");  //5 Unicode codepoints but one grapheme!
+sys_string::char_access access(str);
+for (auto grapheme_range: graphemes(access)) {
+    //grapheme_range is a subrange of sys_string::char_access::iterator
+    sys_string grapheme(grapheme_range);
+}
+```
+
+A `grapheme_view` is reversible, that is it can be iterated in both directions. Here is how to accomplish a common task -
+safely remove the last "character" from a string (see [Substrings](#substrings) below for details on how to obtain parts of a string):
+
+```cpp
+sys_string str = S("abक्त्य");
+sys_string::char_access access(str);
+auto gr_view = graphemes(access);
+if (auto rit = gr_view.rbegin(); rit != gr_view.rend()) {
+    auto grapheme = *rit;
+    str = sys_string(access.begin(), grapheme.begin());
+}
+assert(str == S("ab"));
+```
+
+You can easily extend this to removing any number of trailing characters.
+
+
 ## Substrings
 
 You can obtain a substring of a `sys_string` in two ways:

diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
@@ -17,8 +17,6 @@ if (NOT SYSSTR_VERSION)
 endif()
 project(sys_string VERSION ${SYSSTR_VERSION})
 
-find_package (Python3 COMPONENTS Interpreter)
-
 set(SRCDIR ${CMAKE_CURRENT_LIST_DIR})
 set(LIBNAME sys_string)
 
@@ -64,6 +62,7 @@ PUBLIC
 set(MAIN_CODE 
     ${SRCDIR}/inc/sys_string/config.h
     ${SRCDIR}/inc/sys_string/utf_view.h
+    ${SRCDIR}/inc/sys_string/grapheme_view.h
     ${SRCDIR}/inc/sys_string/sys_string.h
     ${SRCDIR}/inc/sys_string/impl/compare.h
     ${SRCDIR}/inc/sys_string/impl/hash.h
@@ -129,23 +128,6 @@ PRIVATE
 
 add_library(${LIBNAME}::${LIBNAME} ALIAS ${LIBNAME})
 
-
-if(${Python3_Interpreter_FOUND})
-
-    file(GLOB UNICODE_DATA ${SRCDIR}/res/*.txt)
-    file(GLOB UNICODE_SCRIPTS ${SRCDIR}/scripts/*.py)
-
-    add_custom_command(
-        COMMENT "Generating Unicoode mappings"
-        OUTPUT ${SRCDIR}/src/unicode_mappings.cpp
-        COMMAND ${Python3_EXECUTABLE} ${SRCDIR}/scripts/genmappings.py ${SRCDIR}/res ${SRCDIR}/src/unicode_mappings.cpp 
-        DEPENDS 
-            ${UNICODE_DATA} 
-            ${UNICODE_SCRIPTS}
-    )
-
-endif()
-
 if (PROJECT_IS_TOP_LEVEL)
 
     include(cmake/install.cmake)

diff --git a/lib/inc/sys_string/config.h b/lib/inc/sys_string/config.h
@@ -84,6 +84,12 @@
     #error Please define how to force inline for your compiler
 #endif
 
+#if defined(_MSC_VER)
+    #define SYS_STRING_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
+#else
+    #define SYS_STRING_NO_UNIQUE_ADDRESS [[no_unique_address]]
+#endif
+
 //GCC up to 11.3 has a weird constexpr bug in some places
 #if __GNUC__ > 11 || (__GNUC__ == 11 && __GNUC_MINOR__ > 2)
     #define BUGGY_CONSTEXPR constexpr
@@ -105,7 +111,7 @@
 
 //See https://github.com/llvm/llvm-project/issues/77773 for the sad story of how feature test
 //macros are useless with libc++
-#if __cpp_lib_format >= 201907L || (defined(_LIBCPP_VERSION) && __has_include(<format>))
+#if __cpp_lib_format >= 201907L || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 170000 && __has_include(<format>))
 
     #define SYS_STRING_SUPPORTS_STD_FORMAT 1