Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
gershnik committed Jan 10, 2025
2 parents 1dcab97 + cf87a43 commit e1edb6f
Show file tree
Hide file tree
Showing 44 changed files with 10,887 additions and 882 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:
- 'tools/**'

env:
BUILD_TYPE: Release
BUILD_TYPE: MinSizeRel
NDK_VER: 27.2.12479018
NDK_ARCH: x86_64
NDK_API: 29
Expand All @@ -35,7 +35,7 @@ jobs:
- {os: ubuntu-latest, compiler: gcc, version: 13 }
- {os: ubuntu-24.04, compiler: gcc, version: 14 }

- {os: ubuntu-latest, compiler: clang, version: 16 }
- {os: ubuntu-22.04, compiler: clang, version: 16 }
- {os: ubuntu-latest, compiler: clang, version: 17 }
- {os: ubuntu-latest, compiler: clang, version: 18 }

Expand All @@ -51,9 +51,10 @@ jobs:
wget https://apt.llvm.org/llvm.sh
chmod u+x llvm.sh
sudo ./llvm.sh ${{ matrix.version }}
sudo apt-get install -y clang-tools-${{ matrix.version }}
sudo apt-get install -y clang-tools-${{ matrix.version }} libc++-${{ matrix.version }}-dev libc++abi-${{ matrix.version }}-dev
echo "CC=clang-${{ matrix.version }}" >> $GITHUB_ENV
echo "CXX=clang++-${{ matrix.version }}" >> $GITHUB_ENV
echo "CXXFLAGS=-stdlib=libc++" >> $GITHUB_ENV
fi
if [[ '${{ matrix.compiler }}' == 'gcc' ]]; then
Expand Down
23 changes: 23 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,22 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## Unreleased

### Added

- `grapheme_view` and `graphemes` adapter which provide the ability to iterate over grapheme clusters in `sys_string` and any UTF range.

### Changed

- Unicode data has been optimized for better size/speed balance
- `sys_string_t::hash_type` has been changed from `unsigned` to `size_t` on some platforms.

### Fixed

- Invalid character access in unicode mappings.
- Crash when sys_string_builder is re-used after `build()` on Apple and Python platforms.
- `utf_ref_view` and `utf_owning_view` now actually work with forward and input underlying ranges
- MSVC warnings when using `std::hash<sys_string>`

## [3.0] - 2024-12-02

This is a major release with some breaking changes
Expand Down Expand Up @@ -57,6 +73,12 @@ This is a major release with some breaking changes
behavior applies to `wchar_t` on platform where it is UTF-16 or UTF-32.
- `operator<<` no longer pollutes global namespace

## [2.15] - 2025-01-07

### Fixed

- Invalid character access in unicode mappings.

## [2.14] - 2024-05-02

### Fixed
Expand Down Expand Up @@ -207,4 +229,5 @@ This is a major release with some breaking changes
[2.12]: https://github.com/gershnik/sys_string/releases/v2.12
[2.13]: https://github.com/gershnik/sys_string/releases/v2.13
[2.14]: https://github.com/gershnik/sys_string/releases/v2.14
[2.15]: https://github.com/gershnik/sys_string/releases/v2.15
[3.0]: https://github.com/gershnik/sys_string/releases/v3.0
32 changes: 32 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,41 @@ string(STRIP ${SYSSTR_VERSION} SYSSTR_VERSION)

project(sys_string VERSION ${SYSSTR_VERSION} LANGUAGES CXX)

find_package (Python3 COMPONENTS Interpreter Development)

add_subdirectory(lib)

# Regeneration of the Unicode tables.
#
# When a Python 3 interpreter was found, this sets up a build rule that runs
# unicode/scripts/genmappings.py over unicode/data and (re)writes the files
# listed in UNICODE_GENERATED_FILES. The rule re-runs whenever any of the
# globbed data files or generator scripts change. The sys_string target is
# made to depend on the generation target so the files are refreshed before
# the library is built. When no interpreter was found nothing is set up here.
if(Python3_Interpreter_FOUND)

    # Inputs of the generator: the Unicode data files and the scripts themselves.
    file(GLOB UNICODE_DATA "${CMAKE_CURRENT_LIST_DIR}/unicode/data/*.txt")
    file(GLOB UNICODE_SCRIPTS "${CMAKE_CURRENT_LIST_DIR}/unicode/scripts/*.py")

    # Outputs of the generator. The same list is passed, in this order, as the
    # trailing command line arguments of genmappings.py.
    set(UNICODE_GENERATED_FILES
        "${CMAKE_CURRENT_LIST_DIR}/lib/src/unicode_mappings.cpp"
        "${CMAKE_CURRENT_LIST_DIR}/lib/inc/sys_string/impl/unicode/mappings.h"
        "${CMAKE_CURRENT_LIST_DIR}/test/test_grapheme_data.h"
    )

    add_custom_command(
        COMMENT "Generating Unicode mappings"
        OUTPUT ${UNICODE_GENERATED_FILES}
        COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_LIST_DIR}/unicode/scripts/genmappings.py"
                "${CMAKE_CURRENT_LIST_DIR}/unicode/data"
                ${UNICODE_GENERATED_FILES}
        DEPENDS
            ${UNICODE_DATA}
            ${UNICODE_SCRIPTS}
        # Pass arguments to the interpreter unchanged regardless of platform shell.
        VERBATIM
    )

    # Named target that drives the custom command above.
    add_custom_target(generate_unicode_mappings
        DEPENDS
            ${UNICODE_GENERATED_FILES}
    )

    # Order the library build after generation.
    add_dependencies(sys_string generate_unicode_mappings)

endif()

if (PROJECT_IS_TOP_LEVEL)

include(lib/cmake/install.cmake)
Expand Down
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
[![License](https://img.shields.io/badge/license-BSD-brightgreen.svg)](https://opensource.org/licenses/BSD-3-Clause)
[![Tests](https://github.com/gershnik/sys_string/actions/workflows/test.yml/badge.svg)](https://github.com/gershnik/sys_string/actions/workflows/test.yml)

This library provides a C++ string class template `sys_string_t` that is optimized for **interoperability with external native string type**. It is **immutable**, **Unicode-first** and exposes convenient **operations similar to Python or ECMAScript strings**. It uses a separate `sys_string_builder_t` class template to construct strings. It provides fast concatenation via `+` operator that **does not allocate temporary strings**.
The library exposes bidirectional UTF-8/UTF-16/UTF-32 views of `sys_string_t` as well as of any C++ input ranges of characters.
This library provides a C++ string class template `sys_string_t` that is optimized for **interoperability with external native string types**. It is **immutable**, **Unicode-first** and exposes convenient **operations similar to Python or ECMAScript strings**. It uses a separate `sys_string_builder_t` class template to construct strings. It provides fast concatenation via `+` operator that **does not allocate temporary strings**.
The library exposes bidirectional UTF-8/UTF-16/UTF-32 and grapheme cluster views of `sys_string_t` as well as of other C++ ranges of characters.

## What does it mean?

Expand Down Expand Up @@ -38,11 +37,16 @@ of characters.

* **Concatenation does not allocate temporaries.** You can safely do things like `result = s1 + s2 + s3`. It will result in **one** memory allocation and 3 calls to `memcpy` to copy each of `s1`, `s2` and `s3` content into the final result. Not 2 allocations and 5 copies like in other languages or with `std::string`.

* **Bidirectional UTF-8/UTF-16/UTF-32 views**. You can view `sys_string_t` as a sequence of UTF-8/16/32 characters and iterate forward or __backward__ equally efficiently. Consider trying to find last instance of Unicode whitespace in UTF-8 data. Doing it as fast as finding the first instance is non-trivial. The views also work on any random access containers (C array, `std::array`, `std::vector`, `std::string`) of characters. Thus you can iterate in UTF-8 over `std::vector<char16_t>` etc.
* **Bidirectional UTF-8/UTF-16/UTF-32 views**. You can view `sys_string_t` as a sequence of UTF-8/16/32 characters and iterate forward or __backward__ equally efficiently. Consider trying to find last instance of Unicode whitespace in UTF-8 data. Doing it as fast as finding the first instance is non-trivial. The views also work on any C++ input ranges (C array, `std::array`, `std::vector`, `std::string` or even `std::ranges::istream_view`) of characters (`char`, `char8_t`, `char16_t`, `char32_t` and `wchar_t` on platforms where it is Unicode). Thus you can iterate in UTF-8 over `std::vector<char16_t>` etc.

* **Bidirectional grapheme cluster views**. Similarly you can also further view any of the UTF-8/UTF-16/UTF-32 views of `sys_string_t` as a sequence of
[grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) and iterate over them forward or __backward__ equally efficiently.
Consider the task of erasing the last user perceived "character" from a string. To do so correctly you need to erase the last _grapheme cluster_. Doing it and doing it fast is very non-trivial. This functionality also works on any C++ range of characters (but requires a _forward_ range).

## Why bother? Doesn't `std::string` work well?

An `std::string` storing UTF-8 (or `std::wstring` storing UTF-16 on Windows) works very well for some scenarios but fails miserably for others. `sys_string` class is an attempt to create something that works well in situations `std::string` would be a bad choice.

Specifically, `std::basic_string` is an STL container of a character type that owns its memory and controls it via a user-supplied allocator. These design choices make it very fast for direct character access but create the following problems:

* They rule out (efficient) reference-counted implementations, which means that when you copy an `std::string` instance it must copy its content. Some of the penalty of that is alleviated by modern [small string optimization](https://akrzemi1.wordpress.com/2014/04/14/common-optimizations/) but this is, at best, a band-aid. There are workarounds, of course, such as using `std::shared_ptr<std::string>` "when it matters" but they result in even more complexity for something that is quite fundamental to any data processing.
Expand Down
49 changes: 47 additions & 2 deletions doc/Usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
- [Iterating over string content](#iterating-over-string-content)
- [Storage iteration](#storage-iteration)
- [UTF iteration](#utf-iteration)
- [Helper: UTF iteration over any C++ character range](#helper-utf-iteration-over-any-c-character-range)
- [UTF iteration over any C++ character range](#utf-iteration-over-any-c-character-range)
- [Grapheme iteration](#grapheme-iteration)
- [Substrings](#substrings)
- [Accessing C strings](#accessing-c-strings)
- [Accessing storage as C array](#accessing-storage-as-c-array)
Expand Down Expand Up @@ -307,7 +308,7 @@ utf32_access::iterator first = access.reverse(access.rend());
```

### Helper: UTF iteration over any C++ character range
### UTF iteration over any C++ character range

Since the internal facility to perform UTF iteration is quite generic, this library exposes it to allow you to perform UTF iteration over any C++ input range of compatible characters (`char`, `char8_t`, `char16_t`, `char32_t`, and possibly `wchar_t` on platforms where it is encoded in UTF-16 or UTF-32). At the time of this writing there is ongoing work on adding something similar to the C++ standard library but, even if it is eventually approved, it will be a long time before it becomes available.

Expand Down Expand Up @@ -335,6 +336,50 @@ If your standard library supports user-supplied [range adapter closures](https:/
as_utf8(u"😀😜") | std::views::take(1) | ...
```
### Grapheme iteration
Sometimes even UTF-32 iteration is not what you need. Many user-perceived "characters" are actually composed of multiple
Unicode codepoints. The Unicode standard defines a [grapheme cluster](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries) as
what corresponds to a user's notion of a character. A single grapheme cluster, or grapheme for short, can contain one or more Unicode
codepoints.
This library allows you to easily iterate over grapheme clusters in `sys_string_t` content as well as in any C++
[forward_range](https://en.cppreference.com/w/cpp/ranges/forward_range) of compatible character type.
To iterate over graphemes you need to construct an instance of `grapheme_view` directly or use `graphemes` view adapter. In either case
you need to supply a **view** of characters to iterate over. The view can be a reference to `sys_string_t::char_access`, `sys_string_t::utfX_access`
or any other compatible forward view.
The "values" returned from `grapheme_view` are `std::ranges::subrange`s of the underlying view, each one covering a single grapheme.
To put it all in context here is how you can iterate over all graphemes in a `sys_string`.
```cpp
sys_string str = S("क्त्य"); //6 Unicode codepoints but one grapheme!
sys_string::char_access access(str);
for (auto grapheme_range: graphemes(access)) {
//grapheme_range is a subrange of sys_string::char_access::iterator
sys_string grapheme(grapheme_range);
}
```

A `grapheme_view` is reversible, that is, it can be iterated in both directions. Here is how to accomplish a common task -
safely removing the last "character" from a string (see [Substrings](#substrings) below for details on how to obtain parts of a string):

```cpp
sys_string str = S("abक्त्य");
sys_string::char_access access(str);
auto gr_view = graphemes(access);
if (auto rit = gr_view.rbegin(); rit != gr_view.rend()) {
auto grapheme = *rit;
str = sys_string(access.begin(), grapheme.begin());
}
assert(str == S("ab"));
```
You can easily extend this to removing any number of trailing characters.
## Substrings
You can obtain a substring of a `sys_string` in two ways:
Expand Down
20 changes: 1 addition & 19 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ if (NOT SYSSTR_VERSION)
endif()
project(sys_string VERSION ${SYSSTR_VERSION})

find_package (Python3 COMPONENTS Interpreter)

set(SRCDIR ${CMAKE_CURRENT_LIST_DIR})
set(LIBNAME sys_string)

Expand Down Expand Up @@ -64,6 +62,7 @@ PUBLIC
set(MAIN_CODE
${SRCDIR}/inc/sys_string/config.h
${SRCDIR}/inc/sys_string/utf_view.h
${SRCDIR}/inc/sys_string/grapheme_view.h
${SRCDIR}/inc/sys_string/sys_string.h
${SRCDIR}/inc/sys_string/impl/compare.h
${SRCDIR}/inc/sys_string/impl/hash.h
Expand Down Expand Up @@ -129,23 +128,6 @@ PRIVATE

add_library(${LIBNAME}::${LIBNAME} ALIAS ${LIBNAME})


# Regeneration of src/unicode_mappings.cpp.
#
# When a Python 3 interpreter was found, this sets up a build rule that runs
# scripts/genmappings.py over the res directory and (re)writes
# src/unicode_mappings.cpp. The rule re-runs whenever any of the globbed data
# files or generator scripts change.
if(Python3_Interpreter_FOUND)

    # Inputs of the generator: the data files and the scripts themselves.
    file(GLOB UNICODE_DATA "${SRCDIR}/res/*.txt")
    file(GLOB UNICODE_SCRIPTS "${SRCDIR}/scripts/*.py")

    add_custom_command(
        COMMENT "Generating Unicode mappings"
        OUTPUT "${SRCDIR}/src/unicode_mappings.cpp"
        COMMAND "${Python3_EXECUTABLE}" "${SRCDIR}/scripts/genmappings.py" "${SRCDIR}/res" "${SRCDIR}/src/unicode_mappings.cpp"
        DEPENDS
            ${UNICODE_DATA}
            ${UNICODE_SCRIPTS}
        # Pass arguments to the interpreter unchanged regardless of platform shell.
        VERBATIM
    )

endif()

if (PROJECT_IS_TOP_LEVEL)

include(cmake/install.cmake)
Expand Down
8 changes: 7 additions & 1 deletion lib/inc/sys_string/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@
#error Please define how to force inline for your compiler
#endif

//Portable spelling of the no_unique_address attribute.
//Builds where _MSC_VER is defined get the msvc:: vendor spelling,
//all other builds get the standard spelling.
#if defined(_MSC_VER)
#define SYS_STRING_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]]
#else
#define SYS_STRING_NO_UNIQUE_ADDRESS [[no_unique_address]]
#endif

//GCC up to 11.3 has a weird constexpr bug in some places
#if __GNUC__ > 11 || (__GNUC__ == 11 && __GNUC_MINOR__ > 2)
#define BUGGY_CONSTEXPR constexpr
Expand All @@ -105,7 +111,7 @@

//See https://github.com/llvm/llvm-project/issues/77773 for the sad story of how feature test
//macros are useless with libc++
#if __cpp_lib_format >= 201907L || (defined(_LIBCPP_VERSION) && __has_include(<format>))
#if __cpp_lib_format >= 201907L || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 170000 && __has_include(<format>))

#define SYS_STRING_SUPPORTS_STD_FORMAT 1

Expand Down
Loading

0 comments on commit e1edb6f

Please sign in to comment.