diff --git a/best/base/fwd.h b/best/base/fwd.h index aac83a5..b592807 100644 --- a/best/base/fwd.h +++ b/best/base/fwd.h @@ -41,15 +41,21 @@ class track_location; template class tlist; -// best/strings/rune.h +// best/text/rune.h class rune; -// best::str cannot be forward-declared because that depends on +// best::text cannot be forward-declared because that depends on // best::encoding being defined. // -// best/strings/str.h +// best/text/str.h // template -// class str; +// class text; + +// best/text/utf.h +struct utf8; +struct wtf8; +struct utf16; +struct utf32; // best/test/test.h class test; diff --git a/best/base/port.h b/best/base/port.h index ab67c3a..9e74140 100644 --- a/best/base/port.h +++ b/best/base/port.h @@ -111,16 +111,25 @@ inline constexpr bool is_debug() { /// be a string literal of the form `"-Wmy-diagnostic"`. #define BEST_IGNORE_GCC_DIAGNOSTIC(W_) BEST_PRAGMA(GCC diagnostic ignored W_) +/// # `BEST_LINK_NAME()` +/// +/// Specifies the linker symbol of a particular function declaration, overriding +/// the usual mangling. This should be placed after the argument list. +#define BEST_LINK_NAME(sym_) asm(sym_) + // HACK: Wait for BestFmt. 
-template -Os& operator<<(Os& os, const std::pair& pair) { - return os << "(" << pair.first << ", " << pair.second << ")"; -} template Os& operator<<(Os& os, std::byte b) { return os << "0x" << std::hex << int(b); } - +template +Os& operator<<(Os& os, const std::pair& pair) + requires requires { + { os << pair.first << pair.second }; + } +{ + return os << "(" << pair.first << ", " << pair.second << ")"; +} } // namespace best #endif // BEST_BASE_PORT_H_ \ No newline at end of file diff --git a/best/container/BUILD b/best/container/BUILD index 2317f6a..636df16 100644 --- a/best/container/BUILD +++ b/best/container/BUILD @@ -119,6 +119,7 @@ cc_library( ], deps = [ ":option", + "//best/memory:bytes", ], ) @@ -142,6 +143,7 @@ cc_library( "//best/math:bit", "//best/math:overflow", "//best/memory:allocator", + "//best/memory:bytes", "//best/memory:layout", "//best/meta:concepts", "//best/meta:init", diff --git a/best/container/object.h b/best/container/object.h index 5840835..0183022 100644 --- a/best/container/object.h +++ b/best/container/object.h @@ -359,7 +359,7 @@ class object_ptr final { /// This type wraps any `T` and reproduces its properties. The wrapped `T` can /// be accessed via `operator*` and `operator->`. template -class object { +class object final { public: /// # `object::wrapped_type` /// diff --git a/best/container/option.h b/best/container/option.h index 36c5cf1..23173b2 100644 --- a/best/container/option.h +++ b/best/container/option.h @@ -472,7 +472,11 @@ class option final { // TODO: BestFmt template - friend Os& operator<<(Os& os, const option& opt) { + friend Os& operator<<(Os& os, const option& opt) + requires best::void_type || requires { + { os << *opt }; + } + { if (!opt.has_value()) { return os << "none"; } else if constexpr (best::void_type) { @@ -600,7 +604,7 @@ Os& operator<<(Os& os, none_t opt) { inline constexpr best::option VoidOption{best::in_place}; // Forward declare span as soon as possible. 
-template > +template = best::none> class span; /// --- IMPLEMENTATION DETAILS BELOW --- diff --git a/best/container/pun_test.cc b/best/container/pun_test.cc index 02dd335..3ebcce5 100644 --- a/best/container/pun_test.cc +++ b/best/container/pun_test.cc @@ -63,7 +63,7 @@ best::test NonTrivial = [](auto& t) { best::test String = [](auto& t) { unsafe::in([&](auto u) { - best::pun s(best::index<0>, "hello..."); + best::pun s(best::index<0>, best::str("hello...")); t.expect_eq(s.get<0>(u), "hello..."); }); }; diff --git a/best/container/span.h b/best/container/span.h index cc9c3db..b30f7bb 100644 --- a/best/container/span.h +++ b/best/container/span.h @@ -13,6 +13,7 @@ #include "best/container/option.h" #include "best/log/location.h" #include "best/math/overflow.h" +#include "best/memory/bytes.h" #include "best/meta/concepts.h" #include "best/meta/init.h" #include "best/meta/ops.h" @@ -117,7 +118,9 @@ inline constexpr best::option span_extent = /// /// Unfortunately, it is not possible to make `best::span` work when `T` is /// not an object type. -template n = best::none> +template n /* = best::none */> class span final { public: /// Helper type aliases. @@ -235,6 +238,25 @@ class span final { requires is_const : span(std::data(il), std::size(il), loc) {} + /// # `span::from_nul()` + /// + /// Constructs a new span pointing to a NUL-terminated string (i.e., a string + /// of span elements, the last of which is zero). + /// + /// If `data` is null, returns an empty span. + /// + /// If this is a fixed-width span, this will perform the usual fatal bounds + /// check upon construction. 
+ constexpr static span from_nul(T* data) { + if (data == nullptr) return {data, 0}; + + auto ptr = data; + while (*ptr++ != T{0}) + ; + + return best::span(data, ptr - data - 1); + } + private: template static constexpr best::option minus = @@ -581,14 +603,14 @@ class span final { constexpr void copy_from(best::span src) const requires(!is_const) { - size_t to_copy = std::min({size(), src.size()}); if (!std::is_constant_evaluated() && best::same && best::copyable) { - std::memcpy(data(), src.data(), to_copy * size_of); + best::copy_bytes(*this, src); return; } unsafe::in([&](auto u) { + size_t to_copy = best::min(size(), src.size()); for (size_t i = 0; i < to_copy; ++i) { at(u, i) = src.at(u, i); } @@ -602,13 +624,13 @@ class span final { constexpr void emplace_from(best::span src) const requires(!is_const) { - size_t to_copy = std::min({size(), src.size()}); if (!std::is_constant_evaluated() && best::same && best::copyable) { - std::memcpy(data(), src.data(), to_copy * size_of); + best::copy_bytes(*this, src); return; } + size_t to_copy = best::min(size(), src.size()); for (size_t i = 0; i < to_copy; ++i) { (data() + i).copy_from(src.data() + i, false); } @@ -656,7 +678,11 @@ class span final { // TODO: BestFmt template - friend Os& operator<<(Os& os, span sp) { + friend Os& operator<<(Os& os, span sp) + requires requires { + { os << sp[0] }; + } + { os << "["; bool first = true; for (auto&& value : sp) { @@ -669,31 +695,9 @@ class span final { } // All spans are comparable. - template m> + template m> constexpr bool operator==(best::span that) const - requires best::equatable - { - if (size() != that.size()) { - return false; - } - - if constexpr (best::can_memcmp && best::can_memcmp && - best::same) { - return data() == that.data() || // Optimize for the case where we are - // comparing a span to itself! 
- std::memcmp(data().raw(), that.data().raw(), - size() * size_of) == 0; - } - - for (size_t i = 0; i < size(); ++i) { - if (data()[i] != that.data()[i]) { - return false; - } - } - - return true; - } - + requires best::equatable; template constexpr bool operator==(const R& range) const requires best::equatable && (!is_span) @@ -701,37 +705,9 @@ class span final { return *this == best::span(range); } - template m> - constexpr best::order_type operator<=>(best::span that) const { - if constexpr (best::can_memcmp && best::can_memcmp && - best::same) { - if (data() == that.data()) { - return size() <=> that.size(); - } - - size_t prefix = std::min({size(), that.size()}); - int result = - std::memcmp(data().raw(), that.data().raw(), prefix * size_of); - if (result < 0) { - return std::strong_ordering::less; - } else if (result > 0) { - return std::strong_ordering::greater; - } else { - return size() <=> that.size(); - } - } - - size_t prefix = std::min({size(), that.size()}); - - for (size_t i = 0; i < prefix; ++i) { - if (auto result = data()[i] <=> that.data()[i]; result != 0) { - return result; - } - } - - return size() <=> that.size(); - } - + template m> + constexpr auto operator<=>(best::span that) const + requires best::comparable; template constexpr auto operator<=>(const R& range) const requires best::comparable && (!is_span) @@ -747,6 +723,18 @@ class span final { span_internal::repr repr_; }; +template +span(T*, size_t) -> span; +template +span(best::object_ptr, size_t) -> span; +template +span(std::initializer_list) -> span; +template +span(R&& r) -> span, + best::static_size>; + +// --- IMPLEMENTATION DETAILS BELOW --- + template n> struct span::iter final { constexpr iter() = default; @@ -798,15 +786,44 @@ inline constexpr size_t BestStaticSize(auto, std::array*) { return n; } -template -span(T*, size_t) -> span; -template -span(best::object_ptr, size_t) -> span; -template -span(std::initializer_list) -> span; -template -span(R&& r) -> span, - 
best::static_size>; +template n> +template m> +constexpr bool span::operator==(span that) const + requires best::equatable +{ + if (size() != that.size()) return false; + if constexpr (best::byte_comparable) { + return best::equate_bytes(span(*this), span(that)); + } + + for (size_t i = 0; i < size(); ++i) { + if (data()[i] != that.data()[i]) { + return false; + } + } + + return true; +} + +template n> +template m> +constexpr auto span::operator<=>(span that) const + requires best::comparable +{ + if constexpr (best::byte_comparable) { + return best::order_type( + best::compare_bytes(span(*this), span(that))); + } + + size_t prefix = best::min(size(), that.size()); + for (size_t i = 0; i < prefix; ++i) { + if (auto result = data()[i] <=> that.data()[i]; result != 0) { + return result; + } + } + + return best::order_type(size() <=> that.size()); +} template n> constexpr void span::shift_within(unsafe u, size_t dst, size_t src, @@ -847,7 +864,8 @@ constexpr void span::shift_within(unsafe u, size_t dst, size_t src, if (src + count <= dst || dst + count <= src) { // Non-overlapping case. if constexpr (best::relocatable) { - std::memcpy(data() + dst, data() + src, count * size_of); + best::copy_bytes(at(u, {.start = dst, .count = count}), + at(u, {.start = src, .count = count})); at(u, {.start = src, .count = count}).destroy_in_place(); return; } @@ -858,7 +876,8 @@ constexpr void span::shift_within(unsafe u, size_t dst, size_t src, } else if (src < dst && dst < src + count) { // Forward case. if constexpr (best::relocatable) { - std::memmove(data() + dst, data() + src, count * size_of); + best::copy_overlapping_bytes(at(u, {.start = dst, .count = count}), + at(u, {.start = src, .count = count})); at(u, {.start = src, .end = dst}).destroy_in_place(); return; } @@ -878,7 +897,8 @@ constexpr void span::shift_within(unsafe u, size_t dst, size_t src, } else if (dst < src && src < dst + count) { // Backward case. 
if constexpr (best::relocatable) { - std::memmove(data() + dst, data() + src, count * size_of); + best::copy_overlapping_bytes(at(u, {.start = dst, .count = count}), + at(u, {.start = src, .count = count})); at(u, {.start = dst + count, .end = src + count}).destroy_in_place(); return; } diff --git a/best/container/span_test.cc b/best/container/span_test.cc index 19cb7a4..c6914be 100644 --- a/best/container/span_test.cc +++ b/best/container/span_test.cc @@ -63,6 +63,13 @@ best::test InitList = [](auto& t) { cb({1, 2, 3}); }; +best::test FromNul = [](auto& t) { + int ints[] = {1, 2, 3, 0, 4, 5, 6, 0}; + auto span = best::span::from_nul(ints); + t.expect_eq(span, best::span{1, 2, 3}); + t.expect_eq(best::span::from_nul(nullptr), best::span{}); +}; + best::test Ordering = [](auto& t) { int32_t ints[] = {1, 2, 3}; int64_t longs[] = {1, 2, 3}; diff --git a/best/container/vec.h b/best/container/vec.h index 6db7c24..b6f6d86 100644 --- a/best/container/vec.h +++ b/best/container/vec.h @@ -1,7 +1,6 @@ #ifndef BEST_CONTAINER_VEC_H_ #define BEST_CONTAINER_VEC_H_ -#include #include #include #include @@ -830,7 +829,7 @@ void vec::spill_to_heap(best::option capacity_hint) { return; } - size_t new_size = std::max({capacity(), capacity_hint.value_or(0)}); + size_t new_size = best::max(capacity(), capacity_hint.value_or(0)); // Always snap to a power of 2. 
if (!best::is_pow2(new_size)) { new_size = best::next_pow2(new_size); diff --git a/best/math/BUILD b/best/math/BUILD index 3360629..a14ad35 100644 --- a/best/math/BUILD +++ b/best/math/BUILD @@ -2,7 +2,10 @@ package(default_visibility = ["//visibility:public"]) cc_library( name = "int", - hdrs = ["int.h"], + hdrs = [ + "int.h", + "internal/common_int.h", + ], deps = [ "//best/base:fwd", "//best/meta:concepts", @@ -21,10 +24,7 @@ cc_test( cc_library( name = "overflow", - hdrs = [ - "overflow.h", - "internal/common_int.h", - ], + hdrs = ["overflow.h"], deps = [ ":int", "//best/base:fwd", diff --git a/best/math/int.h b/best/math/int.h index 4b5d90b..42f9ca9 100644 --- a/best/math/int.h +++ b/best/math/int.h @@ -9,6 +9,8 @@ #include "best/base/fwd.h" #include "best/base/hint.h" +#include "best/base/port.h" +#include "best/math/internal/common_int.h" #include "best/meta/concepts.h" //! Utilities for working with primitive integer types. @@ -44,6 +46,7 @@ inline constexpr Int min_of = std::numeric_limits::min(); /// The maximum value for a particular integer type. template inline constexpr Int max_of = std::numeric_limits::max(); + /// # `best::signed_int` /// /// Any primitive signed integer. @@ -123,6 +126,67 @@ BEST_INLINE_ALWAYS constexpr best::option checked_cast(integer auto x) { return x; } +/// # `best::common_int<...>` +/// +/// Computes a "common int" type among the given integers. +/// +/// This is defined to be the largest integer type among them. If any of them +/// are unsigned, the type is also unsigned. +template +using common_int = decltype(best::int_internal::common>()); + +/// # `best::min()` +/// +/// Computes the minimum from a collection of signed or unsigned integers. +template +BEST_INLINE_ALWAYS constexpr best::common_int min(Ints... args) + requires(sizeof...(args) > 0) +{ + BEST_PUSH_GCC_DIAGNOSTIC() + BEST_IGNORE_GCC_DIAGNOSTIC("-Wunused-value") + best::common_int output = (args, ...); + BEST_POP_GCC_DIAGNOSTIC() + ((output > args ? 
output = args : 0), ...); + return output; +} +template +BEST_INLINE_ALWAYS constexpr best::common_int min(Ints... args) + requires(sizeof...(args) > 0) +{ + BEST_PUSH_GCC_DIAGNOSTIC() + BEST_IGNORE_GCC_DIAGNOSTIC("-Wunused-value") + best::common_int output = (args, ...); + BEST_POP_GCC_DIAGNOSTIC() + ((output > args ? output = args : 0), ...); + return output; +} + +/// # `best::max()` +/// +/// Computes the maximum from a collection of signed or unsigned integers. +template +BEST_INLINE_ALWAYS constexpr best::common_int max(Ints... args) + requires(sizeof...(args) > 0) +{ + BEST_PUSH_GCC_DIAGNOSTIC() + BEST_IGNORE_GCC_DIAGNOSTIC("-Wunused-value") + best::common_int output = (args, ...); + BEST_POP_GCC_DIAGNOSTIC() + ((output < args ? output = args : 0), ...); + return output; +} +template +BEST_INLINE_ALWAYS constexpr best::common_int max(Ints... args) + requires(sizeof...(args) > 0) +{ + BEST_PUSH_GCC_DIAGNOSTIC() + BEST_IGNORE_GCC_DIAGNOSTIC("-Wunused-value") + best::common_int output = (args, ...); + BEST_POP_GCC_DIAGNOSTIC() + ((output < args ? output = args : 0), ...); + return output; +} + /// # `best::smallest_unsigned` /// /// Computes the smallest unsigned integer type that can represent `n`. diff --git a/best/math/overflow.h b/best/math/overflow.h index 2386284..087ce72 100644 --- a/best/math/overflow.h +++ b/best/math/overflow.h @@ -7,20 +7,11 @@ #include "best/log/internal/crash.h" #include "best/log/location.h" #include "best/math/int.h" -#include "best/math/internal/common_int.h" #include "best/meta/ops.h" //! Overflow-detection utilities. namespace best { -/// # `best::common_int<...>` -/// -/// Computes a "common int" type among the given integers. -/// -/// This is defined to be the larges integer type among them. If any of them -/// are unsigned, the type is also unsigned. 
-template -using common_int = decltype(best::int_internal::common>()); template struct overflow; diff --git a/best/memory/BUILD b/best/memory/BUILD index ab62912..04dedba 100644 --- a/best/memory/BUILD +++ b/best/memory/BUILD @@ -33,4 +33,17 @@ cc_library( ":layout", "//best/meta:init", ] +) + +cc_library( + name = "bytes", + hdrs = [ + "bytes.h", + "internal/bytes.h", + ], + srcs = ["bytes.cc"], + deps = [ + "//best/container:option", + "//best/meta:concepts", + ] ) \ No newline at end of file diff --git a/best/memory/allocator.cc b/best/memory/allocator.cc index 875eb0d..83723b1 100644 --- a/best/memory/allocator.cc +++ b/best/memory/allocator.cc @@ -1,6 +1,5 @@ #include "best/memory/allocator.h" -#include #include #include @@ -52,7 +51,7 @@ void* malloc::realloc(void* ptr, best::layout old, best::layout layout) { } void* p = alloc(layout); - size_t common = std::min({old.size(), layout.size()}); + size_t common = best::min(old.size(), layout.size()); std::memcpy(p, ptr, common); return p; } diff --git a/best/memory/bytes.cc b/best/memory/bytes.cc new file mode 100644 index 0000000..4ca4bf8 --- /dev/null +++ b/best/memory/bytes.cc @@ -0,0 +1,22 @@ +namespace best { +extern "C" { +// `best` assumes your libc is competent enough to have `memmem`. If not, +// here is a quadratic fallback you can enable if necessary. 
+#if 0 +void* memmem(const void* a, size_t an, const void* b, size_t bn) noexcept { + if (bn == 0) return const_cast(a); + + const char* ap = reinterpret_cast(a); + const char* bp = reinterpret_cast(b); + while (an >= bn && an > 0) { + if (bytes_internal::memcmp(ap, bp, bn) == 0) { + return const_cast(ap); + } + ++ap; + --an; + } + return nullptr; +} +#endif +} +} // namespace best \ No newline at end of file diff --git a/best/memory/bytes.h b/best/memory/bytes.h new file mode 100644 index 0000000..c70014b --- /dev/null +++ b/best/memory/bytes.h @@ -0,0 +1,129 @@ +#ifndef BEST_MEMORY_BYTES_H_ +#define BEST_MEMORY_BYTES_H_ + +#include "best/base/hint.h" +#include "best/container/option.h" +#include "best/memory/internal/bytes.h" + +//! Raw byte manipulation functions. +//! +//! This header provides convenient and type-safe versions of the `mem*()` +//! functions. + +namespace best { +/// # `best::byte_comparable` +/// +/// Whether a pair of types' equality is modeled by `memcmp()`. +template +concept byte_comparable = + (std::is_integral_v || std::is_enum_v || + std::is_pointer_v)&&(std::is_integral_v || std::is_enum_v || + std::is_pointer_v)&&sizeof(T) == sizeof(U); + +/// # `best::copy_bytes()` +/// +/// A typed wrapper over `memcpy()`. This will copy the largest common prefix of +/// the spans. +template +BEST_INLINE_ALWAYS void copy_bytes(best::span dst, best::span src) + requires(sizeof(T) == sizeof(U)) +{ + auto to_copy = best::min(dst.size(), src.size()) * sizeof(T); + if (to_copy == 0) return; + bytes_internal::memcpy(dst.data(), src.data(), to_copy); +} + +/// # `best::copy_overlapping_bytes()` +/// +/// A typed wrapper over `memmove()`. This will copy the largest common prefix +/// of the spans. 
+template +BEST_INLINE_ALWAYS void copy_overlapping_bytes(best::span dst, + best::span src) + requires(sizeof(T) == sizeof(U)) +{ + auto to_copy = best::min(dst.size(), src.size()) * sizeof(T); + if (to_copy == 0) return; + bytes_internal::memmove(dst.data(), src.data(), to_copy); +} + +/// # `best::fill_bytes()` +/// +/// A typed wrapper over `memset()`. +template +BEST_INLINE_ALWAYS void fill_bytes(best::span dst, uint8_t fill) { + if (dst.is_empty()) return; + bytes_internal::memset(dst.data(), fill, dst.size() * sizeof(T)); +} + +/// # `best::equate_bytes()` +/// +/// A typed wrapper over `memcmp()` that is optimized for performing equality +/// comparisons. +template +BEST_INLINE_ALWAYS bool equate_bytes(best::span lhs, best::span rhs) + requires(sizeof(T) == sizeof(U)) +{ + if (lhs.size() != rhs.size()) return false; + if (lhs.data() == rhs.data() || lhs.is_empty()) return true; + + return 0 == + bytes_internal::memcmp(lhs.data(), rhs.data(), lhs.size() * sizeof(T)); +} + +/// # `best::compare_bytes()` +/// +/// A typed wrapper over `memcmp()`. This performs total lexicographic +/// comparison between two spans. +template +BEST_INLINE_ALWAYS std::strong_ordering compare_bytes(best::span lhs, + best::span rhs) + requires(sizeof(T) == sizeof(U)) +{ + if (lhs.data() == rhs.data() || lhs.is_empty() || rhs.is_empty()) { + return lhs.size() <=> rhs.size(); + } + + auto to_compare = best::min(lhs.size(), rhs.size()) * sizeof(T); + int result = bytes_internal::memcmp(lhs.data(), rhs.data(), to_compare); + if (result == 0) { + return lhs.size() <=> rhs.size(); + } + return result <=> 0; +} + +/// # `best::search_bytes()` +/// +/// A typed wrapper over `memmem()`, a GNU extension that is the `mem` version +/// of `strstr()`. This finds the index of the first occurrence of `needle` in +/// `haystack`. 
+/// +/// Note that this will find the first *aligned* index, which is to say that +/// calls to `memmem()` will continue until it either fails or returns an +/// aligned index. +template +BEST_INLINE_ALWAYS best::option search_bytes(best::span haystack, + best::span needle) + requires(sizeof(T) == sizeof(U)) +{ + auto* data = + reinterpret_cast(static_cast(haystack.data())); + size_t size = haystack.size() * sizeof(T); + size_t travel = 0; + + do { + void* found = bytes_internal::memmem(data, size, needle.data(), + needle.size() * sizeof(T)); + if (!found) return best::none; + + size_t offset = static_cast(found) - data; + travel += offset; + data += offset; + size -= offset; + } while (travel % alignof(T) != 0); + + return travel / sizeof(T); +} +} // namespace best + +#endif // BEST_MEMORY_BYTES_H_ \ No newline at end of file diff --git a/best/memory/internal/bytes.h b/best/memory/internal/bytes.h new file mode 100644 index 0000000..0ba072f --- /dev/null +++ b/best/memory/internal/bytes.h @@ -0,0 +1,16 @@ +#ifndef BEST_MEMORY_INTERNAL_BYTES_H_ +#define BEST_MEMORY_INTERNAL_BYTES_H_ + +#include + +namespace best::bytes_internal { +extern "C" { +void* memcpy(void*, const void*, size_t) noexcept; +void* memmove(void*, const void*, size_t) noexcept; +void* memmem(const void*, size_t, const void*, size_t) noexcept; +void* memset(void*, int, size_t) noexcept; +int memcmp(const void*, const void*, size_t) noexcept; +} // extern "C" +} // namespace best::bytes_internal + +#endif // BEST_MEMORY_INTERNAL_BYTES_H_ \ No newline at end of file diff --git a/best/meta/BUILD b/best/meta/BUILD index c0561c0..08da9b0 100644 --- a/best/meta/BUILD +++ b/best/meta/BUILD @@ -44,6 +44,16 @@ cc_test( ] ) +cc_library( + name = "ebo", + hdrs = ["ebo.h"], + deps = [ + ":concepts", + ":init", + ":tags", + ] +) + cc_library( name = "tlist", hdrs = [ diff --git a/best/meta/concepts.h b/best/meta/concepts.h index f9838ea..812bf22 100644 --- a/best/meta/concepts.h +++ b/best/meta/concepts.h @@ 
-101,11 +101,6 @@ concept abominable_func_type = std::is_function_v; template concept qualifies_to = same || same || same || same; - -/// Whether a type T can be safely memcmp'd. -template -concept can_memcmp = - std::is_integral_v || std::is_enum_v || std::is_pointer_v; } // namespace best #endif // BEST_META_CONCEPTS_H_ \ No newline at end of file diff --git a/best/meta/ebo.h b/best/meta/ebo.h new file mode 100644 index 0000000..96b32cd --- /dev/null +++ b/best/meta/ebo.h @@ -0,0 +1,93 @@ +#ifndef BEST_META_EBO_H_ +#define BEST_META_EBO_H_ + +#include + +#include + +#include "best/meta/concepts.h" +#include "best/meta/init.h" +#include "best/meta/tags.h" + +//! A helper for the empty base class optimization. + +namespace best { +/// # `best::ebo` +/// +/// A wrapper type over a `T` that is an empty type if `T` is empty. This allows +/// performing the empty base optimization even if `T` is `final`. However, +/// it must be trivially relocatable. +/// +/// The "canonical" way to use this type is as follows: +/// +/// ``` +/// template +/// class MyClass : best::ebo { +/// private: +/// using value_ = best::ebo +/// +/// public: +/// void frob() { value_::get().frob(); } +/// } +/// ``` +/// +/// Doing a CRTP-type here ensures that this base is reasonably unique among +/// other potential bases of `MyClass`. +template && best::relocatable> +class ebo /* not final! */ { + public: + /// # `ebo::ebo(...)` + /// + /// Constructs a new `ebo` by calling the constructor of the wrapped type. + constexpr ebo(best::in_place_t, auto&&... args) : value_(BEST_FWD(args)...) {} + + constexpr ebo() + requires best::constructible + = default; + constexpr ebo(const ebo&) = default; + constexpr ebo& operator=(const ebo&) = default; + constexpr ebo(ebo&&) = default; + constexpr ebo& operator=(ebo&&) = default; + + /// # `ebo::get()` + /// + /// Returns the wrapped value. 
+ constexpr const T& get() const { return value_; } + constexpr T& get() { return value_; } + + private: + T value_; +}; + +template +class ebo /* not final! */ { + public: + constexpr ebo(best::in_place_t, auto&&... args) { + (void)T(BEST_FWD(args)...); + } + + constexpr ebo() + requires best::constructible + = default; + constexpr ebo() + requires(best::constructible && !best::constructible) + { + (void)T(); + }; + constexpr ebo(const ebo&) = default; + constexpr ebo& operator=(const ebo&) = default; + constexpr ebo(ebo&&) = default; + constexpr ebo& operator=(ebo&&) = default; + + constexpr const T& get() const { return value_; } + constexpr T& get() { return value_; } + + private: + inline static T value_; +}; + +} // namespace best + +#endif // BEST_META_EBO_H_ \ No newline at end of file diff --git a/best/meta/internal/ops.h b/best/meta/internal/ops.h index 58f1632..b30d675 100644 --- a/best/meta/internal/ops.h +++ b/best/meta/internal/ops.h @@ -121,31 +121,31 @@ BEST_INLINE_SYNTHETIC constexpr auto run(tag, auto&& func, } template -BEST_INLINE_SYNTHETIC constexpr auto call(F Class::*member, Class&& self, +BEST_INLINE_SYNTHETIC constexpr auto call(F Class::*member, auto&& self, auto&&... args) - -> decltype(self.*member(BEST_FWD(args)...)) + -> decltype((self.*member)(BEST_FWD(args)...)) requires(std::is_function_v && sizeof...(Args) == 0) { return (BEST_FWD(self).*member)(BEST_FWD(args)...); } template -BEST_INLINE_SYNTHETIC constexpr auto call(F Class::*member, Class* self, +BEST_INLINE_SYNTHETIC constexpr auto call(F Class::*member, auto* self, auto&&... 
args) - -> decltype(self->*member(BEST_FWD(args)...)) + -> decltype((self->*member)(BEST_FWD(args)...)) requires(std::is_function_v && sizeof...(Args) == 0) { return (self->*member)(BEST_FWD(args)...); } template -BEST_INLINE_SYNTHETIC constexpr auto call(R Class::*member, Class&& self) +BEST_INLINE_SYNTHETIC constexpr auto call(R Class::*member, auto&& self) -> decltype(self.*member) requires(!std::is_function_v && sizeof...(Args) == 0) { return BEST_FWD(self).*member; } template -BEST_INLINE_SYNTHETIC constexpr auto call(R Class::*member, Class* self) +BEST_INLINE_SYNTHETIC constexpr auto call(R Class::*member, auto* self) -> decltype(self->*member) requires(!std::is_function_v && sizeof...(Args) == 0) { diff --git a/best/strings/encoding.h b/best/strings/encoding.h deleted file mode 100644 index 7f360ec..0000000 --- a/best/strings/encoding.h +++ /dev/null @@ -1,262 +0,0 @@ -#ifndef BEST_STRINGS_ENCODING_H_ -#define BEST_STRINGS_ENCODING_H_ - -#include -#include -#include -#include - -#include "best/container/option.h" -#include "best/container/span.h" -#include "best/meta/init.h" - -//! Text encodings. -//! -//! best::str and friends are encoding agnostic: they are always sequences of -//! Unicode characters, but that sequence may be encoded in more than one way. -//! This header provides concepts and types for working with encodings. -//! -//! See utf.h for examples of encodings. - -namespace best { -/// # `best::encoding` -/// -/// A text encoding. -/// -/// A text encoding is any type that fulfills the "Lucky 7" encoding API -/// from ztd.text. -/// -template > -concept encoding = requires(const E& e) { - /// Required type aliases. - typename E::code; - typename E::state; - - /// Required constants. - { E::MaxCodesPerRune } -> std::convertible_to; - - /// The state type must be constructible from const E&, as well - /// as copyable. 
- requires best::constructible || - best::constructible; - requires best::copyable; - - /// The encoding type itself must be equality comparable, as must its state. - requires std::equality_comparable; - requires std::equality_comparable; - - requires requires(typename E::state& state, - best::span& output, rune r) { - { e.write_rune(state, output, r) } -> std::convertible_to; - }; - - requires requires(typename E::state& state, - best::span& input) { - { e.read_rune(state, input) } -> std::convertible_to>; - }; -}; - -/// # `best::self_syncing_encoding` -/// -/// A self-synchronizing encoding, i.e., an encoding which can continue -/// encoding/decoding despite errors. -/// -/// An encoding can advertise it is self-syncing by defining a type alias -/// `self_syncing`. -template -concept self_syncing_encoding = - encoding && requires { typename E::self_syncing; }; - -/// # `best::stateless_encoding` -/// -/// A stateless encoding, which allows performing decoding operations are -/// arbitrary positions within a stream. -/// -/// This is determined by the encoding and its state being empty classes. -template -concept stateless_encoding = self_syncing_encoding && std::is_empty_v && - std::is_empty_v; - -/// # `best::string_type` -/// -/// A string type: a contiguous range that defines the `BestEncoding()` FTADLE -/// and whose data pointer matches that encoding. -template -concept string_type = best::contiguous && requires(const T& value) { - { BestEncoding(best::types, value) } -> best::encoding; - { - std::data(value) - } -> best::same, value))>::code*>; -}; - -/// # `best::get_encoding()` -/// -/// Extracts the encoding out of a string type. -constexpr const auto& get_encoding(const string_type auto& string) { - return BestEncoding(best::types, string); -} - -/// # `best::encoder` -/// -/// A stateful wrapper over some best::encoding for encoding/decoding from one -/// stream to another. 
-template -class encoder { - private: - /// Make `rune` a dependent type to make it useable in fwd-declared position - /// on constexpr functions. - template - using rune = std::conditional_t; - - public: - /// # `encoder::code` - /// - /// The code unit for this encoding. This is the element type of an encoded - /// stream. - using code = E::code; - - /// # `encoder::state` - /// - /// Any state necessary to save between indivisible decoding steps. - using state = E::state; - - /// # `encoder::MaxCodesPerRune` - /// - /// The maximum number of code units `write_rune()` will write. - static constexpr size_t MaxCodesPerRune = E::MaxCodesPerRune; - - /// # `encoder::is_self_syncing()` - /// - /// Whether this encoding is self-synchronizing. - static constexpr bool is_self_syncing() { return self_syncing_encoding; } - - /// # `encoder::is_stateless()` - /// - /// Whether this encoding is stateless, i.e., whether its state type is empty. - static constexpr bool is_stateless() { return stateless_encoding; } - - /// # `encoder::encoder()` - /// - /// Constructs the singleton encoder if this encoding is totally stateless. - constexpr encoder() - requires(is_stateless()); - - /// # `encoder::encoder(encoder)` - /// - /// Delegates copy/move to `E`. - constexpr encoder(const encoder&) = default; - constexpr encoder& operator=(const encoder&) = default; - constexpr encoder(encoder&&) = default; - constexpr encoder& operator=(encoder&&) = default; - - /// # `encoder::encoder(encoding)` - /// - /// Constructs a new encoder for the given encoding. - constexpr explicit encoder(const E& encoding) - requires best::constructible && - (!best::constructible) - : encoding_(std::addressof(encoding)), state_() {} - - constexpr explicit encoder(const E& encoding) - requires best::constructible - : encoding_(std::addressof(encoding)), state_(encoding) {} - - /// # `encoder::validate()` - /// - /// Validates whether a span of code units is correctly encoded. 
- static constexpr bool validate(const E& e, best::span input) { - encoder enc(e); - - while (!input.is_empty()) { - if (enc.read_rune(&input).is_empty()) { - return false; - } - } - - return true; - } - - /// # `encoder::size()` - /// - /// Computes the would-be-encoded size from calling write_rune(). - constexpr best::option size(rune<> rune) const { - code buf[MaxCodesPerRune]; - - auto copy = *this; - if (auto encoded = copy.write_rune(buf, rune)) { - return encoded->size(); - } - return best::none; - } - - /// # `encoder::write_rune()` - /// - /// Performs a single indivisible encoding operation. - /// - /// Returns the part of `output` written to. If `output` is passed by pointer - /// rather than by value, it is automatically advanced. - /// - /// Returns `best::none` on failure; in this case, `output` is not advanced. - constexpr best::option> write_rune(best::span* output, - rune<> rune) { - auto out0 = *output; - - if (encoding_->write_rune(state_, *output, rune)) { - size_t written = out0.size() - output->size(); - return out0[{.count = written}]; - } - - *output = out0; - return best::none; - } - constexpr best::option> write_rune(best::span output, - rune<> rune) { - return write_rune(&output, rune); - } - - /// # `encoder::read_rune()` - /// - /// Performs a single indivisible decoding operation. - /// - /// Returns the decoded rune. If `input` is passed by pointer rather than by - /// value, it is automatically advanced. - /// - /// Returns `best::none` on failure; in this case, `input` is not advanced. 
- constexpr best::option> read_rune(best::span* input) { - auto in0 = *input; - - if (auto encoded = encoding_->read_rune(state_, *input)) { - return encoded; - } - - *input = in0; - return best::none; - } - constexpr best::option> read_rune(best::span input) { - return read_rune(&input); - } - - constexpr bool operator==(const encoder& it) const = default; - - private: - const E* encoding_; - [[no_unique_address]] state state_; -}; - -template -encoder(const E&) -> encoder; - -namespace encoding_internal { -template -inline constexpr E singleton; -} // namespace encoding_internal - -template -constexpr encoder::encoder() - requires(is_stateless()) - : encoder(encoding_internal::singleton) {} - -} // namespace best - -#endif // BEST_STRINGS_ENCODING_H_ \ No newline at end of file diff --git a/best/strings/rune.h b/best/strings/rune.h deleted file mode 100644 index 77b79f9..0000000 --- a/best/strings/rune.h +++ /dev/null @@ -1,142 +0,0 @@ -#ifndef BEST_STRINGS_RUNE_H_ -#define BEST_STRINGS_RUNE_H_ - -#include -#include - -#include "best/container/option.h" -#include "best/container/span.h" -#include "best/strings/encoding.h" -#include "best/strings/internal/utf.h" - -//! Unicode characters. -//! -//! best::rune is a Unicode character type, specifically, a Unicode Scalar -//! Value[1]. It is the entry-point to best's Unicode library. -//! -//! [1]: https://www.unicode.org/glossary/#unicode_scalar_value - -namespace best { -/// # `best::rune` -/// -/// A Unicode scalar value, called a "rune" in the p9 tradition. -/// -/// This value corresponds to a valid Unicode scalar value, which may -/// potentially be an unpaired surrogate. This is to allow encodings that allow -/// unpaired surrogates, such as WTF-8, to produce best::runes. 
-class rune final { - private: - static constexpr bool in_range(uint32_t value) { return value < 0x11'0000; } - static constexpr bool is_surrogate(uint32_t value) { - return value >= 0xd800 && value < 0xe000; - } - - public: - /// # `rune::replacement()` - /// - /// Returns the Unicode replacement character. - static constexpr rune replacement() { return 0xfffd; } - - /// # `rune::rune()` - /// - /// Creates a new rune corresponding to NUL. - constexpr rune() = default; - - /// # `rune::rune(rune)` - /// - /// Trivially copyable. - constexpr rune(const rune&) = default; - constexpr rune& operator=(const rune&) = default; - constexpr rune(rune&&) = default; - constexpr rune& operator=(rune&&) = default; - - /// # `rune::rune(int)` - /// - /// Creates a new rune from an integer. - /// - /// The integer must be a constant, and it must be a valid Unicode scalar - /// value, and *not* an unpaired surrogate. - constexpr rune(uint32_t value) BEST_ENABLE_IF_CONSTEXPR(value) - BEST_ENABLE_IF(in_range(value) && !is_surrogate(value), - "rune value not within the valid Unicode range") - : value_(value) {} - - /// # `rune::from_int()` - /// - /// Parses a rune from an integer. - /// Returns `best::none` if this integer is not in the Unicode scalar value - /// range. - constexpr static best::option from_int(uint32_t value) { - if (!in_range(value) || is_surrogate(value)) return best::none; - return rune(best::in_place, value); - } - constexpr static best::option from_int(int32_t value) { - return from_int(static_cast(value)); - } - - /// # `rune::from_int_allow_surrogates()` - /// - /// Like `rune::from_int()`, but allows unpaired surrogates. 
- constexpr static best::option from_int_allow_surrogates( - uint32_t value) { - if (!in_range(value)) return best::none; - return rune(best::in_place, value); - } - constexpr static best::option from_int_allow_surrogates(int32_t value) { - return from_int_allow_surrogates(static_cast(value)); - } - - /// # `rune::to_int()` - /// - /// Converts this rune into the underlying 32-bit integer. - constexpr uint32_t to_int() const { return value_; } - constexpr operator uint32_t() const { return value_; } - - /// # `rune::is_unpaired_surrogate()` - /// - /// Returns whether this rune is an unpaired surrogate. - constexpr bool is_unpaired_surrogate() const { return is_surrogate(value_); } - - /// # `rune::is_low_surrogate()` - /// - /// Returns whether this rune is a "low" unpaired surrogate. - constexpr bool is_low_surrogate() const { - return is_unpaired_surrogate() && value_ >= 0xdc00; - } - - /// # `rune::is_high_surrogate()` - /// - /// Returns whether this rune is an "high" unpaired surrogate. - constexpr bool is_high_surrogate() const { - return is_unpaired_surrogate() && value_ < 0xdc00; - } - - /// # `rune::size()`. - /// - /// Returns the size of this rune in the given encoding. - template - constexpr size_t size() const { - return E::size(*this); - } - - /// Tempoaray hack until BestFmt. - template - friend Os& operator<<(Os& os, rune r) { - char encoded[5] = {}; - best::utf_internal::encode8(encoded, r); - - return os << encoded << "/" << std::hex << r.to_int(); - } - - // best::rune has a niche representation. 
- constexpr rune(niche) : value_(0x11'0000) {} - constexpr bool operator==(niche) const { return value_ == 0x11'0000; } - - private: - constexpr explicit rune(best::in_place_t, uint32_t value) : value_(value) {} - - uint32_t value_; -}; -} // namespace best - -#endif // BEST_STRINGS_RUNE_H_ \ No newline at end of file diff --git a/best/strings/str.h b/best/strings/str.h deleted file mode 100644 index 9ca3890..0000000 --- a/best/strings/str.h +++ /dev/null @@ -1,306 +0,0 @@ -#ifndef BEST_STRINGS_STR_H_ -#define BEST_STRINGS_STR_H_ - -#include -#include -#include -#include - -#include "best/container/span.h" -#include "best/meta/init.h" -#include "best/strings/encoding.h" -#include "best/strings/utf.h" - -//! Unicode strings. -//! -//! best::encoded is a Unicode string, i.e., an encoded sequence of best::runes. -//! It is essentially std::basic_string_view with a nicer API (compare with -//! best::span). -//! -//! best::str, best::str16, and best::str32 are type aliases corresponding to -//! the UTF-8/16/32 specializations of the above. - -namespace best { -template -class encoded; - -/// # `best::str` -/// -/// A reference to UTF-8 text data. -using str = best::encoded; - -/// # `best::str16` -/// -/// A reference to UTF-16 text data. -using str16 = best::encoded; - -/// # `best::str32` -/// -/// A reference to UTF-32 text data. -using str32 = best::encoded; - -/// # `best::encoded` -/// -/// An reference to contiguous textual data. -/// -/// This is a generalized view that allows specifying the encoding of the -/// underlying data. It is similar to `std::basic_string_view`, except it uses -/// a ztd.text-style encoding trait, and provides a generally nicer interface. -/// -/// A `best::encoded` string can be created from a string literal; in this case, -/// it will be validated for being "correctly encoded" wrt to the encoding `E`. -/// It can also be constructed from a pointer, in which case no such check -/// occurs. 
-/// -/// A `best::encoded` may point to invalidly-encoded data. If the encoding is -/// self-synchronizing, the stream of Unicode characters is interpreted as -/// replacing each invalid code unit with a Unicode replacement character -/// (U+FFFD). If the encoding is not self-synchronizing, the stream is -/// interpreted to end at that position, with a replacement character. The -/// `runes()` iterator performs this decoding operation. -template -class encoded final { - public: - /// # `encoded::encoding` - /// - /// The encoding for this string. - using encoding = E; - - /// # `encoded::code` - /// - /// The code unit for this encoding. This is the element type of an encoded - /// stream. - using code = encoding::code; - - /// # `encoded::encoded()` - /// - /// Creates a new, empty string. - constexpr encoded() - requires best::constructible - : encoded(nullptr, 0, encoding{}) {} - - /// # `encoded::encoded(enc)` - /// - /// Creates a new, empty string with the given encoding. - constexpr explicit encoded(encoding enc) - : encoded(nullptr, 0, std::move(enc)) {} - - /// # `encoded::encoded(encoded)` - /// - /// Copyable and movable. - constexpr encoded(const encoded&) = default; - constexpr encoded& operator=(const encoded&) = default; - constexpr encoded(encoded&&) = default; - constexpr encoded& operator=(encoded&&) = default; - - /// # `encoded::encoded("...")` - /// - /// Creates a new string from a string literal. - /// The array must be a constant, and it must contain validly-encoded data. - template - constexpr encoded(const code (&lit)[n]) - requires best::constructible - BEST_ENABLE_IF_CONSTEXPR(lit) - BEST_ENABLE_IF(best::encoder::validate(encoding{}, - best::span(lit, n - 1)), - "string must be validly encoded") - : encoded(lit, n - 1) {} - - /// # `encoded::encoded("...", enc)` - /// - /// Creates a new string from a string literal with an explicit encoding. - /// The array must be a constant, and it must contain validly-encoded data. 
- template - constexpr encoded(const code (&lit)[n], encoding enc) - BEST_ENABLE_IF_CONSTEXPR(lit) BEST_ENABLE_IF_CONSTEXPR(enc) - BEST_ENABLE_IF( - best::encoder::validate(enc, best::span(lit, n - 1)), - "string must be validly encoded") - : encoded(lit, n - 1, std::move(enc)) {} - - /// # `encoded::encoded(ptr)` - /// - /// Creates a new string from a possibly-null, NUL-terminated pointer. - constexpr explicit encoded(const code* data) - requires best::constructible - : encoded(data, encoding()) {} - constexpr encoded(std::nullptr_t) - requires best::constructible - : encoded(nullptr, 0, encoding{}) {} - - /// # `encoded::encoded(ptr, enc)` - /// - /// Creates a new string from a possibly-null, NUL-terminated pointer, - /// with an explicit encoding. - constexpr explicit encoded(const code* data, encoding enc) - : encoded(std::move(enc)) { - if (data != nullptr) { - auto ptr = data; - while (*ptr++) - ; - - *this = encoded(data, ptr - data - 1, std::move(encoding_)); - } - } - constexpr encoded(std::nullptr_t, encoding enc) - : encoded(nullptr, 0, std::move(enc)) {} - - /// # `encoded::encoded(ptr, len)` - /// - /// Creates a new string from the given data and length. - constexpr encoded(const code* data, size_t size) - requires best::constructible - : encoded(data, size, encoding()) {} - - /// # `encoded::encoded(ptr, len)` - /// - /// Creates a new string from the given data and length, with an explicit - /// encoding. - constexpr encoded(const code* data, size_t size, encoding enc) - : span_{data == nullptr ? &empty : data, size}, - encoding_{std::move(enc)} {} - - /// # `encoded::encoded(str)` - /// - /// Creates a new string from some other `best::string_type`. - template - constexpr encoded(const Str& that) - requires(!std::is_array_v && !std::is_pointer_v) - : encoded(std::data(that), std::size(that), best::get_encoding(that)) {} - - /// # `encoded::from()` - /// - /// Creates a new string by parsing it from a span of potentially invalid - /// characters. 
- constexpr best::option from(best::span data, - encoding enc = {}) { - if (!best::encoder::validate(enc, data)) { - return best::none; - } - - return encoded(best::in_place, data, std::move(enc)); - } - - /// # `encoded::size()` - /// - /// Returns the size of the string, in code units. - constexpr size_t size() const { return span_.size(); } - - /// # `encoded::data()` - /// - /// Returns the string's data pointer. - /// This value is never null. - constexpr const code* data() const { return span_.data(); } - - /// # `encoded::get_encoding()` - /// - /// Returns the underlying text encoding. - constexpr const encoding& get_encoding() const { return encoding_; } - - /// # `encoded::is_empty()` - /// - /// Checks whether the string is empty. - constexpr bool is_empty() const { return size() == 0; } - - /// # `encoded[{...}]` - /// - /// Gets the substring at the given index. Crashes on out-of-bounds access. - constexpr encoded operator[](best::bounds::with_location range) const { - return span_[range]; - } - - /// # `encoded::rune_iter`, `encoded::runes()`. - /// - /// An iterator over the runes of a best::encoded. - /// - /// A `best::encoded` may point to invalidly-encoded data. If the encoding is - /// self-synchronizing, the stream of Unicode characters is interpreted as - /// replacing each invalid code unit with a Unicode replacement character - /// (U+FFFD). If the encoding is not self-synchronizing, the stream is - /// interpreted to end at that position, with a replacement character. 
- struct rune_iter; - constexpr rune_iter runes() const { return {*this}; } - - // TODO: BestFmt - template - friend Os& operator<<(Os& os, encoded str) { - if constexpr (best::same) { - return os << std::string_view(str.data(), str.size()); - } - - char u8[utf8::MaxCodesPerRune]; - for (rune r : str.runes()) { - if (auto chars = best::encoder().write_rune(u8, r)) { - os << std::string_view(chars->data(), chars->size()); - continue; - } - } - } - - bool operator==(const encoded&) const = default; - bool operator==(const best::span& span) const { - return span_ == span; - } - template - bool operator==(const code (&lit)[n]) const { - return span_ == best::span(lit, n - 1); - } - - private: - constexpr explicit encoded(best::in_place_t, best::span span, - encoding enc) - : span_(span), encoding_(std::move(enc)) {} - - best::span span_{&empty, 0}; - [[no_unique_address]] encoding encoding_; - - static constexpr code empty{}; -}; - -template -struct encoded::rune_iter final { - public: - constexpr bool operator==(const rune_iter& it) const = default; - constexpr rune operator*() const { return next_.value_or(); } - - constexpr rune_iter& operator++() { - if (str_.is_empty()) { - next_ = best::none; - return *this; - } - - next_ = state_.read_rune(&str_); - if (!next_.has_value()) { - next_ = rune::replacement(); - if (best::self_syncing_encoding) { - str_ = str_[{.start = 1}]; - } else { - str_ = {nullptr, 0}; - } - } - - return *this; - } - - constexpr rune_iter operator++(int) { - auto prev = *this; - ++*this; - return prev; - } - - constexpr rune_iter begin() const { return *this; } - constexpr rune_iter end() const { return rune_iter(state_); } - - private: - friend encoded; - rune_iter(encoded str) : str_(str), state_(str.get_encoding()) { ++*this; } - rune_iter(best::encoder enc) : state_(std::move(enc)) {} - - best::option next_; - best::span str_; - best::encoder state_; -}; - -} // namespace best - -#endif // BEST_STRINGS_STR_H_ \ No newline at end of file 
diff --git a/best/strings/str_test.cc b/best/strings/str_test.cc deleted file mode 100644 index 8c0f6ff..0000000 --- a/best/strings/str_test.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include "best/strings/str.h" - -#include "best/container/vec.h" -#include "best/test/test.h" - -namespace best::str_test { - -best::test Empty = [](auto& t) { - best::str s1; - t.expect_eq(s1, ""); - t.expect_eq(s1, nullptr); - t.expect_eq(s1.size(), 0); - t.expect(s1.is_empty()); - - best::str s2 = ""; - t.expect_eq(s2, ""); - t.expect_eq(s2, nullptr); - t.expect_eq(s2.size(), 0); - t.expect(s2.is_empty()); - - best::str s3 = nullptr; - t.expect_eq(s3, ""); - t.expect_eq(s3, nullptr); - t.expect_eq(s3.size(), 0); - t.expect(s3.is_empty()); - - best::str s4 = best::str(static_cast("")); - t.expect_eq(s4, ""); - t.expect_eq(s4, nullptr); - t.expect_eq(s4.size(), 0); - t.expect(s4.is_empty()); - - best::str s5 = best::str(nullptr); - t.expect_eq(s5, ""); - t.expect_eq(s5, nullptr); - t.expect_eq(s5.size(), 0); - t.expect(s5.is_empty()); -}; - -best::test Size = [](auto& t) { - best::str s = "foo"; - t.expect_eq(s.size(), 3); - t.expect(!s.is_empty()); - - best::str s2 = "foo\0foo"; - t.expect_eq(s2.size(), 7); -}; - -best::test Utf8Decode = [](auto& t) { - best::str test = "solomonπŸ§ΆπŸˆβ€β¬›ι»’ηŒ«"; - best::vec runes; - for (rune r : test.runes()) { - runes.push(r); - } - - t.expect_eq(runes, - best::span{'s', 'o', 'l', 'o', 'm', 'o', 'n', U'🧢', - U'🐈', 0x200d, U'⬛', U'ι»’', U'猫'}); -}; - -best::test Utf16Decode = [](auto& t) { - best::str16 test = u"solomonπŸ§ΆπŸˆβ€β¬›ι»’ηŒ«"; - best::vec runes; - for (rune r : test.runes()) { - runes.push(r); - } - - t.expect_eq(runes, - best::span{'s', 'o', 'l', 'o', 'm', 'o', 'n', U'🧢', - U'🐈', 0x200d, U'⬛', U'ι»’', U'猫'}); -}; -} // namespace best::str_test \ No newline at end of file diff --git a/best/strings/utf.h b/best/strings/utf.h deleted file mode 100644 index 399dc81..0000000 --- a/best/strings/utf.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef 
BEST_STRINGS_UTF_H_ -#define BEST_STRINGS_UTF_H_ - -#include - -#include "best/container/option.h" -#include "best/container/span.h" -#include "best/strings/internal/utf.h" -#include "best/strings/rune.h" - -//! Encodings for the "Unicode Transformation Formats". -//! -//! This header defines the UTF-8, UTF-16, UTF-32, and WTF-8 encodings. - -namespace best { -/// # `best::utf8` -/// -/// A `best::encoding` representing UTF-8. -struct utf8 final { - using code = char; // Not char8_t because the standard messed up. - using state = utf8; - using self_syncing = void; - - static constexpr size_t MaxCodesPerRune = 4; - - static constexpr best::option read_rune(utf8, - best::span& input) { - if (auto result = best::utf_internal::decode8(input)) { - input = input[{.start = result->first}]; - return rune::from_int(result->second); - } - return best::none; - } - - static constexpr bool write_rune(utf8, best::span& output, rune rune) { - if (auto result = best::utf_internal::encode8(output, rune)) { - output = output[{.start = *result}]; - return true; - } - return false; - } - - constexpr bool operator==(const utf8&) const = default; -}; - -/// # `best::wtf8` -/// -/// A best::encoding representing WTF-8 (Wobbly Transformation Format). -/// -/// Its only difference with UTF-8 is that it allows decoded runes to be -/// unpaired surrogates (in the range U+D800 to U+DFFF). -struct wtf8 final { - using code = char; // Not char8_t because the standard messed up. 
- using state = utf8; - using self_syncing = void; - - static constexpr size_t MaxCodesPerRune = 4; - - static constexpr best::option read_rune(wtf8, - best::span& input) { - if (auto result = best::utf_internal::decode8(input)) { - input = input[{.start = result->first}]; - return rune::from_int_allow_surrogates(result->second); - } - return best::none; - } - - static constexpr bool write_rune(wtf8, best::span& output, rune rune) { - return utf8::write_rune({}, output, rune); - } - - constexpr bool operator==(const wtf8&) const = default; -}; - -/// # `best::utf16` -/// -/// A best::encoding representing UTF-16. -struct utf16 final { - using code = char16_t; - using state = utf16; - using self_syncing = void; - - static constexpr size_t MaxCodesPerRune = 2; - - static constexpr best::option read_rune( - utf16, best::span& input) { - if (auto result = best::utf_internal::decode16(input)) { - input = input[{.start = result->first}]; - return rune::from_int(result->second); - } - return best::none; - } - - static constexpr bool write_rune(utf16, best::span& output, - rune rune) { - if (auto result = best::utf_internal::encode16(output, rune)) { - output = output[{.start = *result}]; - return true; - } - return false; - } - - bool operator==(const utf16&) const = default; -}; - -/// # `best::utf32` -/// -/// A best::encoding representing UTF-32. 
-struct utf32 final { - using code = char32_t; - using state = utf32; - using self_syncing = void; - - static constexpr size_t MaxCodesPerRune = 1; - - static constexpr best::option read_rune( - utf32, best::span& input) { - if (auto next = input.take_first(1)) { - return rune::from_int((*next)[0]); - } - return best::none; - } - - static constexpr bool write_rune(utf32, best::span& output, - rune rune) { - if (auto next = output.take_first(1)) { - (*next)[0] = rune; - return true; - } - return false; - } - - constexpr bool operator==(const utf32&) const = default; -}; - -constexpr const utf8& BestEncoding(auto, const std::string&) { - return best::val::value; -} -constexpr const utf8& BestEncoding(auto, const std::string_view&) { - return best::val::value; -} -} // namespace best - -#endif // BEST_STRINGS_UTF_H_ \ No newline at end of file diff --git a/best/test/BUILD b/best/test/BUILD index da97172..7701140 100644 --- a/best/test/BUILD +++ b/best/test/BUILD @@ -7,7 +7,7 @@ cc_library( testonly = True, deps = [ ":test", - "//best/strings:str", + "//best/text:str", "//best/log:location", ], ) @@ -19,7 +19,7 @@ cc_library( testonly = True, deps = [ "//best/container:vec", - "//best/strings:str", + "//best/text:str", "//best/log:location", ], ) diff --git a/best/test/fodder.cc b/best/test/fodder.cc index bfac5dd..c4ca07b 100644 --- a/best/test/fodder.cc +++ b/best/test/fodder.cc @@ -58,16 +58,17 @@ LeakTest::~LeakTest() { destroyed_.erase(token); if (diff > 0) { - t_->fail(std::format("unexpected extra {} free(s) of #{}", diff, token)); + t_->fail(*best::str::from( + std::format("unexpected extra {} free(s) of #{}", diff, token))); } else if (diff < 0) { - t_->fail( - std::format("unexpected missing {} free(s) of #{}", -diff, token)); + t_->fail(*best::str::from( + std::format("unexpected missing {} free(s) of #{}", -diff, token))); } } for (auto [token, destroyed] : destroyed_) { - t_->fail(std::format("unexpected {} free(s) of uncreated #{}", destroyed, - token)); 
+ t_->fail(*best::str::from(std::format( + "unexpected {} free(s) of uncreated #{}", destroyed, token))); } } } // namespace best_fodder \ No newline at end of file diff --git a/best/test/test.cc b/best/test/test.cc index e6d7d2d..40bff20 100644 --- a/best/test/test.cc +++ b/best/test/test.cc @@ -20,7 +20,7 @@ best::str symbol_name(const void* ptr, best::location loc) { << "; you may need to pass -rdynamic as part of your link options\n"; std::exit(128); } - return best::str(di.dli_sname); + return *best::str::from_nul(di.dli_sname); } else { std::cerr << "could not parse symbol name for test at " << loc << "; it might not be a global variable?\n"; diff --git a/best/test/test.h b/best/test/test.h index 0054666..1855452 100644 --- a/best/test/test.h +++ b/best/test/test.h @@ -8,7 +8,7 @@ #include "best/log/location.h" #include "best/meta/ops.h" -#include "best/strings/str.h" +#include "best/text/str.h" //! The best unit testing library. diff --git a/best/strings/BUILD b/best/text/BUILD similarity index 72% rename from best/strings/BUILD rename to best/text/BUILD index 12173da..42782b8 100644 --- a/best/strings/BUILD +++ b/best/text/BUILD @@ -1,37 +1,44 @@ package(default_visibility = ["//visibility:public"]) -cc_library( - name = "encoding", - hdrs = ["encoding.h"], - deps = [ - "//best/container:option", - "//best/container:span", - ] -) - cc_library( name = "rune", hdrs = ["rune.h"], deps = [ - ":encoding", ":utf_internal", "//best/log/internal:crash", "//best/container:span", ] ) +cc_test( + name = "rune_test", + srcs = ["rune_test.cc"], + linkopts = ["-rdynamic"], + deps = [ + ":rune", + "//best/test", + ] +) + cc_library( name = "utf", - hdrs = [ - "utf.h", - ], + hdrs = ["utf.h"], deps = [ - ":encoding", ":rune", ":utf_internal", ] ) +cc_test( + name = "utf_test", + srcs = ["utf_test.cc"], + linkopts = ["-rdynamic"], + deps = [ + ":utf", + "//best/test", + ] +) + cc_library( name = "str", hdrs = ["str.h"], diff --git a/best/strings/internal/utf.h 
b/best/text/internal/utf.h similarity index 57% rename from best/strings/internal/utf.h rename to best/text/internal/utf.h index cc36cd3..b413487 100644 --- a/best/strings/internal/utf.h +++ b/best/text/internal/utf.h @@ -1,19 +1,32 @@ -#ifndef BEST_STRINGS_INTERNAL_UTF_H_ -#define BEST_STRINGS_INTERNAL_UTF_H_ +#ifndef BEST_TEXT_INTERNAL_UTF_H_ +#define BEST_TEXT_INTERNAL_UTF_H_ #include #include #include "best/container/option.h" #include "best/container/span.h" +#include "best/math/bit.h" //! Low-level UTF encode/decode routines. namespace best::utf_internal { +constexpr size_t size8(uint32_t rune) { + if (rune < 0x80) { + return 1; + } else if (rune < 0x800) { + return 2; + } else if (rune < 0x10000) { + return 3; + } else { + return 4; + } +} + constexpr best::option> decode8( best::span input) { - auto first = input.at(0); - if (first.is_empty()) return best::none; + auto first = input.first(); + if (!first) return best::none; size_t bytes = 0; uint32_t value = static_cast(*first); @@ -38,7 +51,7 @@ constexpr best::option> decode8( } auto rest = input.at({.start = 1, .count = bytes - 1}); - if (!rest.has_value()) return best::none; + if (!rest) return best::none; for (uint8_t c : *rest) { if (std::countl_one(c) != 1) return best::none; @@ -47,43 +60,48 @@ constexpr best::option> decode8( value |= c & 0b00'111111; } + // Reject oversized encodings. 
+ if (bytes != size8(value)) return best::none; + return {{bytes, value}}; } -constexpr best::option encode8(best::span output, uint32_t rune) { - size_t bytes = 0; - if (rune < 0x80) { - bytes = 1; - } else if (rune < 0x800) { - bytes = 2; - } else if (rune < 0x10000) { - bytes = 3; - } else { - bytes = 4; +constexpr best::option> undecode8( + best::span input) { + size_t len = 0; + for (; len < 4; ++len) { + auto next = input.at(input.size() - len - 1); + if (!next) return best::none; + + if (best::leading_ones(*next) != 1) { + break; + } } + if (len == 4) return best::none; + + auto result = decode8(input[{.start = input.size() - len - 1}]); + if (!result || result->first != len) return best::none; + return result; +} + +constexpr best::option encode8(best::span output, uint32_t rune) { + size_t bytes = size8(rune); if (output.size() < bytes) return false; for (size_t i = bytes; i > 1; --i) { - output[i - 1] = (rune & 0xb111111) | 0b10'000000; + output[i - 1] = (rune & 0b0011'1111) | 0b1000'0000; rune >>= 6; } - switch (bytes) { - case 1: - output[0] = rune & 0b0'1111111; - break; - case 2: - output[0] = (rune & 0b000'11111) | 0b110'00000; - break; - case 3: - output[0] = (rune & 0b0000'1111) | 0b1110'0000; - break; - case 4: - output[0] = (rune & 0b00000'111) | 0b11110'000; - break; - } + constexpr std::array, 4> Masks = {{ + {0b0111'1111, 0b0000'0000}, + {0b0001'1111, 0b1100'0000}, + {0b0000'1111, 0b1110'0000}, + {0b0000'0111, 0b1111'0000}, + }}; + output[0] = (rune & Masks[bytes - 1][0]) | Masks[bytes - 1][1]; return bytes; } @@ -97,8 +115,8 @@ inline constexpr uint32_t Max = 0xe000; constexpr best::option> decode16( best::span input) { - auto hi = input.at(0); - if (hi.is_empty()) return best::none; + auto hi = input.first(); + if (!hi) return best::none; if (hi < High || hi >= Max) { return {{1, *hi}}; @@ -113,6 +131,20 @@ constexpr best::option> decode16( return {{2, value + 0x10000}}; } +constexpr best::option> undecode16( + best::span input) { + auto hi 
= input.first(); + if (!hi) return best::none; + + auto is_surrogate = *hi >= High && *hi < Max; + auto len = is_surrogate ? 2 : 1; + if (input.size() < len) return best::none; + + auto result = decode16(input[{.start = input.size() - len}]); + if (!result || result->first != len) return best::none; + return result; +} + constexpr best::option encode16(best::span output, uint32_t rune) { if (rune < 0x10000 && output.size() >= 1) { @@ -129,4 +161,4 @@ constexpr best::option encode16(best::span output, } } // namespace best::utf_internal -#endif // BEST_STRINGS_INTERNAL_UTF_H_ \ No newline at end of file +#endif // BEST_TEXT_INTERNAL_UTF_H_ \ No newline at end of file diff --git a/best/text/rune.h b/best/text/rune.h new file mode 100644 index 0000000..6a4e615 --- /dev/null +++ b/best/text/rune.h @@ -0,0 +1,621 @@ +#ifndef BEST_TEXT_RUNE_H_ +#define BEST_TEXT_RUNE_H_ + +#include +#include +#include + +#include "best/base/hint.h" +#include "best/container/option.h" +#include "best/container/span.h" +#include "best/log/internal/crash.h" +#include "best/text/internal/utf.h" + +//! Unicode characters and encodings. +//! +//! best::rune is a Unicode character type, specifically, a Unicode Scalar +//! Value[1]. It is the entry-point to best's Unicode library. +//! +//! [1]: https://www.unicode.org/glossary/#unicode_scalar_value + +namespace best { +/// # `best::encoding_about` +/// +/// Details about an encoding. Every encoding must provide a `constexpr` member +/// named `About` of this type. +/// +/// In the future, this requirement may be relaxed for an encoding to provide +/// a dynamic value for this type. +struct encoding_about final { + /// The maximum number of codes `write_rune()` can write. Must be positive. + size_t max_codes_per_rune = 0; + + /// Whether this encoding is self-synchronizing. + /// + /// A self-synchronizing encoding is one where attempting to decode a rune + /// using a suffix of an encoded rune is detectable as an error without + /// context. 
+ /// + /// For example, x86 machine code is not self-synchronizing, because jumping + /// into the middle of an instruction may decode a different, valid + /// instruction. UTF-8, UTF-16, and UTF-32 are self-synchronizing. + /// + /// We assume that self-synchronizing encodings are stateless. It is possible + /// to construct a non-synchronizing, stateful encoding, but those don't + /// really occur in practice because being self-synchronizing is an extremely + /// strong property. + /// + /// Many string algorithms are only available for self-synchronizing + /// encodings. https://en.wikipedia.org/wiki/Self-synchronizing_code + bool is_self_syncing = false; + + /// Whether encoded runes are lexicographic. + /// + /// An encoding has the lexicographic property if, given two sequences of + /// runes `r1` and `r2`, and their corresponding encoded code sequences + /// `c1` and `c2`, then `r1 <=> r2 == c1 <=> c2`, as `best::span`s. + /// + /// UTF-8 and UTF-32 have this property. UTF-16 does not, because runes + /// greater than U+FFFF are encoded with a pair of surrogates, both of which + /// start with hex digit `0xd`; this means that `U+FFFF > U+10000` when + /// encoded as UTF-16. + bool is_lexicographic = false; +}; + +/// # `best::encoding` +/// +/// A text encoding type. This type usually won't be used on its own; instead, +/// helpers from `best::rune` should be used instead. +/// +/// A text encoding is any type that fulfills a contract in the spirit of the +/// "Lucky 7" encoding API from ztd.text. +/// +/// +/// To be supported by `best`, an encoding must be: +/// +/// * Stateless. Decoding any one rune may not depend on what runes came +/// before it. +/// * Reversible. At any position within a stream, assuming it is a rune +/// boundary, it is possible to decode a unique rune in reverse order, and +/// reverse decoding agrees with forward decoding. +/// * Injective. Every rune is encoded as exactly one sequence of one or more +/// code units. 
+template > +concept encoding = + best::copyable && best::equatable && requires(const E& e) { + /// Required type aliases. + typename E::code; + + /// Required constants. + { E::About } -> std::convertible_to; + + /// It must provide the following operations. `best::rune` provides + /// wrappers for them, which specifies what each of these functions must + /// do. + requires requires(size_t idx, rune r, + best::span& input, + best::span& output) { + { e.is_boundary(input, idx) } -> std::convertible_to; + { e.encode(&output, r) } -> std::convertible_to; + { e.decode(&input) } -> std::convertible_to>; + { e.undecode(&input) } -> std::convertible_to>; + }; + }; + +/// # `best::code` +/// +/// The code unit type of a particular encoding. +template +using code = E::code; + +/// # `best::string_type` +/// +/// A string type: a contiguous range that defines the `BestEncoding()` FTADLE +/// and whose data pointer matches that encoding. +template +concept string_type = best::contiguous && requires(const T& value) { + { BestEncoding(best::types, value) } -> best::encoding; + { + std::data(value) + } -> best::same, value))>::code*>; +}; + +/// # `best::encoding_of()` +/// +/// Extracts the encoding out of a string type. +constexpr const auto& encoding_of(const string_type auto& string) { + return BestEncoding(best::types, string); +} + +/// # `best::same_encoding()` +/// +/// Returns whether two string values have the same encoding. This verifies that +/// their encodings compare as equal. +constexpr bool same_encoding(const string_type auto& lhs, + const string_type auto& rhs) { + using E1 = std::remove_cvref_t; + using E2 = std::remove_cvref_t; + + if constexpr (best::equatable) { + return best::encoding_of(lhs) == best::encoding_of(rhs); + } + + return false; +} + +/// # `best::rune` +/// +/// A Unicode scalar value, called a "rune" in the p9 tradition. +/// +/// this rune corresponds to a valid Unicode scalar value, which may +/// potentially be an unpaired surrogate. 
This is to allow encodings that allow +/// unpaired surrogates, such as WTF-8, to produce best::runes. +class rune final { + private: + static constexpr bool is_unicode(uint32_t value) { return value < 0x11'0000; } + static constexpr bool is_surrogate(uint32_t value) { + return value >= 0xd800 && value < 0xe000; + } + + public: + /// # `rune::replacement()` + /// + /// Returns the Unicode replacement character. + static constexpr rune replacement() { return 0xfffd; } + + /// # `rune::rune()` + /// + /// Creates a new rune corresponding to NUL. + constexpr rune() = default; + + /// # `rune::rune(rune)` + /// + /// Trivially copyable. + constexpr rune(const rune&) = default; + constexpr rune& operator=(const rune&) = default; + constexpr rune(rune&&) = default; + constexpr rune& operator=(rune&&) = default; + + /// # `rune::rune(int)` + /// + /// Creates a new rune from an integer. + /// + /// The integer must be a constant, and it must be a valid Unicode scalar + /// value, and *not* an unpaired surrogate. + constexpr rune(uint32_t value) BEST_ENABLE_IF_CONSTEXPR(value) + BEST_ENABLE_IF(is_unicode(value) && !is_surrogate(value), + "rune value not within the valid Unicode range") + : value_(value) {} + + /// # `rune::from_int()` + /// + /// Parses a rune from an integer. + /// Returns `best::none` if this integer is not in the Unicode scalar value + /// range. + constexpr static best::option from_int(uint32_t); + constexpr static best::option from_int(int32_t); + + /// # `rune::from_int_allow_surrogates()` + /// + /// Like `rune::from_int()`, but allows unpaired surrogates. + constexpr static best::option from_int_allow_surrogates(uint32_t); + constexpr static best::option from_int_allow_surrogates(int32_t); + + /// # `rune::to_int()` + /// + /// Converts this rune into the underlying 32-bit integer. 
+ constexpr uint32_t to_int() const { return value_; } + constexpr operator uint32_t() const { return value_; } + + /// # `rune::validate()` + /// + /// Validates whether a span of code units is correctly encoded per `E`. + template + constexpr static bool validate(best::span> input, + const E& enc = {}) { + while (!input.is_empty()) { + if (!decode(&input, enc)) return false; + } + return true; + } + + /// # `rune::size()` + /// + /// Returns the number of code units needed to encode this rune. Returns + /// `best::none` if this rune is not encodable with `E`. + template + constexpr best::option size(const E& = {}) const; + + /// # `rune::is_boundary()` + /// + /// Returns whether the code unit boundary given by `idx` is also a rune + /// boundary. + template + constexpr static bool is_boundary(best::span> input, size_t idx, + const E& enc = {}) { + return enc.is_boundary(input, idx); + } + + /// # `rune::encode()` + /// + /// Performs a single indivisible encoding operation. + /// + /// Returns the part of `output` written to. If `output` is passed by pointer + /// rather than by value, it is automatically advanced. + /// + /// Returns `best::none` on failure; in this case, `output` is not advanced. + template + constexpr best::option>> encode( + best::span>* output, const E& = {}) const; + template + constexpr best::option>> encode(best::span> output, + const E& enc = {}) const { + return encode(&output, enc); + } + + /// # `rune::decode()` + /// + /// Performs a single indivisible decoding operation. + /// + /// Returns the decoded rune. If `input` is passed by pointer rather than by + /// value, it is automatically advanced. + /// + /// Returns `best::none` on failure; in this case, `input` is not advanced. 
+ template + constexpr static best::option decode(best::span>* input, + const E& enc = {}); + template + constexpr static best::option decode(best::span> input, + const E& enc = {}) { + return decode(&input, enc); + } + + /// # `rune::undecode()` + /// + /// Performs a single indivisible decoding operation, in reverse. + /// + /// Returns the decoded rune. If `input` is passed by pointer rather than by + /// value, it is automatically advanced. + /// + /// Returns `best::none` on failure; in this case, `input` is not advanced. + template + constexpr static best::option undecode(best::span>* input, + const E& enc = {}); + template + constexpr static best::option undecode(best::span> input, + const E& enc = {}) { + return undecode(&input, enc); + } + + /// # `rune::iter` + /// + /// An iterator over some encoded span that yields runes. The span need not + /// be well-encoded: if encoding errors are encountered, then either: + /// + /// 1. If the encoding is synchronizing, yields one `rune::replacement()` for + /// each bad code unit. + /// + /// 2. If the encoding is not synchronizing, yields one `rune::replacement()` + /// and halts further iteration. + template + struct iter; + + template + iter(S, const E&) -> iter; + template + iter(const S& s) -> iter>; + + /// # `rune::from_digit()` + /// + /// Returns the appropriate character to represent `num` in the given `radix` + /// (i.e., base). Crashes if `radix > 36`. + constexpr static best::option from_digit(uint32_t num, + uint32_t radix = 10); + + /// # `rune::is_digit()` + /// + /// Returns this is a "digit", i.e., a value matching `[0-9a-zA-Z]` and + /// is within the given `radix`. For example, if `radix` is 10, this checks + /// for whether this character matches `[0-9]`. Crashes if `radix > 36`. 
+ constexpr bool is_digit(uint32_t radix = 10) const { + return to_digit(radix).has_value(); + } + + /// # `rune::to_digit()` + /// + /// Returns the value of this character when interpreted as a digit in the + /// given `radix`. + constexpr best::option to_digit(uint32_t radix = 10) const; + + /// # `rune::is_unpaired_surrogate()` + /// + /// Returns whether this rune is an unpaired surrogate. + constexpr bool is_unpaired_surrogate() const { return in(0xd800, 0xdfff); } + + /// # `rune::is_low_surrogate()` + /// + /// Returns whether this rune is a "low" unpaired surrogate. + constexpr bool is_low_surrogate() const { return in(0xdc00, 0xdfff); } + + /// # `rune::is_high_surrogate()` + /// + /// Returns whether this rune is an "high" unpaired surrogate. + constexpr bool is_high_surrogate() const { return in(0xd800, 0xdbff); } + + /// # `rune::is_ascii()` + /// + /// Returns whether this rune is in the ASCII range (up to U+007F) + constexpr bool is_ascii() const { return in(0x0000, 0x007f); } + + /// # `rune::is_ascii_alpha()` + /// + /// Returns whether this rune is an ASCII letter. + constexpr bool is_ascii_alpha() const { + return is_ascii_lower() || is_ascii_upper(); + } + + /// # `rune::is_ascii_alnum()` + /// + /// Returns whether this rune is an ASCII letter. or digit + constexpr bool is_ascii_alnum() const { + return is_ascii_alpha() || is_ascii_digit(); + } + + /// # `rune::is_ascii_control()` + /// + /// Returns whether this rune is an ASCII control character. This includes + /// most whitespace, except for ' ' (U+0020). + constexpr bool is_ascii_control() const { + return in(0x0000, 0x001f) || value_ == 0x007f; + } + + /// # `rune::is_ascii_digit()` + /// + /// Returns whether this rune is an ASCII digit. + constexpr bool is_ascii_digit() const { return in('0', '9'); } + + /// # `rune::is_ascii_hex()` + /// + /// Returns whether this rune is an ASCII hexadecimal digit. 
+ constexpr bool is_ascii_hex() const { return is_digit(16); } + + /// # `rune::is_ascii_lower()` + /// + /// Returns whether this rune is an ASCII lowercase letter. + constexpr bool is_ascii_lower() const { return in('a', 'z'); } + + /// # `rune::to_ascii_lower()` + /// + /// Converts this rune to its ASCII lowercase counterpart, if it is ASCII + /// uppercase. + constexpr rune to_ascii_lower() const { + if (!is_ascii_upper()) return *this; + return rune(in_place, value_ - 'A' + 'a'); + } + + /// # `rune::is_ascii_upper()` + /// + /// Returns whether this rune is an ASCII uppercase letter. + constexpr bool is_ascii_upper() const { return in('A', 'Z'); } + + /// # `rune::to_ascii_upper()` + /// + /// Converts this rune to its ASCII uppercase counterpart, if it is ASCII + /// lowercase. + constexpr rune to_ascii_upper() const { + if (!is_ascii_lower()) return *this; + return rune(in_place, value_ - 'a' + 'A'); + } + + /// # `rune::is_ascii_punct()` + /// + /// Returns whether this rune is an ASCII punctuation character. + constexpr bool is_ascii_punct() const { + return in('!', '/') || in(':', '@') || in('[', '`') || in('{', '~'); + } + + /// # `rune::is_ascii_space()` + /// + /// Returns whether this rune is an ASCII whitespace character. + constexpr bool is_ascii_space() const; + + /// Tempoaray hack until BestFmt. + template + friend Os& operator<<(Os& os, rune r) { + char encoded[5] = {}; + best::utf_internal::encode8(encoded, r); + + return os << encoded << "/" << std::hex << r.to_int(); + } + + // best::rune has a niche representation. 
+ constexpr rune(niche) : value_(-1) {} + constexpr bool operator==(niche) const { return value_ == -1; } + + private: + BEST_INLINE_ALWAYS constexpr bool in(uint32_t a, uint32_t b) const { + return value_ >= a && value_ <= b; + } + + static constexpr char Alphabet[] = "0123456789abcdefghijklmnopqrstuvwxyz"; + constexpr explicit rune(best::in_place_t, uint32_t value) : value_(value) {} + + uint32_t value_; +}; + +/// See `rune::iter` above. +template +struct rune::iter final { + public: + /// # `iter::sentinel` + /// + /// The end-of-iteration sentinel. + struct sentinel {}; + + /// # `iter::iter()` + /// + /// Constructs a new iterator over the given span of code units. + constexpr explicit iter(best::span> codes, const E& enc) + : codes_(codes), enc_(std::addressof(enc)) {} + + /// # `iter::iter(str)` + /// + /// Constructs a new iterator over a string type. + constexpr explicit iter(const best::string_type auto& str) + : iter(str, best::encoding_of(str)) {} + template + constexpr explicit iter(const auto (&str)[n]) + : iter(best::span(str, n - 1), best::encoding_of(str)) {} + + /// # `iter::next()` + /// + /// Advances the iterator and returns the next value. + constexpr best::option next() { + ++*this; + return next_; + } + + /// # `iter::rest()` + /// + /// Returns the part of the code unit span still left to advance. 
+ constexpr best::span> rest() const { return codes_; } + + constexpr iter begin() const { return *this; } + constexpr sentinel end() const { return {}; } + + constexpr bool operator==(sentinel) const { + return next_.is_empty() && codes_.is_empty(); + }; + constexpr rune operator*() { + if (!next_) ++*this; + return next_.value_or(); + } + + constexpr iter& operator++() { + if (codes_.is_empty()) { + next_ = best::none; + return *this; + } + + next_ = decode(&codes_, *enc_); + if (next_) return *this; + + next_ = rune::replacement(); + if (E::About.is_self_syncing) { + codes_ = codes_[{.start = 1}]; + } else { + codes_ = best::span>(); + } + + return *this; + } + + constexpr iter operator++(int) { + auto prev = *this; + ++*this; + return prev; + } + + private: + best::option next_; + best::span> codes_; + const E* enc_; +}; + +/// --- IMPLEMENTATION DETAILS BELOW --- +constexpr best::option rune::from_int(uint32_t value) { + if (!is_unicode(value) || is_surrogate(value)) return best::none; + return rune(best::in_place, value); +} +constexpr best::option rune::from_int(int32_t value) { + return from_int(static_cast(value)); +} +constexpr best::option rune::from_int_allow_surrogates(uint32_t value) { + if (!is_unicode(value)) return best::none; + return rune(best::in_place, value); +} +constexpr best::option rune::from_int_allow_surrogates(int32_t value) { + return from_int_allow_surrogates(static_cast(value)); +} + +template +constexpr best::option rune::size(const E& enc) const { + code codes[E::About.max_codes_per_rune]; + return encode(codes, enc).map([](auto sp) { return sp.size(); }); +} + +template +constexpr best::option>> rune::encode( + best::span>* output, const E& enc) const { + auto out0 = *output; + if (enc.encode(output, *this)) { + size_t written = out0.size() - output->size(); + return out0[{.count = written}]; + } + + *output = out0; + return best::none; +} + +template +constexpr best::option rune::decode(best::span>* input, + const E& enc) { + 
auto in0 = *input; + if (auto encoded = enc.decode(input)) return encoded; + + *input = in0; + return best::none; +} +template +constexpr best::option rune::undecode(best::span>* input, + const E& enc) { + auto in0 = *input; + if (auto encoded = enc.undecode(input)) return encoded; + + *input = in0; + return best::none; +} + +constexpr best::option rune::from_digit(uint32_t num, uint32_t radix) { + if (radix > 36) { + crash_internal::crash("from_digit() radix too large: %u > 36", radix); + } + if (num >= radix) return best::none; + + return rune{in_place, best::to_unsigned(Alphabet[num])}; +} + +// Maps an ASCII digit or letter (case-insensitive) to its numeric value; +// yields best::none for any other rune, or when that value is not below +// `radix`. Crashes if `radix > 36`. +constexpr best::option rune::to_digit(uint32_t radix) const { + if (radix > 36) { + crash_internal::crash("to_digit() radix too large: %u > 36", radix); + } + uint32_t value; + if (is_ascii_digit()) { + value = value_ - '0'; + } else if (is_ascii_alpha()) { + value = to_ascii_lower().value_ - 'a' + 10; + } else { + return best::none; + } + + if (value >= radix) return best::none; + return value; +} + +constexpr bool rune::is_ascii_space() const { + switch (value_) { + case ' ': + case '\t': + case '\n': + case '\f': + case '\r': + return true; + default: + return false; + } +} + +} // namespace best + +#endif // BEST_TEXT_RUNE_H_ \ No newline at end of file diff --git a/best/text/rune_test.cc b/best/text/rune_test.cc new file mode 100644 index 0000000..79ab35e --- /dev/null +++ b/best/text/rune_test.cc @@ -0,0 +1,40 @@ + +#include "best/text/rune.h" + +#include "best/test/test.h" + +namespace best::utf_test { +best::test FromInt = [](auto& t) { + t.expect_eq(best::rune::from_int(0), 0); + t.expect_eq(best::rune::from_int('a'), 'a'); + t.expect_eq(best::rune::from_int('\x7f'), '\x7f'); + t.expect_eq(best::rune::from_int(u'Β΅'), u'Β΅'); + t.expect_eq(best::rune::from_int(u'猫'), u'猫'); + t.expect_eq(best::rune::from_int(U'🧢'), U'🧢'); + t.expect_eq(best::rune::from_int(0x10ffff), 0x10ffff); + + t.expect_eq(best::rune::from_int(0xd800), best::none); +
t.expect_eq(best::rune::from_int(0xdbff), best::none); + t.expect_eq(best::rune::from_int(0xdc00), best::none); + t.expect_eq(best::rune::from_int(0xdfff), best::none); + t.expect_eq(best::rune::from_int(0x110000), best::none); + t.expect_eq(best::rune::from_int(-1), best::none); +}; + +best::test FromIntAllowSurrogates = [](auto& t) { + t.expect_eq(best::rune::from_int_allow_surrogates(0), 0); + t.expect_eq(best::rune::from_int_allow_surrogates('a'), 'a'); + t.expect_eq(best::rune::from_int_allow_surrogates('\x7f'), '\x7f'); + t.expect_eq(best::rune::from_int_allow_surrogates(u'Β΅'), u'Β΅'); + t.expect_eq(best::rune::from_int_allow_surrogates(u'猫'), u'猫'); + t.expect_eq(best::rune::from_int_allow_surrogates(U'🧢'), U'🧢'); + t.expect_eq(best::rune::from_int_allow_surrogates(0x10ffff), 0x10ffff); + + t.expect_eq(best::rune::from_int_allow_surrogates(0xd800), 0xd800); + t.expect_eq(best::rune::from_int_allow_surrogates(0xdbff), 0xdbff); + t.expect_eq(best::rune::from_int_allow_surrogates(0xdc00), 0xdc00); + t.expect_eq(best::rune::from_int_allow_surrogates(0xdfff), 0xdfff); + t.expect_eq(best::rune::from_int_allow_surrogates(0x110000), best::none); + t.expect_eq(best::rune::from_int_allow_surrogates(-1), best::none); +}; +} // namespace best::utf_test \ No newline at end of file diff --git a/best/text/str.h b/best/text/str.h new file mode 100644 index 0000000..2d02365 --- /dev/null +++ b/best/text/str.h @@ -0,0 +1,548 @@ +#ifndef BEST_TEXT_STR_H_ +#define BEST_TEXT_STR_H_ + +#include +#include +#include +#include +#include +#include + +#include "best/container/span.h" +#include "best/memory/bytes.h" +#include "best/meta/ops.h" +#include "best/text/rune.h" +#include "best/text/utf.h" + +//! Unicode strings. +//! +//! best::text is a Unicode string, i.e., an text sequence of best::runes. +//! It is essentially std::basic_string_view with a nicer API (compare with +//! best::span). +//! +//! best::str, best::str16, and best::str32 are type aliases corresponding to +//! 
the UTF-8/16/32 specializations of the above. + +namespace best { +template +class text; + +/// # `best::str` +/// +/// A reference to UTF-8 text data. +using str = best::text; + +/// # `best::str16` +/// +/// A reference to UTF-16 text data. +using str16 = best::text; + +/// # `best::str32` +/// +/// A reference to UTF-32 text data. +using str32 = best::text; + +/// # `BEST_IS_VALID_LITERAL()` +/// +/// A function requirement that verifies that `literal_` is a valid string +/// literal for `enc_`. +/// +/// This is intended to be placed after any `requires` clauses. +#define BEST_IS_VALID_LITERAL(literal_, enc_) \ + BEST_ENABLE_IF_CONSTEXPR(literal_) \ + BEST_ENABLE_IF( \ + rune::validate(best::span(literal_, std::size(literal_) - 1), enc_), \ + "string must be validly text") + +/// # `best::text` +/// +/// An reference to contiguous textual data. +/// +/// This is a generalized view that allows specifying the encoding of the +/// underlying data. It is similar to `std::basic_string_view`, except it uses +/// a ztd.text-style encoding trait, and provides a generally nicer interface. +/// +/// A `best::text` string can be created from a string literal; in this case, +/// it will be validated for being "correctly text" wrt to the encoding `E`. +/// It can also be constructed from a pointer, in which case no such check +/// occurs. +/// +/// A `best::text` may not point to invalidly-text data. Constructors from +/// unauthenticated strings must go through factories that return +/// `best::optional`. +template +class text final { + public: + /// # `text::encoding` + /// + /// The encoding for this string. + using encoding = E; + + /// # `text::code` + /// + /// The code unit for this encoding. This is the element type of an text + /// stream. + using code = encoding::code; + + /// # `text::About` + /// + /// Metadata about this strings's encoding. 
+ static constexpr best::encoding_about About = E::About; + + /// # `text::text()` + /// + /// Creates a new, empty string with the given encoding. + constexpr explicit text(encoding enc = {}) + : text(in_place, best::span(&empty, 0), std::move(enc)) {} + + /// # `text::text(text)` + /// + /// Copyable and movable. + constexpr text(const text&) = default; + constexpr text& operator=(const text&) = default; + constexpr text(text&&) = default; + constexpr text& operator=(text&&) = default; + + /// # `text::text("...")` + /// + /// Creates a new string from a string literal with an optional encoding. + /// The array must be a constant, and it must contain validly-encoded data; + /// the trailing element of the array (the NUL terminator) is not included. + template + constexpr text(const code (&lit)[n], encoding enc) + BEST_IS_VALID_LITERAL(lit, enc) + : text(in_place, span(lit, n - 1), std::move(enc)) {} + template + constexpr text(const code (&lit)[n]) BEST_IS_VALID_LITERAL(lit, encoding{}) + : text(in_place, span(lit, n - 1), encoding{}) {} + + /// # `text::text(unsafe)` + /// + /// Creates a new string from a span of code units. + /// + /// The data is not validated here: `data` is stored as-is. + /// NOTE(review): presumably the caller must guarantee that `data` is validly + /// encoded -- confirm against the `unsafe` tag's contract. + template + constexpr explicit text(unsafe, best::span data, + encoding enc = {}) + : text(in_place, data, std::move(enc)) {} + + /// # `text::from()` + /// + /// Creates a new string by parsing it from a span of potentially invalid + /// characters. + constexpr static best::option from(best::span data, + encoding enc = {}); + constexpr static best::option from(const string_type auto& that) { + return from(span(std::data(that), std::size(that)), + best::encoding_of(that)); + } + + /// # `text::from_nul()` + /// + /// Creates a new string by parsing it from a NUL-terminated string. It must + /// end in `code{0}`. If `data == nullptr`, returns an empty string.
+ constexpr static best::option from_nul(const code* data, + encoding enc = {}) { + return from(best::span::from_nul(data), std::move(enc)); + } + + /// # `text::size()` + /// + /// Returns the size of the string, in code units. + constexpr size_t size() const { return span_.size(); } + + /// # `text::data()` + /// + /// Returns the string's data pointer. + /// This value is never null. + constexpr const code* data() const { return span_.data(); } + + /// # `text::enc()` + /// + /// Returns the underlying text encoding. + constexpr const encoding& enc() const { return enc_; } + + /// # `text::is_empty()` + /// + /// Checks whether the string is empty. + constexpr bool is_empty() const { return size() == 0; } + + /// # `text::as_codes()` + /// + /// Returns the span of code units that backs this string. + constexpr best::span as_codes() const { return span_; } + + /// # `text::is_rune_boundary()` + /// + /// Returns whether `idx` is a rune boundary. Returns `false` + /// for out-of-bounds indices. + /// + /// For stateless encodings, this is an O(1) check. For non-synchronizing + /// encodings, it is O(n). + constexpr bool is_rune_boundary(size_t idx) const; + + /// # `text[{...}]` + /// + /// Gets the substring in the given range. Crashes on out-of-bounds access + /// or, if this encoding is stateless, if `range` slices through a non-rune + /// boundary. + /// + /// Beware: this check is O(n) for non-synchronizing encoding. + constexpr text operator[](best::bounds::with_location range) const; + + /// # `text::at()` + /// + /// Gets the substring in the given range. Returns `best::none` where + /// `operator[]` would crash. + /// + /// Beware: this check is O(n) for non-synchronizing encoding. + constexpr best::option at(best::bounds range) const; + + /// # `text::at(unsafe)` + /// + /// Gets the substring in the given range, performing no bounds checks.
+ constexpr text at(unsafe u, best::bounds range) const { + // Carry this string's encoding over to the substring. + return text{in_place, span_.at(u, range), enc()}; + } + + /// # `text::starts_with()` + /// + /// Checks whether this string begins with the specified substring or rune. + constexpr bool starts_with(rune r) const { + return trim_prefix(r).has_value(); + } + constexpr bool starts_with(const string_type auto& s) const { + return trim_prefix(s).has_value(); + } + + /// # `text::trim_prefix()` + /// + /// If this string starts with the given prefix, returns a copy of this string + /// with that prefix removed. + constexpr best::option trim_prefix(rune) const; + constexpr best::option trim_prefix(const string_type auto&) const; + + /// # `text::consume_prefix()` + /// + /// If this string starts with the given prefix, returns `true` and updates + /// this string to the result of `trim_prefix()`. Otherwise, returns `false` + /// and leaves this string unchanged. + constexpr bool consume_prefix(rune r) { + auto suffix = trim_prefix(r); + // Unwrap the option: it holds the remainder of the string. + if (suffix) *this = *suffix; + return suffix.has_value(); + } + constexpr bool consume_prefix(const string_type auto& s) { + auto suffix = trim_prefix(s); + // Unwrap the option: it holds the remainder of the string. + if (suffix) *this = *suffix; + return suffix.has_value(); + } + + /// # `text::contains()` + /// + /// Whether this string contains a particular substring or rune. + constexpr bool contains(rune r) const { return !!find(r); } + constexpr bool contains(const string_type auto& s) const { return !!find(s); } + + /// # `text::find()` + /// + /// Finds the first occurrence of a substring or rune within this string. + /// + /// If any invalidly-encoded characters are encountered during the search, in + /// either the haystack or the needle, this function returns `best::none`.
+ constexpr best::option find(rune r) const { + return split_on(r).map([](auto r) { return r.first.size(); }); + } + constexpr best::option find(const string_type auto& s) const { + return split_on(s).map([](auto r) { return r.first.size(); }); + } + constexpr best::option find( + best::callable auto&& p) const { + return split_on(BEST_FWD(p)).map([](auto r) { return r.first.size(); }); + } + + /// # `text::split_at()` + /// + /// Splits this string into two on the given index. If the desired split point + /// is out of bounds, returns `best::none`. + constexpr best::option> split_at(size_t n) const { + auto prefix = at({.end = n}); + if (!prefix) return best::none; + return {{*prefix, operator[]({.start = n})}}; + } + + /// # `text::split_on()` + /// + /// Splits this string into two on the first occurrence of the given substring + /// or rune, or when the callback returns true. If the desired split point + /// is not found, returns `best::none`. + constexpr best::option> split_on(rune) const; + constexpr best::option> split_on( + const string_type auto&) const; + constexpr best::option> split_on( + best::callable auto&& p) const; + + /// # `text::break_off()` + /// + /// Parses the first rune in this string and returns it and a substring with + /// that rune removed. Returns `best::none` if the string is empty. + constexpr best::option> break_off() const; + + /// # `text::rune_iter`, `text::runes()`. + /// + /// An iterator over the runes of a `best::text`. + /// + /// A `best::text` may point to invalidly-text data. If the encoding is + /// self-synchronizing, the stream of Unicode characters is interpreted as + /// replacing each invalid code unit with a Unicode replacement character + /// (U+FFFD). If the encoding is not self-synchronizing, the stream is + /// interpreted to end at that position, with a replacement character. 
+ constexpr rune::iter runes() const { return rune::iter(span_, enc_); } + + /// # `text::operator==` + /// + /// Strings can be compared regardless of encoding, and they may be compared + /// with runes, too. + constexpr bool operator==(rune) const; + constexpr bool operator==(const string_type auto&) const; + constexpr bool operator==(const text&) const = default; + constexpr bool operator==(best::span span) const { + return span_ == span; + } + constexpr bool operator==(const code* lit) const { + return span_ == best::span::from_nul(lit); + } + + // Make this into a best::string_type. + constexpr friend const encoding& BestEncoding(auto, const text& t) { + return t.enc(); + } + + // TODO: BestFmt + template + friend Os& operator<<(Os& os, text str) { + if constexpr (best::same) { + return os << std::string_view(str.data(), str.size()); + } + + char u8[utf8::About.max_codes_per_rune]; + for (rune r : str.runes()) { + if (auto chars = r.encode(u8, utf8{})) { + os << std::string_view(chars->data(), chars->size()); + continue; + } + } + return os; + } + + private: + constexpr explicit text(best::in_place_t, best::span span, + encoding enc) + : span_(span), enc_(std::move(enc)) { + if (span_.data() == nullptr) span_ = {&empty, 0}; + } + + template + static constexpr bool compatible = + best::same()))>>; + + constexpr bool can_memeq(const auto& that) const { + return !std::is_constant_evaluated() && + (best::addr_eq(this, std::addressof(that)) || + best::same_encoding(*this, that)); + } + + constexpr bool byte_comparable(const auto& that) const { + return can_memeq(that) && best::byte_comparable && + E::About.is_lexicographic; + } + + constexpr bool can_memmem(const auto& that) const { + return can_memeq(that) && best::byte_comparable && + E::About.is_self_syncing; + } + + best::span span_{&empty, 0}; + [[no_unique_address]] encoding enc_; + + static constexpr code empty{}; +}; + +/// --- IMPLEMENTATION DETAILS BELOW --- + +template +constexpr best::option> 
text::from(best::span data, + encoding enc) { + if (!rune::validate(data, enc)) { + return best::none; + } + + return text(best::in_place, data, std::move(enc)); +} + +template +constexpr bool text::is_rune_boundary(size_t idx) const { + return rune::is_boundary(*this, idx, enc()); +} + +template +constexpr text text::operator[](best::bounds::with_location range) const { + // First, perform a bounds check. + auto chunk = span_[range]; + + auto at_boundary = is_rune_boundary(range.start) && + is_rune_boundary(range.start + chunk.size()); + + if (!at_boundary) { + crash_internal::crash( + {"string slice operation sliced through the middle of a character: " + "{.start = %zu, .end = %zu}", + range.where}, + range.start, range.start + chunk.size()); + } + + return text{in_place, chunk, enc()}; +} + +template +constexpr option> text::at(best::bounds range) const { + auto chunk = span_.at(range); + if (!chunk) return best::none; + + auto at_boundary = is_rune_boundary(range.start) && + is_rune_boundary(range.start + chunk->size()); + if (!at_boundary) return best::none; + + return text{in_place, *chunk, enc()}; +} + +template +constexpr option> text::trim_prefix(rune r) const { + return break_off().then([=](auto x) -> option { + if (r == x.first) return x.second; + return best::none; + }); +} + +// Strips `str` from the front of this string. When the code units can be +// compared directly (see can_memeq()), the prefix is compared as raw bytes; +// otherwise both strings are decoded and compared rune by rune. +template +constexpr option> text::trim_prefix( + const string_type auto& str) const { + rune::iter needle(str); + rune::iter haystack(*this); + + if constexpr (compatible) { + if (can_memeq(str)) { + auto that = needle.rest(); + auto prefix = span_.at({.end = that.size()}); + if (!prefix) return best::none; + if (!best::equate_bytes(*prefix, that)) return none; + return operator[]({.start = that.size()}); + } + // Bytes cannot be compared directly here (can_memeq() is false, e.g. + // during constant evaluation): fall through to the rune-by-rune + // comparison below instead of reporting a mismatch. + } + + while (auto r1 = needle.next()) { + if (r1 != haystack.next()) return best::none; + } + return text{in_place, haystack.rest(), enc()}; +} + +template +constexpr best::option, text>> text::split_on( + rune r1) const { + if (can_memmem(*this)) { +
code buf[About.max_codes_per_rune]; + auto that = r1.encode(buf, enc()); + if (!that) return best::none; + + auto idx = best::search_bytes(span_, *that); + if (!idx) return best::none; + return {{ + text(in_place, span_[{.end = *idx}], enc()), + text(in_place, span_[{.start = *idx + that->size()}], enc()), + }}; + } + + return split_on([=](rune r2) { return r1 == r2; }); +} + +template +constexpr best::option, text>> text::split_on( + const string_type auto& str) const { + rune::iter haystack_start(*this); + rune::iter needle_start(str); + + if constexpr (compatible) { + if (can_memmem(str)) { + best::span that = needle_start.rest(); + auto idx = best::search_bytes(span_, that); + if (!idx) return best::none; + return {{ + text(in_place, span_[{.end = *idx}], enc()), + text(in_place, span_[{.start = *idx + that.size()}], enc()), + }}; + } + } + + auto haystack = haystack_start; + auto needle = needle_start; + while (auto n = needle.next()) { + if (haystack.next() == n) continue; + + if (!haystack_start.next()) return best::none; + haystack = haystack_start; + needle = needle_start; + } + + size_t start = size() - haystack_start.rest().size(); + size_t end = size() - haystack.rest().size(); + + return {{ + text(in_place, span_[{.end = start}], enc()), + text(in_place, span_[{.start = end}], enc()), + }}; +} + +template +constexpr best::option, text>> text::split_on( + best::callable auto&& pred) const { + best::rune::iter iter(*this); + size_t prev = 0; + while (auto next = iter.next()) { + if (!best::call(BEST_FWD(pred), *next)) { + prev = size() - iter.rest().size(); + continue; + } + + return {{ + text(in_place, span_[{.end = prev}], enc()), + text(in_place, iter.rest(), enc()), + }}; + } + + return best::none; +} + +template +constexpr bool text::operator==(rune r) const { + if (is_empty()) return false; + auto [r2, rest] = *break_off(); + return rest.is_empty() && r == r2; +} + +template +constexpr bool text::operator==(const string_type auto& s) const { + 
return trim_prefix(s).has_value(&text::is_empty); +} + +template +constexpr best::option>> text::break_off() const { + if (is_empty()) return best::none; + + auto suffix = *this; + auto rune = rune::decode(&suffix.span_, enc()); + return {{*rune, suffix}}; +} +} // namespace best + +#endif // BEST_TEXT_STR_H_ \ No newline at end of file diff --git a/best/text/str_test.cc b/best/text/str_test.cc new file mode 100644 index 0000000..0f771f6 --- /dev/null +++ b/best/text/str_test.cc @@ -0,0 +1,204 @@ +#include "best/text/str.h" + +#include "best/container/vec.h" +#include "best/test/test.h" + +namespace best::str_test { + +best::test Empty = [](auto& t) { + best::str s1; + t.expect_eq(s1, ""); + t.expect_eq(s1.size(), 0); + t.expect(s1.is_empty()); + + best::str s2 = ""; + t.expect_eq(s2, ""); + t.expect_eq(s2.size(), 0); + t.expect(s2.is_empty()); + + best::str s3 = *best::str::from_nul(nullptr); + t.expect_eq(s3, ""); + t.expect_eq(s3.size(), 0); + t.expect(s3.is_empty()); + + best::str s4 = *best::str::from_nul(""); + t.expect_eq(s4, ""); + t.expect_eq(s4.size(), 0); + t.expect(s4.is_empty()); +}; + +best::test Size = [](auto& t) { + best::str s = "foo"; + t.expect_eq(s.size(), 3); + t.expect(!s.is_empty()); + + best::str s2 = "foo\0foo"; + t.expect_eq(s2.size(), 7); +}; + +best::test Eq = [](auto& t) { + best::str test = "solomonπŸ§ΆπŸˆβ€β¬›ι»’ηŒ«"; + t.expect_eq(test, test); + t.expect_eq(test, "solomonπŸ§ΆπŸˆβ€β¬›ι»’ηŒ«"); + t.expect_eq(test, (const char*)"solomonπŸ§ΆπŸˆβ€β¬›ι»’ηŒ«"); + t.expect_eq(test, std::string_view("solomonπŸ§ΆπŸˆβ€β¬›ι»’ηŒ«")); + + t.expect_ne(test, best::str("solomon")); + t.expect_ne(test, "solomon"); + t.expect_ne(test, (const char*)"solomon"); + t.expect_ne(test, std::string_view("solomon")); +}; + +best::test Utf8Decode = [](auto& t) { + best::str test = "solomonπŸ§ΆπŸˆβ€β¬›ι»’ηŒ«"; + t.expect_eq(test.size(), 27); + + best::vec runes; + for (rune r : test.runes()) { + runes.push(r); + } + + t.expect_eq(runes, + 
best::span{'s', 'o', 'l', 'o', 'm', 'o', 'n', U'🧢', + U'🐈', 0x200d, U'⬛', U'ι»’', U'猫'}); +}; + +best::test Utf16Decode = [](auto& t) { + best::str16 test = u"solomonπŸ§ΆπŸˆβ€β¬›ι»’ηŒ«"; + t.expect_eq(test.size(), 15); + best::vec runes; + for (rune r : test.runes()) { + runes.push(r); + } + + t.expect_eq(runes, + best::span{'s', 'o', 'l', 'o', 'm', 'o', 'n', U'🧢', + U'🐈', 0x200d, U'⬛', U'ι»’', U'猫'}); +}; + +best::test Affix = [](auto& t) { + best::str haystack = "a complicated string. see solomon: πŸˆβ€β¬›"; + + t.expect(haystack.starts_with("a complicated string")); + t.expect(!haystack.starts_with("complicated string")); + t.expect(haystack.starts_with(u"a complicated string")); + t.expect(!haystack.starts_with(u"complicated string")); + t.expect(haystack.starts_with(str("a complicated string"))); + t.expect(!haystack.starts_with(str("complicated string"))); + t.expect(haystack.starts_with(str16(u"a complicated string"))); + t.expect(!haystack.starts_with(str16(u"complicated string"))); + + t.expect(haystack.starts_with('a')); + t.expect(!haystack.starts_with('z')); + t.expect(!haystack.starts_with(U'🧢')); +}; + +best::test Contains = [](auto& t) { + best::str haystack = "a complicated string. see solomon: πŸˆβ€β¬›"; + + t.expect(haystack.contains("solomon")); + t.expect(!haystack.contains("daisy")); + t.expect(haystack.contains(u"solomon")); + t.expect(!haystack.contains(u"daisy")); + + t.expect(haystack.contains(U'🐈')); + t.expect(!haystack.contains('z')); + t.expect(!haystack.contains(U'🍣')); + t.expect(haystack.contains(U"πŸˆβ€β¬›")); +}; + +best::test Find = [](auto& t) { + best::str haystack = "a complicated string. 
see solomon: πŸˆβ€β¬›"; + + t.expect_eq(haystack.find("solomon"), 26); + t.expect_eq(haystack.find("daisy"), best::none); + t.expect_eq(haystack.find(u"solomon"), 26); + t.expect_eq(haystack.find(u"daisy"), best::none); + + t.expect_eq(haystack.find(U'🐈'), 35); + t.expect_eq(haystack.find('z'), best::none); + t.expect_eq(haystack.find(U'🍣'), best::none); + t.expect_eq(haystack.find(U"πŸˆβ€β¬›"), 35); + + t.expect_eq(haystack.find(&rune::is_ascii_punct), 20); +}; + +best::test Find16 = [](auto& t) { + best::str16 haystack = u"a complicated string. see solomon: πŸˆβ€β¬›"; + + t.expect_eq(haystack.find("solomon"), 26); + t.expect_eq(haystack.find("daisy"), best::none); + t.expect_eq(haystack.find(u"solomon"), 26); + t.expect_eq(haystack.find(u"daisy"), best::none); + + t.expect_eq(haystack.find(U'🐈'), 35); + t.expect_eq(haystack.find('z'), best::none); + t.expect_eq(haystack.find(U'🍣'), best::none); + t.expect_eq(haystack.find(U"πŸˆβ€β¬›"), 35); + + t.expect_eq(haystack.find(&rune::is_ascii_punct), 20); +}; + +best::test SplitAt = [](auto& t) { + best::str test = "ι»’ηŒ«"; + + t.expect_eq(test.split_at(0), std::pair{"", "ι»’ηŒ«"}); + t.expect_eq(test.split_at(1), best::none); + t.expect_eq(test.split_at(2), best::none); + t.expect_eq(test.split_at(3), std::pair{"ι»’", "猫"}); + t.expect_eq(test.split_at(4), best::none); + t.expect_eq(test.split_at(5), best::none); + t.expect_eq(test.split_at(6), std::pair{"ι»’ηŒ«", ""}); + + test = "πŸˆβ€β¬›"; + + t.expect_eq(test.split_at(0), std::pair{"", "πŸˆβ€β¬›"}); + t.expect_eq(test.split_at(1), best::none); + t.expect_eq(test.split_at(2), best::none); + t.expect_eq(test.split_at(3), best::none); + t.expect_eq(test.split_at(4), std::pair{"🐈", "\u200d⬛"}); + t.expect_eq(test.split_at(5), best::none); + t.expect_eq(test.split_at(6), best::none); + t.expect_eq(test.split_at(7), std::pair{"🐈\u200d", "⬛"}); + t.expect_eq(test.split_at(8), best::none); + t.expect_eq(test.split_at(9), best::none); + 
t.expect_eq(test.split_at(10), std::pair{"πŸˆβ€β¬›", ""}); +}; + +best::test SplitAt16 = [](auto& t) { + best::str16 test = u"ι»’ηŒ«"; + + t.expect_eq(test.split_at(0), std::pair{u"", u"ι»’ηŒ«"}); + t.expect_eq(test.split_at(1), std::pair{u"ι»’", u"猫"}); + t.expect_eq(test.split_at(2), std::pair{u"ι»’ηŒ«", u""}); + + test = u"πŸˆβ€β¬›"; + + t.expect_eq(test.split_at(0), std::pair{u"", u"πŸˆβ€β¬›"}); + t.expect_eq(test.split_at(1), best::none); + t.expect_eq(test.split_at(2), std::pair{u"🐈", u"\u200d⬛"}); + t.expect_eq(test.split_at(3), std::pair{u"🐈\u200d", u"⬛"}); + t.expect_eq(test.split_at(4), std::pair{u"πŸˆβ€β¬›", u""}); +}; + +best::test SplitOn = [](auto& t) { + best::str haystack = "a complicated string. see solomon: πŸˆβ€β¬›"; + + t.expect_eq(haystack.split_on("solomon"), + std::pair{"a complicated string. see ", ": πŸˆβ€β¬›"}); + t.expect_eq(haystack.split_on("daisy"), best::none); + t.expect_eq(haystack.split_on(u"solomon"), + std::pair{"a complicated string. see ", ": πŸˆβ€β¬›"}); + t.expect_eq(haystack.split_on(u"daisy"), best::none); + + t.expect_eq(haystack.split_on(U'🐈'), + std::pair{"a complicated string. see solomon: ", "\u200d⬛"}); + t.expect_eq(haystack.split_on('z'), best::none); + t.expect_eq(haystack.split_on(U'🍣'), best::none); + t.expect_eq(haystack.split_on(U"πŸˆβ€β¬›"), + std::pair{"a complicated string. see solomon: ", ""}); + + t.expect_eq(haystack.split_on(&rune::is_ascii_punct), + std::pair{"a complicated string", " see solomon: πŸˆβ€β¬›"}); +}; +} // namespace best::str_test \ No newline at end of file diff --git a/best/text/utf.h b/best/text/utf.h new file mode 100644 index 0000000..5d1c142 --- /dev/null +++ b/best/text/utf.h @@ -0,0 +1,228 @@ +#ifndef BEST_TEXT_UTF_H_ +#define BEST_TEXT_UTF_H_ + +#include + +#include "best/container/option.h" +#include "best/container/span.h" +#include "best/text/internal/utf.h" +#include "best/text/rune.h" + +//! Encodings for the "Unicode Transformation Formats". +//! +//! 
This header defines the UTF-8, UTF-16, UTF-32, and WTF-8 encodings.
+
+namespace best {
+/// # `best::utf8`
+///
+/// A `best::encoding` representing UTF-8.
+///
+/// Code units are `char`s; a single rune takes at most four of them.
+struct utf8 final {
+  using code = char;  // Not char8_t because the standard messed up.
+  static constexpr best::encoding_about About{
+    .max_codes_per_rune = 4,
+    .is_self_syncing = true,
+    .is_lexicographic = true,
+  };
+
+  /// # `utf8::is_boundary()`
+  ///
+  /// Returns whether a rune may start at offset `idx` of `input`: either
+  /// `idx` is exactly the end of `input`, or the code unit found there does
+  /// not have exactly one leading one bit (the `10xxxxxx` shape of a UTF-8
+  /// continuation byte).
+  static constexpr bool is_boundary(best::span input, size_t idx) {
+    if (input.size() == idx) return true;
+
+    return input.at(idx).has_value(
+      [](char c) { return best::leading_ones(c) != 1; });
+  }
+
+  /// # `utf8::encode()`
+  ///
+  /// Writes `rune` to the front of `*output`. On success, `*output` is
+  /// re-sliced to start after the value reported by the internal encoder and
+  /// `true` is returned; on failure `*output` is untouched and `false` is
+  /// returned.
+  static constexpr bool encode(best::span* output, rune rune) {
+    auto written = best::utf_internal::encode8(*output, rune);
+    if (!written) return false;
+
+    *output = (*output)[{.start = *written}];
+    return true;
+  }
+
+  /// # `utf8::decode()`
+  ///
+  /// Reads one rune from the front of `*input`. On a successful low-level
+  /// decode, `*input` is advanced by `first` code units and `second` is
+  /// validated through `rune::from_int()`; otherwise `best::none` is returned
+  /// and `*input` is untouched.
+  static constexpr best::option decode(best::span* input) {
+    auto decoded = best::utf_internal::decode8(*input);
+    if (!decoded) return best::none;
+
+    *input = (*input)[{.start = decoded->first}];
+    return rune::from_int(decoded->second);
+  }
+
+  /// # `utf8::undecode()`
+  ///
+  /// The mirror image of `decode()`: reads one rune from the back of
+  /// `*input`, trimming `first` code units off its end on success.
+  static constexpr best::option undecode(best::span* input) {
+    auto decoded = best::utf_internal::undecode8(*input);
+    if (!decoded) return best::none;
+
+    *input = (*input)[{.end = input->size() - decoded->first}];
+    return rune::from_int(decoded->second);
+  }
+
+  constexpr bool operator==(const utf8&) const = default;
+};
+
+/// # `best::wtf8`
+///
+/// A `best::encoding` representing WTF-8 (Wobbly Transformation Format).
+///
+/// Its only difference with UTF-8 is that it allows decoded runes to be
+/// unpaired surrogates (in the range U+D800 to U+DFFF).
+struct wtf8 final {
+  using code = char;  // Not char8_t because the standard messed up.
+  static constexpr best::encoding_about About{
+    .max_codes_per_rune = 4,
+    .is_self_syncing = true,
+    .is_lexicographic = true,
+  };
+
+  /// # `wtf8::is_boundary()`
+  ///
+  /// Same rule as `utf8::is_boundary()`, to which this forwards.
+  static constexpr bool is_boundary(best::span input, size_t idx) {
+    return utf8::is_boundary(input, idx);
+  }
+
+  /// # `wtf8::encode()`
+  ///
+  /// Forwards to `utf8::encode()`.
+  static constexpr bool encode(best::span* output, rune rune) {
+    return utf8::encode(output, rune);
+  }
+
+  /// # `wtf8::decode()`
+  ///
+  /// Like `utf8::decode()`, except the decoded value is validated with
+  /// `rune::from_int_allow_surrogates()` rather than `rune::from_int()`.
+  static constexpr best::option decode(best::span* input) {
+    auto decoded = best::utf_internal::decode8(*input);
+    if (!decoded) return best::none;
+
+    *input = (*input)[{.start = decoded->first}];
+    return rune::from_int_allow_surrogates(decoded->second);
+  }
+
+  /// # `wtf8::undecode()`
+  ///
+  /// Like `utf8::undecode()`, except the decoded value is validated with
+  /// `rune::from_int_allow_surrogates()` rather than `rune::from_int()`.
+  static constexpr best::option undecode(best::span* input) {
+    auto decoded = best::utf_internal::undecode8(*input);
+    if (!decoded) return best::none;
+
+    *input = (*input)[{.end = input->size() - decoded->first}];
+    return rune::from_int_allow_surrogates(decoded->second);
+  }
+
+  constexpr bool operator==(const wtf8&) const = default;
+};
+
+/// # `best::utf16`
+///
+/// A best::encoding representing UTF-16.
+struct utf16 final {
+  /// The code unit type: one rune is one or two `char16_t`s.
+  using code = char16_t;
+  static constexpr best::encoding_about About{
+    .max_codes_per_rune = 2,
+    .is_self_syncing = true,
+  };
+
+  /// # `utf16::is_boundary()`
+  ///
+  /// Returns whether a rune may start at offset `idx` of `input`: either
+  /// `idx` is exactly the end of `input`, or the code unit found there,
+  /// viewed as a rune via `rune::from_int_allow_surrogates()`, is not a low
+  /// surrogate (a low surrogate is the second half of a surrogate pair).
+  static constexpr bool is_boundary(best::span input,
+                                    size_t idx) {
+    return input.size() == idx ||
+           input.at(idx)
+             .then([](char16_t c) {
+               return rune::from_int_allow_surrogates(c);
+             })
+             .has_value([](rune r) { return !r.is_low_surrogate(); });
+  }
+
+  /// # `utf16::encode()`
+  ///
+  /// Writes `rune` to the front of `*output`. On success, `*output` is
+  /// re-sliced to start after the value reported by the internal encoder and
+  /// `true` is returned; on failure `*output` is untouched and `false` is
+  /// returned.
+  static constexpr bool encode(best::span* output, rune rune) {
+    if (auto result = best::utf_internal::encode16(*output, rune)) {
+      *output = (*output)[{.start = *result}];
+      return true;
+    }
+    return false;
+  }
+
+  /// # `utf16::decode()`
+  ///
+  /// Reads one rune from the front of `*input`. On a successful low-level
+  /// decode, `*input` is advanced by `first` code units and `second` is
+  /// validated through `rune::from_int()`; otherwise `best::none` is returned
+  /// and `*input` is untouched.
+  static constexpr best::option decode(
+    best::span* input) {
+    if (auto result = best::utf_internal::decode16(*input)) {
+      *input = (*input)[{.start = result->first}];
+      return rune::from_int(result->second);
+    }
+    return best::none;
+  }
+
+  /// # `utf16::undecode()`
+  ///
+  /// The mirror image of `decode()`: reads one rune from the back of
+  /// `*input`, trimming `first` code units off its end on success.
+  static constexpr best::option undecode(
+    best::span* input) {
+    if (auto result = best::utf_internal::undecode16(*input)) {
+      *input = (*input)[{.end = input->size() - result->first}];
+      return rune::from_int(result->second);
+    }
+    return best::none;
+  }
+
+  // Spelled `constexpr` to match the other encodings in this header.
+  constexpr bool operator==(const utf16&) const = default;
+};
+
+/// # `best::utf32`
+///
+/// A best::encoding representing UTF-32.
+struct utf32 final { + using code = char32_t; + static constexpr best::encoding_about About{ + .max_codes_per_rune = 1, + .is_self_syncing = true, + .is_lexicographic = true, + }; + + static constexpr bool is_boundary(best::span input, + size_t idx) { + return idx <= input.size(); + } + + static constexpr bool encode(best::span* output, rune rune) { + if (auto next = output->take_first(1)) { + (*next)[0] = rune; + return true; + } + return false; + } + + static constexpr best::option decode( + best::span* input) { + if (auto next = input->take_first(1)) { + return rune::from_int((*next)[0]); + } + return best::none; + } + + static constexpr best::option undecode( + best::span* input) { + if (auto next = input->take_last(1)) { + return rune::from_int((*next)[0]); + } + return best::none; + } + + constexpr bool operator==(const utf32&) const = default; +}; + +constexpr const utf8& BestEncoding(auto, const std::string&) { + return best::val::value; +} +constexpr const utf8& BestEncoding(auto, const std::string_view&) { + return best::val::value; +} +template +constexpr const utf8& BestEncoding(auto, const char (&)[n]) { + return best::val::value; +} + +constexpr const utf16& BestEncoding(auto, const std::u16string&) { + return best::val::value; +} +constexpr const utf16& BestEncoding(auto, const std::u16string_view&) { + return best::val::value; +} +template +constexpr const utf16& BestEncoding(auto, const char16_t (&)[n]) { + return best::val::value; +} + +constexpr const utf32& BestEncoding(auto, const std::u32string&) { + return best::val::value; +} +constexpr const utf32& BestEncoding(auto, const std::u32string_view&) { + return best::val::value; +} +template +constexpr const utf32& BestEncoding(auto, const char32_t (&)[n]) { + return best::val::value; +} + +} // namespace best + +#endif // BEST_TEXT_UTF_H_ \ No newline at end of file diff --git a/best/text/utf_test.cc b/best/text/utf_test.cc new file mode 100644 index 0000000..88723fd --- /dev/null +++ 
b/best/text/utf_test.cc @@ -0,0 +1,111 @@ +#include "best/text/utf.h" + +#include "best/test/test.h" +#include "best/text/rune.h" + +namespace best::utf_test { +best::test Utf8Encode = [](auto& t) { + using S = best::span; + char buf[4]; + + t.expect_eq(rune('\0').encode(buf), S{0}); + t.expect_eq(rune('a').encode(buf), S{'a'}); + t.expect_eq(rune(0x7f).encode(buf), S{0x7f}); + t.expect_eq(rune(u'Β΅').encode(buf), S{0b110'00010, 0b10'110101}); + t.expect_eq(rune(u'猫').encode(buf), + S{0b1110'0111, 0b10'001100, 0b10'101011}); + t.expect_eq(rune(U'🧢').encode(buf), + S{0b11110'000, 0b10'011111, 0b10'100111, 0b10'110110}); +}; + +best::test Utf8Decode = [](auto& t) { + using S = best::span; + + t.expect_eq(rune::decode(S{0}), '\0'); + t.expect_eq(rune::decode(S{'a'}), 'a'); + t.expect_eq(rune::decode(S{0x7f}), 0x7f); + t.expect_eq(rune::decode(S{0b110'00010, 0b10'110101}), u'Β΅'); + t.expect_eq(rune::decode(S{0b1110'0111, 0b10'001100, 0b10'101011}), u'猫'); + t.expect_eq( + rune::decode(S{0b11110'000, 0b10'011111, 0b10'100111, 0b10'110110}, + utf8{}), + U'🧢'); + + // Over-long encodings are forbidden. + t.expect_eq(rune::decode(S{0b1100'0000, 0b1000'0000}), best::none); + + // Encoding unpaired surrogates is forbidden. + t.expect_eq(rune::decode(S{0b1110'1101, 0b1010'0001, 0b1011'0111}), + best::none); + // But wtf8 is ok with that. + t.expect_eq(rune::decode(S{0b1110'1101, 0b1010'0001, 0b1011'0111}, wtf8{}), + 0xd877); + + // This is the largest value accepted by utf8 and wtf8. 
+ t.expect_eq( + rune::decode(S{0b1111'0100, 0b1000'1111, 0b1011'1111, 0b1011'1111}, + utf8{}), + 0x10ffff); + t.expect_eq( + rune::decode(S{0b1111'0100, 0b1000'1111, 0b1011'1111, 0b1011'1111}, + wtf8{}), + 0x10ffff); + + t.expect_eq( + rune::decode(S{0b1111'0100, 0b1001'0000, 0b1000'0000, 0b1000'0000}, + utf8{}), + best::none); + t.expect_eq( + rune::decode(S{0b1111'0100, 0b1001'0000, 0b1000'0000, 0b1000'0000}, + wtf8{}), + best::none); +}; + +best::test Utf16Encode = [](auto& t) { + using S = best::span; + char16_t buf[2]; + + t.expect_eq(rune('\0').encode(buf), S{0}); + t.expect_eq(rune('a').encode(buf), S{'a'}); + t.expect_eq(rune(0x7f).encode(buf), S{0x7f}); + t.expect_eq(rune(u'Β΅').encode(buf), S{u'Β΅'}); + t.expect_eq(rune(u'猫').encode(buf), S{u'猫'}); + t.expect_eq(rune(U'🧢').encode(buf), + S{0b1101100000111110, 0b1101110111110110}); +}; + +best::test Utf16Decode = [](auto& t) { + using S = best::span; + + t.expect_eq(rune::decode(S{0}), '\0'); + t.expect_eq(rune::decode(S{'a'}), 'a'); + t.expect_eq(rune::decode(S{0x7f}), 0x7f); + t.expect_eq(rune::decode(S{u'Β΅'}), u'Β΅'); + t.expect_eq(rune::decode(S{u'猫'}), u'猫'); + t.expect_eq(rune::decode(S{0b1101100000111110, 0b1101110111110110}), + U'🧢'); +}; + +best::test Utf32Encode = [](auto& t) { + using S = best::span; + char32_t buf[1]; + + t.expect_eq(rune('\0').encode(buf), S{0}); + t.expect_eq(rune('a').encode(buf), S{'a'}); + t.expect_eq(rune(0x7f).encode(buf), S{0x7f}); + t.expect_eq(rune(u'Β΅').encode(buf), S{u'Β΅'}); + t.expect_eq(rune(u'猫').encode(buf), S{u'猫'}); + t.expect_eq(rune(U'🧢').encode(buf), S{U'🧢'}); +}; + +best::test Utf32Decode = [](auto& t) { + using S = best::span; + + t.expect_eq(rune::decode(S{0}), '\0'); + t.expect_eq(rune::decode(S{'a'}), 'a'); + t.expect_eq(rune::decode(S{0x7f}), 0x7f); + t.expect_eq(rune::decode(S{u'Β΅'}), u'Β΅'); + t.expect_eq(rune::decode(S{u'猫'}), u'猫'); + t.expect_eq(rune::decode(S{U'🧢'}), U'🧢'); +}; +} // namespace best::utf_test \ No newline at end of file