diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b1fadd..9e0cdbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ if(${Python3_Interpreter_FOUND}) set(UNICODE_GENERATED_FILES ${CMAKE_CURRENT_LIST_DIR}/lib/src/unicode_mappings.cpp - ${CMAKE_CURRENT_LIST_DIR}/lib/inc/sys_string/impl/unicode/mappings_params.h + ${CMAKE_CURRENT_LIST_DIR}/lib/inc/sys_string/impl/unicode/mappings.h ${CMAKE_CURRENT_LIST_DIR}/test/test_grapheme_data.h ) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index a85ffc6..d127f68 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -94,7 +94,6 @@ set(UNICODE_FILES ${SRCDIR}/inc/sys_string/impl/unicode/utf_util.h ${SRCDIR}/inc/sys_string/impl/unicode/algorithms.h ${SRCDIR}/inc/sys_string/impl/unicode/mappings.h - ${SRCDIR}/inc/sys_string/impl/unicode/mappings_params.h ) set(SOURCES ${SRCDIR}/src/unicode_mappings.cpp diff --git a/lib/inc/sys_string/impl/unicode/algorithms.h b/lib/inc/sys_string/impl/unicode/algorithms.h index ad00669..872f93e 100644 --- a/lib/inc/sys_string/impl/unicode/algorithms.h +++ b/lib/inc/sys_string/impl/unicode/algorithms.h @@ -62,19 +62,19 @@ namespace sysstr { auto operator()(char32_t c) const noexcept -> bool { - return util::unicode::isspace(c); + return util::unicode::is_whitespace::test(c); } }; template struct casefold { - static constexpr auto max_output_length = util::unicode::mapper::max_mapped_length; + static constexpr auto max_output_length = util::unicode::case_fold_mapper::max_mapped_length; template> OutIt> auto operator()(char32_t c, OutIt dest) const noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt { - return util::unicode::mapper::case_fold.map_char(c, dest); + return util::unicode::case_fold_mapper::map_char(c, dest); } }; @@ -132,7 +132,7 @@ namespace sysstr auto c = *first; if (c != U'\u03A3') // not Σ { - dest = mapper::to_lower_case.map_char(c, dest); + dest = to_lower_case_mapper::map_char(c, dest); } else { @@ -161,7 +161,7 @@ namespace sysstr for( ; first != last; ++first) { auto c = *first; - dest = mapper::to_upper_case.map_char(c, dest); + dest = to_upper_case_mapper::map_char(c, dest); } return dest; diff --git a/lib/inc/sys_string/impl/unicode/mappings.h b/lib/inc/sys_string/impl/unicode/mappings.h index 726773b..7120aa1 100644 --- a/lib/inc/sys_string/impl/unicode/mappings.h +++ b/lib/inc/sys_string/impl/unicode/mappings.h @@ -1,3 +1,5 @@ +//THIS FILE IS GENERATED. PLEASE DO NOT EDIT DIRECTLY + // // Copyright 2020 Eugene Gershnik // @@ -9,105 +11,256 @@ #define HEADER_SYS_STRING_UNICODE_MAPPINGS_H_INCLUDED #include - -#include - + #include #include #include +#include +#include -namespace sysstr::util +namespace sysstr::util::unicode { - namespace unicode + struct char_lookup + { + char32_t offset:11; + char32_t value:21; + }; + + template + class mapper { - class mapper : public mapper_data + public: + template + static auto map_char(char32_t src, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt { - public: - static const mapper case_fold; - static const mapper to_lower_case; - static const mapper to_upper_case; - - template - auto map_char(char32_t src, OutIt dest) const noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt + const char_lookup * const lookup_start = Derived::source_chars.data(); + const char_lookup * const lookup_end = lookup_start + Derived::source_chars.size() - 1; + const char16_t * const mapped = Derived::mapped_chars; + + auto lower = std::lower_bound(lookup_start, lookup_end, char_lookup{0, src}, [](char_lookup lhs, char_lookup rhs) { + return lhs.value < rhs.value; + }); + if (lower == lookup_end || lower->value != src) + return write(src, dest); + auto start = lower->offset; + auto end = (++lower)->offset; //safe - there is one behind end + return write(mapped + start, mapped + end, dest); + } + private: + template + static auto write(char32_t c, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt + { + if constexpr (Enc == utf32) { - auto lower = std::lower_bound(m_lookup_start, m_lookup_end, char_lookup{0, src}, [](char_lookup lhs, char_lookup rhs) { - return lhs.value < rhs.value; - }); - if (lower == m_lookup_end || lower->value != src) - return write(src, dest); - auto start = lower->offset; - auto end = (++lower)->offset; //safe - there is one behind end - return write(m_mapped + start, m_mapped + end, dest); + *dest++ = c; + return dest; } - private: - constexpr mapper(const char_lookup * lookup_start, size_t lookup_len, const char16_t * mapped) noexcept: - m_lookup_start(lookup_start), - m_lookup_end(lookup_start + lookup_len - 1), - m_mapped(mapped) - {} - template - auto write(char32_t c, OutIt dest) const noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt + else { - if constexpr (Enc == utf32) - { - *dest++ = c; - return dest; - } - else - { - utf_codepoint_encoder encoder; - encoder.put(c); - return std::copy(encoder.begin(), encoder.end(), dest); - } + utf_codepoint_encoder encoder; + encoder.put(c); + return std::copy(encoder.begin(), encoder.end(), dest); } - template - auto write(const char16_t * begin, const char16_t * end, OutIt dest) const noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt + } + + template + static auto write(const char16_t * begin, const char16_t * end, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt + { + if constexpr (Enc == utf16) { - if constexpr (Enc == utf16) - { - return std::copy(begin ,end, dest); - } - else + return std::copy(begin ,end, dest); + } + else + { + utf_codepoint_decoder decoder; + while(begin != end) { - utf_codepoint_decoder decoder; - while(begin != end) - { - decoder.put(*begin++); - if (!decoder.done()) - decoder.put(m_mapped[*begin++]); //no need to bounds check, we know end is good - dest = write(decoder.value(), dest); - } - return dest; + decoder.put(*begin++); + if (!decoder.done()) + decoder.put(*begin++); //no need to bounds check, we know end is good + dest = write(decoder.value(), dest); } + return dest; } - private: - const char_lookup * m_lookup_start; - const char_lookup * m_lookup_end; - const char16_t * m_mapped; - }; + } + }; - const inline mapper mapper::case_fold{case_fold_data::source_chars, case_fold_data::source_chars_len, case_fold_data::chars}; - const inline mapper mapper::to_lower_case(to_lower_case_data::source_chars, to_lower_case_data::source_chars_len, to_lower_case_data::chars); - const inline mapper mapper::to_upper_case(to_upper_case_data::source_chars, to_upper_case_data::source_chars_len, to_upper_case_data::chars); + class case_fold_mapper : public mapper + { + friend mapper; + private: + static const std::array source_chars; + static const char16_t mapped_chars[1960]; + public: + static constexpr size_t max_mapped_length = 3; + static constexpr size_t data_size = sizeof(source_chars) + sizeof(mapped_chars); + }; + + + class to_lower_case_mapper : public mapper + { + friend mapper; + private: + static const std::array source_chars; + static const char16_t mapped_chars[1744]; + public: + static constexpr size_t max_mapped_length = 2; + static constexpr size_t data_size = sizeof(source_chars) + sizeof(mapped_chars); + }; - inline bool isspace(char32_t c) noexcept + + class to_upper_case_mapper : public mapper + { + friend mapper; + private: + static const std::array source_chars; + static const char16_t mapped_chars[1953]; + public: + static constexpr size_t max_mapped_length = 3; + static constexpr size_t data_size = sizeof(source_chars) + sizeof(mapped_chars); + }; + + + template + class lookup + { + public: + static bool test(char32_t c) noexcept { - extern const char16_t whitespaces[]; - if (c > 0xFFFFu) + if (c > Derived::max_char) return false; - for(auto p = whitespaces; *p; ++p) + for(auto p = Derived::chars; *p; ++p) if (*p == c) return true; return false; } + }; - - using case_prop = case_prop_lookup; - using grapheme_cluster_break_prop = grapheme_cluster_break_lookup; - } -} + class is_whitespace : public lookup + { + friend lookup; + private: + static const char16_t chars[26]; + public: + static constexpr char32_t max_char = U'\u3000'; + + static constexpr size_t data_size = sizeof(chars); + }; + template + class prop_lookup + { + public: + static auto get(char32_t c) noexcept + { + size_t idx = Derived::values.size(); + + { + int char_idx = (c >> 20) & 0xF; + auto & entry = Derived::entries[idx]; + idx = entry[char_idx]; + } + + { + int char_idx = (c >> 16) & 0xF; + auto & entry = Derived::entries[idx]; + idx = entry[char_idx]; + } + + { + int char_idx = (c >> 12) & 0xF; + auto & entry = Derived::entries[idx]; + idx = entry[char_idx]; + } + + { + int char_idx = (c >> 8) & 0xF; + auto & entry = Derived::entries[idx]; + idx = entry[char_idx]; + } + + { + int char_idx = (c >> 4) & 0xF; + auto & entry = Derived::entries[idx]; + idx = entry[char_idx]; + } + + { + int char_idx = (c >> 0) & 0xF; + auto & entry = Derived::entries[idx]; + idx = entry[char_idx]; + } + + assert(idx < Derived::values.size()); + return typename Derived::value(Derived::values[idx]); + } + }; + + + class case_prop : public prop_lookup + { + friend prop_lookup; + private: + using entry_type = std::array; + using value_type = uint8_t; + + static const std::array entries; + + static const std::array values; + + public: + enum value : value_type + { + none = 0, + cased = 1, + case_ignorable = 2 + }; + + static constexpr size_t data_size = sizeof(entries) + sizeof(values); + }; + + + class grapheme_cluster_break_prop : public prop_lookup + { + friend prop_lookup; + private: + using entry_type = std::array; + using value_type = uint8_t; + + static const std::array entries; + + static const std::array values; + + public: + enum value : value_type + { + none = 0, + control = 1, + extend = 2, + regional_indicator = 3, + prepend = 4, + spacing_mark = 5, + hangul_l = 6, + hangul_v = 7, + hangul_t = 8, + hangul_lv = 9, + hangul_lvt = 10, + extended_pictographic = 11, + in_cb_consonant = 16, + in_cb_extend = 32, + in_cb_linker = 48, + + basic_mask = 15, + in_cb_mask = 48 + }; + + static constexpr size_t data_size = sizeof(entries) + sizeof(values); + }; + +} + #endif + diff --git a/lib/inc/sys_string/impl/unicode/mappings_params.h b/lib/inc/sys_string/impl/unicode/mappings_params.h deleted file mode 100644 index eeeced3..0000000 --- a/lib/inc/sys_string/impl/unicode/mappings_params.h +++ /dev/null @@ -1,166 +0,0 @@ -//THIS FILE IS GENERATED. PLEASE DO NOT EDIT DIRECTLY - -// -// Copyright 2020 Eugene Gershnik -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file or at -// https://github.com/gershnik/sys_string/blob/master/LICENSE -// -#ifndef HEADER_SYS_STRING_UNICODE_MAPPINGS_DATA_H_INCLUDED -#define HEADER_SYS_STRING_UNICODE_MAPPINGS_DATA_H_INCLUDED - -#include -#include - -namespace sysstr::util::unicode -{ - struct mapper_data - { - static constexpr size_t max_mapped_length = 3; - }; - - struct char_lookup - { - char32_t offset:11; - char32_t value:21; - }; - - struct case_fold_data - { - static const char_lookup source_chars[]; - static constexpr size_t source_chars_len = 1557; - static const char16_t chars[]; - }; - - - struct to_lower_case_data - { - static const char_lookup source_chars[]; - static constexpr size_t source_chars_len = 1460; - static const char16_t chars[]; - }; - - - struct to_upper_case_data - { - static const char_lookup source_chars[]; - static constexpr size_t source_chars_len = 1552; - static const char16_t chars[]; - }; - - - template - class prop_lookup - { - public: - static auto get(char32_t c) noexcept - { - size_t idx = Derived::values.size(); - - { - int char_idx = (c >> 20) & 0xF; - auto & entry = Derived::entries[idx]; - idx = entry[char_idx]; - } - - { - int char_idx = (c >> 16) & 0xF; - auto & entry = Derived::entries[idx]; - idx = entry[char_idx]; - } - - { - int char_idx = (c >> 12) & 0xF; - auto & entry = Derived::entries[idx]; - idx = entry[char_idx]; - } - - { - int char_idx = (c >> 8) & 0xF; - auto & entry = Derived::entries[idx]; - idx = entry[char_idx]; - } - - { - int char_idx = (c >> 4) & 0xF; - auto & entry = Derived::entries[idx]; - idx = entry[char_idx]; - } - - { - int char_idx = (c >> 0) & 0xF; - auto & entry = Derived::entries[idx]; - idx = entry[char_idx]; - } - - assert(idx < Derived::values.size()); - return typename Derived::value(Derived::values[idx]); - } - }; - - - class case_prop_lookup : public prop_lookup - { - friend prop_lookup; - private: - using entry_type = std::array; - using value_type = uint8_t; - - static const std::array entries; - - static const std::array values; - - public: - enum value : value_type - { - none = 0, - cased = 1, - case_ignorable = 2 - }; - - static constexpr size_t data_size = sizeof(entries) + sizeof(values); - }; - - - class grapheme_cluster_break_lookup : public prop_lookup - { - friend prop_lookup; - private: - using entry_type = std::array; - using value_type = uint8_t; - - static const std::array entries; - - static const std::array values; - - public: - enum value : value_type - { - none = 0, - control = 1, - extend = 2, - regional_indicator = 3, - prepend = 4, - spacing_mark = 5, - hangul_l = 6, - hangul_v = 7, - hangul_t = 8, - hangul_lv = 9, - hangul_lvt = 10, - extended_pictographic = 11, - in_cb_consonant = 16, - in_cb_extend = 32, - in_cb_linker = 48, - - basic_mask = 15, - in_cb_mask = 48 - }; - - static constexpr size_t data_size = sizeof(entries) + sizeof(values); - }; - -} - -#endif - diff --git a/lib/src/unicode_mappings.cpp b/lib/src/unicode_mappings.cpp index 43ecf69..ca5bf6a 100644 --- a/lib/src/unicode_mappings.cpp +++ b/lib/src/unicode_mappings.cpp @@ -13,7 +13,7 @@ namespace sysstr::util::unicode { - const char_lookup case_fold_data::source_chars[] = { + const std::array case_fold_mapper::source_chars = {{ {0 ,U'\u0041'}, {1 ,U'\u0042'}, {2 ,U'\u0043'}, {3 ,U'\u0044'}, {4 ,U'\u0045'}, {5 ,U'\u0046'}, {6 ,U'\u0047'}, {7 ,U'\u0048'}, {8 ,U'\u0049'}, {9 ,U'\u004A'}, {10 ,U'\u004B'}, {11 ,U'\u004C'}, {12 ,U'\u004D'}, {13 ,U'\u004E'}, {14 ,U'\u004F'}, {15 ,U'\u0050'}, {16 ,U'\u0051'}, {17 ,U'\u0052'}, {18 ,U'\u0053'}, {19 ,U'\u0054'}, {20 ,U'\u0055'}, {21 ,U'\u0056'}, {22 ,U'\u0057'}, {23 ,U'\u0058'}, {24 ,U'\u0059'}, {25 ,U'\u005A'}, {26 ,U'\u00B5'}, {27 ,U'\u00C0'}, {28 ,U'\u00C1'}, {29 ,U'\u00C2'}, {30 ,U'\u00C3'}, {31 ,U'\u00C4'}, {32 ,U'\u00C5'}, {33 ,U'\u00C6'}, {34 ,U'\u00C7'}, {35 ,U'\u00C8'}, {36 ,U'\u00C9'}, {37 ,U'\u00CA'}, {38 ,U'\u00CB'}, {39 ,U'\u00CC'}, {40 ,U'\u00CD'}, {41 ,U'\u00CE'}, {42 ,U'\u00CF'}, {43 ,U'\u00D0'}, {44 ,U'\u00D1'}, {45 ,U'\u00D2'}, {46 ,U'\u00D3'}, {47 ,U'\u00D4'}, @@ -112,9 +112,9 @@ namespace sysstr::util::unicode {1885 ,U'\U00016E5D'}, {1887 ,U'\U00016E5E'}, {1889 ,U'\U00016E5F'}, {1891 ,U'\U0001E900'}, {1893 ,U'\U0001E901'}, {1895 ,U'\U0001E902'}, {1897 ,U'\U0001E903'}, {1899 ,U'\U0001E904'}, {1901 ,U'\U0001E905'}, {1903 ,U'\U0001E906'}, {1905 ,U'\U0001E907'}, {1907 ,U'\U0001E908'}, {1909 ,U'\U0001E909'}, {1911 ,U'\U0001E90A'}, {1913 ,U'\U0001E90B'}, {1915 ,U'\U0001E90C'}, {1917 ,U'\U0001E90D'}, {1919 ,U'\U0001E90E'}, {1921 ,U'\U0001E90F'}, {1923 ,U'\U0001E910'}, {1925 ,U'\U0001E911'}, {1927 ,U'\U0001E912'}, {1929 ,U'\U0001E913'}, {1931 ,U'\U0001E914'}, {1933 ,U'\U0001E915'}, {1935 ,U'\U0001E916'}, {1937 ,U'\U0001E917'}, {1939 ,U'\U0001E918'}, {1941 ,U'\U0001E919'}, {1943 ,U'\U0001E91A'}, {1945 ,U'\U0001E91B'}, {1947 ,U'\U0001E91C'}, {1949 ,U'\U0001E91D'}, {1951 ,U'\U0001E91E'}, {1953 ,U'\U0001E91F'}, {1955 ,U'\U0001E920'}, {1957 ,U'\U0001E921'}, {1959 , 0} - }; + }}; - const char16_t case_fold_data::chars[] = + const char16_t case_fold_mapper::mapped_chars[] = u"\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068\u0069\u006A\u006B\u006C\u006D\u006E\u006F\u0070" u"\u0071\u0072\u0073\u0074\u0075\u0076\u0077\u0078\u0079\u007A\u03BC\u00E0\u00E1\u00E2\u00E3\u00E4" u"\u00E5\u00E6\u00E7\u00E8\u00E9\u00EA\u00EB\u00EC\u00ED\u00EE\u00EF\u00F0\u00F1\u00F2\u00F3\u00F4" @@ -222,8 +222,7 @@ namespace sysstr::util::unicode u"\U0001E937\U0001E938\U0001E939\U0001E93A\U0001E93B\U0001E93C\U0001E93D\U0001E93E\U0001E93F\U0001E940\U0001E941\U0001E942\U0001E943" ; - - const char_lookup to_lower_case_data::source_chars[] = { + const std::array to_lower_case_mapper::source_chars = {{ {0 ,U'\u0041'}, {1 ,U'\u0042'}, {2 ,U'\u0043'}, {3 ,U'\u0044'}, {4 ,U'\u0045'}, {5 ,U'\u0046'}, {6 ,U'\u0047'}, {7 ,U'\u0048'}, {8 ,U'\u0049'}, {9 ,U'\u004A'}, {10 ,U'\u004B'}, {11 ,U'\u004C'}, {12 ,U'\u004D'}, {13 ,U'\u004E'}, {14 ,U'\u004F'}, {15 ,U'\u0050'}, {16 ,U'\u0051'}, {17 ,U'\u0052'}, {18 ,U'\u0053'}, {19 ,U'\u0054'}, {20 ,U'\u0055'}, {21 ,U'\u0056'}, {22 ,U'\u0057'}, {23 ,U'\u0058'}, {24 ,U'\u0059'}, {25 ,U'\u005A'}, {26 ,U'\u00C0'}, {27 ,U'\u00C1'}, {28 ,U'\u00C2'}, {29 ,U'\u00C3'}, {30 ,U'\u00C4'}, {31 ,U'\u00C5'}, {32 ,U'\u00C6'}, {33 ,U'\u00C7'}, {34 ,U'\u00C8'}, {35 ,U'\u00C9'}, {36 ,U'\u00CA'}, {37 ,U'\u00CB'}, {38 ,U'\u00CC'}, {39 ,U'\u00CD'}, {40 ,U'\u00CE'}, {41 ,U'\u00CF'}, {42 ,U'\u00D0'}, {43 ,U'\u00D1'}, {44 ,U'\u00D2'}, {45 ,U'\u00D3'}, {46 ,U'\u00D4'}, {47 ,U'\u00D5'}, @@ -316,9 +315,9 @@ namespace sysstr::util::unicode {1671 ,U'\U00016E5E'}, {1673 ,U'\U00016E5F'}, {1675 ,U'\U0001E900'}, {1677 ,U'\U0001E901'}, {1679 ,U'\U0001E902'}, {1681 ,U'\U0001E903'}, {1683 ,U'\U0001E904'}, {1685 ,U'\U0001E905'}, {1687 ,U'\U0001E906'}, {1689 ,U'\U0001E907'}, {1691 ,U'\U0001E908'}, {1693 ,U'\U0001E909'}, {1695 ,U'\U0001E90A'}, {1697 ,U'\U0001E90B'}, {1699 ,U'\U0001E90C'}, {1701 ,U'\U0001E90D'}, {1703 ,U'\U0001E90E'}, {1705 ,U'\U0001E90F'}, {1707 ,U'\U0001E910'}, {1709 ,U'\U0001E911'}, {1711 ,U'\U0001E912'}, {1713 ,U'\U0001E913'}, {1715 ,U'\U0001E914'}, {1717 ,U'\U0001E915'}, {1719 ,U'\U0001E916'}, {1721 ,U'\U0001E917'}, {1723 ,U'\U0001E918'}, {1725 ,U'\U0001E919'}, {1727 ,U'\U0001E91A'}, {1729 ,U'\U0001E91B'}, {1731 ,U'\U0001E91C'}, {1733 ,U'\U0001E91D'}, {1735 ,U'\U0001E91E'}, {1737 ,U'\U0001E91F'}, {1739 ,U'\U0001E920'}, {1741 ,U'\U0001E921'}, {1743 , 0} - }; + }}; - const char16_t to_lower_case_data::chars[] = + const char16_t to_lower_case_mapper::mapped_chars[] = u"\u0061\u0062\u0063\u0064\u0065\u0066\u0067\u0068\u0069\u006A\u006B\u006C\u006D\u006E\u006F\u0070" u"\u0071\u0072\u0073\u0074\u0075\u0076\u0077\u0078\u0079\u007A\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5" u"\u00E6\u00E7\u00E8\u00E9\u00EA\u00EB\u00EC\u00ED\u00EE\u00EF\u00F0\u00F1\u00F2\u00F3\u00F4\u00F5" @@ -413,8 +412,7 @@ namespace sysstr::util::unicode u"\U0001E93F\U0001E940\U0001E941\U0001E942\U0001E943" ; - - const char_lookup to_upper_case_data::source_chars[] = { + const std::array to_upper_case_mapper::source_chars = {{ {0 ,U'\u0061'}, {1 ,U'\u0062'}, {2 ,U'\u0063'}, {3 ,U'\u0064'}, {4 ,U'\u0065'}, {5 ,U'\u0066'}, {6 ,U'\u0067'}, {7 ,U'\u0068'}, {8 ,U'\u0069'}, {9 ,U'\u006A'}, {10 ,U'\u006B'}, {11 ,U'\u006C'}, {12 ,U'\u006D'}, {13 ,U'\u006E'}, {14 ,U'\u006F'}, {15 ,U'\u0070'}, {16 ,U'\u0071'}, {17 ,U'\u0072'}, {18 ,U'\u0073'}, {19 ,U'\u0074'}, {20 ,U'\u0075'}, {21 ,U'\u0076'}, {22 ,U'\u0077'}, {23 ,U'\u0078'}, {24 ,U'\u0079'}, {25 ,U'\u007A'}, {26 ,U'\u00B5'}, {27 ,U'\u00DF'}, {29 ,U'\u00E0'}, {30 ,U'\u00E1'}, {31 ,U'\u00E2'}, {32 ,U'\u00E3'}, {33 ,U'\u00E4'}, {34 ,U'\u00E5'}, {35 ,U'\u00E6'}, {36 ,U'\u00E7'}, {37 ,U'\u00E8'}, {38 ,U'\u00E9'}, {39 ,U'\u00EA'}, {40 ,U'\u00EB'}, {41 ,U'\u00EC'}, {42 ,U'\u00ED'}, {43 ,U'\u00EE'}, {44 ,U'\u00EF'}, {45 ,U'\u00F0'}, {46 ,U'\u00F1'}, {47 ,U'\u00F2'}, {48 ,U'\u00F3'}, @@ -512,9 +510,9 @@ namespace sysstr::util::unicode {1856 ,U'\U00016E72'}, {1858 ,U'\U00016E73'}, {1860 ,U'\U00016E74'}, {1862 ,U'\U00016E75'}, {1864 ,U'\U00016E76'}, {1866 ,U'\U00016E77'}, {1868 ,U'\U00016E78'}, {1870 ,U'\U00016E79'}, {1872 ,U'\U00016E7A'}, {1874 ,U'\U00016E7B'}, {1876 ,U'\U00016E7C'}, {1878 ,U'\U00016E7D'}, {1880 ,U'\U00016E7E'}, {1882 ,U'\U00016E7F'}, {1884 ,U'\U0001E922'}, {1886 ,U'\U0001E923'}, {1888 ,U'\U0001E924'}, {1890 ,U'\U0001E925'}, {1892 ,U'\U0001E926'}, {1894 ,U'\U0001E927'}, {1896 ,U'\U0001E928'}, {1898 ,U'\U0001E929'}, {1900 ,U'\U0001E92A'}, {1902 ,U'\U0001E92B'}, {1904 ,U'\U0001E92C'}, {1906 ,U'\U0001E92D'}, {1908 ,U'\U0001E92E'}, {1910 ,U'\U0001E92F'}, {1912 ,U'\U0001E930'}, {1914 ,U'\U0001E931'}, {1916 ,U'\U0001E932'}, {1918 ,U'\U0001E933'}, {1920 ,U'\U0001E934'}, {1922 ,U'\U0001E935'}, {1924 ,U'\U0001E936'}, {1926 ,U'\U0001E937'}, {1928 ,U'\U0001E938'}, {1930 ,U'\U0001E939'}, {1932 ,U'\U0001E93A'}, {1934 ,U'\U0001E93B'}, {1936 ,U'\U0001E93C'}, {1938 ,U'\U0001E93D'}, {1940 ,U'\U0001E93E'}, {1942 ,U'\U0001E93F'}, {1944 ,U'\U0001E940'}, {1946 ,U'\U0001E941'}, {1948 ,U'\U0001E942'}, {1950 ,U'\U0001E943'}, {1952 , 0} - }; + }}; - const char16_t to_upper_case_data::chars[] = + const char16_t to_upper_case_mapper::mapped_chars[] = u"\u0041\u0042\u0043\u0044\u0045\u0046\u0047\u0048\u0049\u004A\u004B\u004C\u004D\u004E\u004F\u0050" u"\u0051\u0052\u0053\u0054\u0055\u0056\u0057\u0058\u0059\u005A\u039C\u0053\u0053\u00C0\u00C1\u00C2" u"\u00C3\u00C4\u00C5\u00C6\u00C7\u00C8\u00C9\u00CA\u00CB\u00CC\u00CD\u00CE\u00CF\u00D0\u00D1\u00D2" @@ -622,12 +620,11 @@ namespace sysstr::util::unicode u"\U0001E91C\U0001E91D\U0001E91E\U0001E91F\U0001E920\U0001E921" ; - - extern const char16_t whitespaces[] = + const char16_t is_whitespace::chars[] = u"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006" u"\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"; - const std::array case_prop_lookup::entries = {{ + const std::array case_prop::entries = {{ {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, {{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}, {{3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}}, {{5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, {{6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0}}, {{9, 10, 11, 12, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 14}}, {{15, 16, 0, 17, 0, 0, 18, 0, 0, 0, 19, 20, 21, 22, 23, 24}}, {{25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, @@ -762,11 +759,11 @@ namespace sysstr::util::unicode {{2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}} }}; - const std::array case_prop_lookup::values = {{ + const std::array case_prop::values = {{ 0, 2, 1, 3 }}; - const std::array grapheme_cluster_break_lookup::entries = {{ + const std::array grapheme_cluster_break_prop::entries = {{ {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}, {{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}, {{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}, {{3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}}, {{4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}}, {{5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}}, {{6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}}, {{7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}}, {{8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}}, @@ -919,18 +916,18 @@ namespace sysstr::util::unicode {{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2}}, {{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2}} }}; - const std::array grapheme_cluster_break_lookup::values = {{ + const std::array grapheme_cluster_break_prop::values = {{ 0, 1, 11, 34, 4, 5, 16, 50, 6, 7, 8, 2, 32, 9, 10, 3 }}; constexpr auto total_data_size = - sizeof(case_fold_data::source_chars) + sizeof(case_fold_data::chars) + - sizeof(to_lower_case_data::source_chars) + sizeof(to_lower_case_data::chars) + - sizeof(to_upper_case_data::source_chars) + sizeof(to_upper_case_data::chars) + - sizeof(whitespaces) + - case_prop_lookup::data_size + - grapheme_cluster_break_lookup::data_size; + case_fold_mapper::data_size + + to_lower_case_mapper::data_size + + to_upper_case_mapper::data_size + + is_whitespace::data_size + + case_prop::data_size + + grapheme_cluster_break_prop::data_size; static_assert(total_data_size == 54954); } diff --git a/unicode/scripts/genmappings.py b/unicode/scripts/genmappings.py index 680f052..b41a350 100644 --- a/unicode/scripts/genmappings.py +++ b/unicode/scripts/genmappings.py @@ -12,10 +12,12 @@ from mapping_builder import mapping_builder from table_builder import table_builder from trie_builder import trie_builder +from lookup_builder import lookup_builder + +PROP_COMPRESSION = 4 parser = argparse.ArgumentParser() -parser.add_argument('-c', '--comp', type=int, dest='compression', choices=range(0, 8), default=4) parser.add_argument('datadir') parser.add_argument('cppfile') parser.add_argument('hfile') @@ -28,21 +30,24 @@ hfile = Path(args.hfile) testfile = Path(args.testfile) -if args.compression != 0: - trie_builder.set_bits_per_fanout(8 - args.compression) +if PROP_COMPRESSION != 0: + trie_builder.set_bits_per_fanout(8 - PROP_COMPRESSION) + +def get_prop_builder_class(): + if PROP_COMPRESSION == 0: + return table_builder + return trie_builder def make_prop_builder(separate_values=False): - if args.compression == 0: + if PROP_COMPRESSION == 0: return table_builder(separate_values) return trie_builder() -total_data_size = 0 - folding_builder = mapping_builder() uppercase_builder = mapping_builder() lowercase_builder = mapping_builder() -whitespaces = [] +whitespaces = lookup_builder() case_prop_values = { 'Cased': (0b01, 'cased'), @@ -51,7 +56,7 @@ def make_prop_builder(separate_values=False): case_prop_builder = make_prop_builder() -grapheme_cluster_break_prop_values = { +grapheme_cluster_break_prop_prop_values = { 'Control': (1, 'control'), 'Extend': (2, 'extend'), 'Regional_Indicator': (3, 'regional_indicator'), @@ -79,7 +84,7 @@ def make_prop_builder(separate_values=False): 'in_cb_mask': 0x30 } -grapheme_cluster_break_prop_builder = make_prop_builder(separate_values=True) +grapheme_cluster_break_prop_prop_builder = make_prop_builder(separate_values=True) def parse_case_info(line): fields = line.split(';') @@ -118,8 +123,7 @@ def parse_properties(line): prop = prop.strip() if prop == 'White_Space': start, end = parse_char_range(char_range) - for char in range(start, end): - whitespaces.append(char) + whitespaces.add_chars(start, end) def parse_derived_properties(line): (char_range, props) = line[:line.index('# ')].split('; ', 1) @@ -132,17 +136,17 @@ def parse_derived_properties(line): case_prop_builder.add_chars(start, end, prop_val[0]) elif (prop_val := grapheme_related_prop_values.get(props)) is not None: start, end = parse_char_range(char_range) - grapheme_cluster_break_prop_builder.add_chars(start, end, prop_val[0]) + grapheme_cluster_break_prop_prop_builder.add_chars(start, end, prop_val[0]) -def parse_grapheme_cluster_break_properties(line): +def parse_grapheme_cluster_break_prop_properties(line): (char_range, prop) = line[:line.index('# ')].split('; ') char_range = char_range.strip() prop = prop.strip() - prop_val = grapheme_cluster_break_prop_values.get(prop) + prop_val = grapheme_cluster_break_prop_prop_values.get(prop) if not prop_val is None: start, end = parse_char_range(char_range) - grapheme_cluster_break_prop_builder.add_chars(start, end, prop_val[0]) + grapheme_cluster_break_prop_prop_builder.add_chars(start, end, prop_val[0]) def parse_emoji_data(line): (char_range, prop) = line[:line.index('# ')].split('; ') @@ -151,7 +155,7 @@ def parse_emoji_data(line): prop_val = grapheme_related_emoji_values.get(prop) if not prop_val is None: start, end = parse_char_range(char_range) - grapheme_cluster_break_prop_builder.add_chars(start, end, prop_val[0]) + grapheme_cluster_break_prop_prop_builder.add_chars(start, end, prop_val[0]) grapheme_tests = [] def parse_grapheme_tests(line): @@ -199,18 +203,6 @@ def print_enum(mappings, masks={}): ret += f'{name} = {val}' return ret -def make_whitespaces(): - global total_data_size - ret = '' - char_count = 0 - for char in whitespaces: - ret += char_name(char) - char_count += 1 - total_data_size += 2 if char < 0x10000 else 4 - if char_count > 0 and char_count % 16 == 0: - ret += '"\n u"' - total_data_size += 2 - return ret def make_grapheme_tests(): ret ='' @@ -229,15 +221,17 @@ def make_grapheme_tests(): read_ucd_file(datadir/'SpecialCasing.txt', parse_special_casing) read_ucd_file(datadir/'PropList.txt', parse_properties) read_ucd_file(datadir/'DerivedCoreProperties.txt', parse_derived_properties) -read_ucd_file(datadir/'GraphemeBreakProperty.txt', parse_grapheme_cluster_break_properties) +read_ucd_file(datadir/'GraphemeBreakProperty.txt', parse_grapheme_cluster_break_prop_properties) read_ucd_file(datadir/'emoji-data.txt', parse_emoji_data) read_ucd_file(datadir/'GraphemeBreakTest.txt', parse_grapheme_tests) +total_data_size = 0 total_data_size += folding_builder.generate() total_data_size += uppercase_builder.generate() total_data_size += lowercase_builder.generate() +total_data_size += whitespaces.generate() total_data_size += case_prop_builder.generate() -total_data_size += grapheme_cluster_break_prop_builder.generate() +total_data_size += grapheme_cluster_break_prop_prop_builder.generate() write_file(hfile, f'''//THIS FILE IS GENERATED. PLEASE DO NOT EDIT DIRECTLY @@ -248,37 +242,37 @@ def make_grapheme_tests(): // license that can be found in the LICENSE file or at // https://github.com/gershnik/sys_string/blob/master/LICENSE // -#ifndef HEADER_SYS_STRING_UNICODE_MAPPINGS_DATA_H_INCLUDED -#define HEADER_SYS_STRING_UNICODE_MAPPINGS_DATA_H_INCLUDED - +#ifndef HEADER_SYS_STRING_UNICODE_MAPPINGS_H_INCLUDED +#define HEADER_SYS_STRING_UNICODE_MAPPINGS_H_INCLUDED + +#include + +#include +#include +#include #include #include namespace sysstr::util::unicode {{ - struct mapper_data - {{ - static constexpr size_t max_mapped_length = {mapping_builder.max_len}; - }}; + {indent_insert(mapping_builder.print_common_header(), 4)} + + {indent_insert(folding_builder.print_header('case_fold_mapper'), 4)} - struct char_lookup - {{ - char32_t offset:11; - char32_t value:21; - }}; + {indent_insert(uppercase_builder.print_header('to_lower_case_mapper'), 4)} - {indent_insert(folding_builder.print_header('case_fold_data'), 4)} + {indent_insert(lowercase_builder.print_header('to_upper_case_mapper'), 4)} - {indent_insert(uppercase_builder.print_header('to_lower_case_data'), 4)} + {indent_insert(lookup_builder.print_common_header(), 4)} - {indent_insert(lowercase_builder.print_header('to_upper_case_data'), 4)} + {indent_insert(whitespaces.print_header('is_whitespace'), 4)} - {indent_insert(case_prop_builder.__class__.print_common_header(), 4)} + {indent_insert(get_prop_builder_class().print_common_header(), 4)} - {indent_insert(case_prop_builder.print_header("case_prop_lookup", print_enum(case_prop_values)), 4)} + {indent_insert(case_prop_builder.print_header("case_prop", print_enum(case_prop_values)), 4)} - {indent_insert(grapheme_cluster_break_prop_builder.print_header("grapheme_cluster_break_lookup", - print_enum((grapheme_cluster_break_prop_values, grapheme_related_emoji_values, grapheme_related_prop_values), + {indent_insert(grapheme_cluster_break_prop_prop_builder.print_header("grapheme_cluster_break_prop", + print_enum((grapheme_cluster_break_prop_prop_values, grapheme_related_emoji_values, grapheme_related_prop_values), grapheme_masks)), 4)} }} @@ -303,25 +297,20 @@ def make_grapheme_tests(): namespace sysstr::util::unicode {{ - {indent_insert(folding_builder.print_impl('case_fold_data'), 4)} - - {indent_insert(uppercase_builder.print_impl('to_lower_case_data'), 4)} - - {indent_insert(lowercase_builder.print_impl('to_upper_case_data'), 4)} - - extern const char16_t whitespaces[] = - u"{make_whitespaces()}"; - - {indent_insert(case_prop_builder.print_impl("case_prop_lookup"), 4)} - {indent_insert(grapheme_cluster_break_prop_builder.print_impl("grapheme_cluster_break_lookup"), 4)} + {indent_insert(folding_builder.print_impl('case_fold_mapper'), 4)} + {indent_insert(uppercase_builder.print_impl('to_lower_case_mapper'), 4)} + {indent_insert(lowercase_builder.print_impl('to_upper_case_mapper'), 4)} + {indent_insert(whitespaces.print_impl('is_whitespace'), 4)} + {indent_insert(case_prop_builder.print_impl("case_prop"), 4)} + {indent_insert(grapheme_cluster_break_prop_prop_builder.print_impl("grapheme_cluster_break_prop"), 4)} constexpr auto total_data_size = - sizeof(case_fold_data::source_chars) + sizeof(case_fold_data::chars) + - sizeof(to_lower_case_data::source_chars) + sizeof(to_lower_case_data::chars) + - sizeof(to_upper_case_data::source_chars) + sizeof(to_upper_case_data::chars) + - sizeof(whitespaces) + - case_prop_lookup::data_size + - grapheme_cluster_break_lookup::data_size; + case_fold_mapper::data_size + + to_lower_case_mapper::data_size + + to_upper_case_mapper::data_size + + is_whitespace::data_size + + case_prop::data_size + + grapheme_cluster_break_prop::data_size; static_assert(total_data_size == {total_data_size}); }} diff --git a/unicode/scripts/lookup_builder.py b/unicode/scripts/lookup_builder.py new file mode 100644 index 0000000..420e2e8 --- /dev/null +++ b/unicode/scripts/lookup_builder.py @@ -0,0 +1,81 @@ +# +# Copyright 2020 Eugene Gershnik +# +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file or at +# https://github.com/gershnik/sys_string/blob/master/LICENSE +# + +from textwrap import dedent +from common import char_name, indent_insert + +class lookup_builder: + + def __init__(self): + self.chars = [] + self.max_char = 0 + self.length = 0 + + def add_chars(self, start, end): + for char in range(start, end): + self.chars.append(char) + + def generate(self): + for char in self.chars: + self.length += 1 if char < 0x10000 else 2 + self.max_char = max(self.max_char, char) + self.length += 1 + return self.length * 2 + + def __make_content(self): + ret = 'u"' + char_count = 0 + for char in self.chars: + ret += char_name(char) + char_count += 1 + if char_count > 0 and char_count % 16 == 0: + ret += '"\nu"' + ret += '"' + return ret + + @staticmethod + def print_common_header(): + ret = ''' + template + class lookup + { + public: + static bool test(char32_t c) noexcept + { + if (c > Derived::max_char) + return false; + for(auto p = Derived::chars; *p; ++p) + if (*p == c) + return true; + return false; + } + }; + ''' + return dedent(ret) + + def print_header(self, name): + ret = f''' + class {name} : public lookup<{name}> + {{ + friend lookup<{name}>; + private: + static const char16_t chars[{self.length}]; + public: + static constexpr char32_t max_char = U'{char_name(self.max_char)}'; + + static constexpr size_t data_size = sizeof(chars); + }}; + ''' + return dedent(ret) + + def print_impl(self, name): + ret = f''' + const char16_t {name}::chars[] = + {indent_insert(self.__make_content(), 12)}; + ''' + return dedent(ret) diff --git a/unicode/scripts/mapping_builder.py b/unicode/scripts/mapping_builder.py index 4889f48..f22d726 100644 --- a/unicode/scripts/mapping_builder.py +++ b/unicode/scripts/mapping_builder.py @@ -10,14 +10,17 @@ from common import char_name, indent_insert class mapping_builder: - max_len = 1 + def __init__(self) -> None: self.mapping = {} + self.max_mapped_len = 1 + self.total_mapped_len = 0 + self.sorted_keys = None def set_values(self, char, values): self.mapping[char] = values - mapping_builder.max_len = max(mapping_builder.max_len, len(values)) + self.max_mapped_len = max(self.max_mapped_len, len(values)) def generate(self): self.sorted_keys = sorted(self.mapping.keys()) @@ -25,7 +28,12 @@ def generate(self): for values in self.mapping.values(): ret += 4 # for source for value in values: - ret += 2 if value < 0x10000 else 4 + if value < 0x10000: + self.total_mapped_len += 1 + ret += 2 + else: + self.total_mapped_len += 2 + ret += 4 ret += (4 + 2) return ret @@ -62,25 +70,99 @@ def make_values_string(self): ret += '"\nu"' ret += '"' return ret + + @staticmethod + def print_common_header(): + ret = ''' + struct char_lookup + { + char32_t offset:11; + char32_t value:21; + }; + + template + class mapper + { + public: + template + static auto map_char(char32_t src, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt + { + const char_lookup * const lookup_start = Derived::source_chars.data(); + const char_lookup * const lookup_end = lookup_start + Derived::source_chars.size() - 1; + const char16_t * const mapped = Derived::mapped_chars; + + auto lower = std::lower_bound(lookup_start, lookup_end, char_lookup{0, src}, [](char_lookup lhs, char_lookup rhs) { + return lhs.value < rhs.value; + }); + if (lower == lookup_end || lower->value != src) + return write(src, dest); + auto start = lower->offset; + auto end = (++lower)->offset; //safe - there is one behind end + return write(mapped + start, mapped + end, dest); + } + private: + template + static auto write(char32_t c, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt + { + if constexpr (Enc == utf32) + { + *dest++ = c; + return dest; + } + else + { + utf_codepoint_encoder encoder; + encoder.put(c); + return std::copy(encoder.begin(), encoder.end(), dest); + } + } + + template + static auto write(const char16_t * begin, const char16_t * end, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of())) -> OutIt + { + if constexpr (Enc == utf16) + { + return std::copy(begin ,end, dest); + } + else + { + utf_codepoint_decoder decoder; + while(begin != end) + { + decoder.put(*begin++); + if (!decoder.done()) + decoder.put(*begin++); //no need to bounds check, we know end is good + dest = write(decoder.value(), dest); + } + return dest; + } + } + }; + ''' + return dedent(ret) def print_header(self, name): ret = f''' - struct {name} + class {name} : public mapper<{name}> {{ - static const char_lookup source_chars[]; - static constexpr size_t source_chars_len = {len(self.mapping)}; - static const char16_t chars[]; + friend mapper<{name}>; + private: + static const std::array source_chars; + static const char16_t mapped_chars[{self.total_mapped_len + 1}]; + public: + static constexpr size_t max_mapped_length = {self.max_mapped_len}; + static constexpr size_t data_size = sizeof(source_chars) + sizeof(mapped_chars); }}; ''' return dedent(ret) def print_impl(self, name): ret = f''' - const char_lookup {name}::source_chars[] = {{ + const std::array {name}::source_chars = {{{{ {indent_insert(self.make_source_chars(), 16)} - }}; + }}}}; - const char16_t {name}::chars[] = + const char16_t {name}::mapped_chars[] = {indent_insert(self.make_values_string(), 16)} ; '''