Skip to content

Commit

Permalink
Cleaning up unicode file generation
Browse files Browse the repository at this point in the history
  • Loading branch information
gershnik committed Jan 7, 2025
1 parent f674707 commit 4c92b74
Show file tree
Hide file tree
Showing 9 changed files with 480 additions and 345 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ if(${Python3_Interpreter_FOUND})

set(UNICODE_GENERATED_FILES
${CMAKE_CURRENT_LIST_DIR}/lib/src/unicode_mappings.cpp
${CMAKE_CURRENT_LIST_DIR}/lib/inc/sys_string/impl/unicode/mappings_params.h
${CMAKE_CURRENT_LIST_DIR}/lib/inc/sys_string/impl/unicode/mappings.h
${CMAKE_CURRENT_LIST_DIR}/test/test_grapheme_data.h
)

Expand Down
1 change: 0 additions & 1 deletion lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ set(UNICODE_FILES
${SRCDIR}/inc/sys_string/impl/unicode/utf_util.h
${SRCDIR}/inc/sys_string/impl/unicode/algorithms.h
${SRCDIR}/inc/sys_string/impl/unicode/mappings.h
${SRCDIR}/inc/sys_string/impl/unicode/mappings_params.h
)
set(SOURCES
${SRCDIR}/src/unicode_mappings.cpp
Expand Down
10 changes: 5 additions & 5 deletions lib/inc/sys_string/impl/unicode/algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,19 @@ namespace sysstr
{
auto operator()(char32_t c) const noexcept -> bool
{
return util::unicode::isspace(c);
return util::unicode::is_whitespace::test(c);
}
};

template<utf_encoding OutEnc>
struct casefold
{
static constexpr auto max_output_length = util::unicode::mapper::max_mapped_length;
static constexpr auto max_output_length = util::unicode::case_fold_mapper::max_mapped_length;

template<std::output_iterator<utf_char_of<OutEnc>> OutIt>
auto operator()(char32_t c, OutIt dest) const noexcept(noexcept(*dest++ = utf_char_of<OutEnc>())) -> OutIt
{
return util::unicode::mapper::case_fold.map_char<OutEnc>(c, dest);
return util::unicode::case_fold_mapper::map_char<OutEnc>(c, dest);
}
};

Expand Down Expand Up @@ -132,7 +132,7 @@ namespace sysstr
auto c = *first;
if (c != U'\u03A3') // not Σ
{
dest = mapper::to_lower_case.map_char<OutEnc>(c, dest);
dest = to_lower_case_mapper::map_char<OutEnc>(c, dest);
}
else
{
Expand Down Expand Up @@ -161,7 +161,7 @@ namespace sysstr
for( ; first != last; ++first)
{
auto c = *first;
dest = mapper::to_upper_case.map_char<OutEnc>(c, dest);
dest = to_upper_case_mapper::map_char<OutEnc>(c, dest);
}

return dest;
Expand Down
299 changes: 226 additions & 73 deletions lib/inc/sys_string/impl/unicode/mappings.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//THIS FILE IS GENERATED. PLEASE DO NOT EDIT DIRECTLY

//
// Copyright 2020 Eugene Gershnik
//
Expand All @@ -9,105 +11,256 @@
#define HEADER_SYS_STRING_UNICODE_MAPPINGS_H_INCLUDED

#include <sys_string/impl/unicode/utf_encoding.h>

#include <sys_string/impl/unicode/mappings_params.h>


#include <algorithm>
#include <cstdlib>
#include <climits>
#include <array>
#include <cassert>

namespace sysstr::util
namespace sysstr::util::unicode
{
namespace unicode
// One entry of a mapper's lookup table, stored as two bit-fields (11 + 21 bits).
struct char_lookup
{
// Index into the mapper's mapped_chars array at which this character's
// replacement begins; map_char() takes the following entry's offset as the end.
char32_t offset:11;
// Source code point; map_char() searches the table on this field with lower_bound.
char32_t value:21;
};

template<class Derived>
class mapper
{
class mapper : public mapper_data
public:
template<utf_encoding Enc, class OutIt>
static auto map_char(char32_t src, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of<Enc>())) -> OutIt
{
public:
static const mapper case_fold;
static const mapper to_lower_case;
static const mapper to_upper_case;

template<utf_encoding Enc, class OutIt>
auto map_char(char32_t src, OutIt dest) const noexcept(noexcept(*dest++ = utf_char_of<Enc>())) -> OutIt
const char_lookup * const lookup_start = Derived::source_chars.data();
const char_lookup * const lookup_end = lookup_start + Derived::source_chars.size() - 1;
const char16_t * const mapped = Derived::mapped_chars;

auto lower = std::lower_bound(lookup_start, lookup_end, char_lookup{0, src}, [](char_lookup lhs, char_lookup rhs) {
return lhs.value < rhs.value;
});
if (lower == lookup_end || lower->value != src)
return write<Enc>(src, dest);
auto start = lower->offset;
auto end = (++lower)->offset; //safe - there is one behind end
return write<Enc>(mapped + start, mapped + end, dest);
}
private:
template<utf_encoding Enc, class OutIt>
static auto write(char32_t c, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of<Enc>())) -> OutIt
{
if constexpr (Enc == utf32)
{
auto lower = std::lower_bound(m_lookup_start, m_lookup_end, char_lookup{0, src}, [](char_lookup lhs, char_lookup rhs) {
return lhs.value < rhs.value;
});
if (lower == m_lookup_end || lower->value != src)
return write<Enc>(src, dest);
auto start = lower->offset;
auto end = (++lower)->offset; //safe - there is one behind end
return write<Enc>(m_mapped + start, m_mapped + end, dest);
*dest++ = c;
return dest;
}
private:
constexpr mapper(const char_lookup * lookup_start, size_t lookup_len, const char16_t * mapped) noexcept:
m_lookup_start(lookup_start),
m_lookup_end(lookup_start + lookup_len - 1),
m_mapped(mapped)
{}
template<utf_encoding Enc, class OutIt>
auto write(char32_t c, OutIt dest) const noexcept(noexcept(*dest++ = utf_char_of<Enc>())) -> OutIt
else
{
if constexpr (Enc == utf32)
{
*dest++ = c;
return dest;
}
else
{
utf_codepoint_encoder<Enc, false> encoder;
encoder.put(c);
return std::copy(encoder.begin(), encoder.end(), dest);
}
utf_codepoint_encoder<Enc, false> encoder;
encoder.put(c);
return std::copy(encoder.begin(), encoder.end(), dest);
}
template<utf_encoding Enc, class OutIt>
auto write(const char16_t * begin, const char16_t * end, OutIt dest) const noexcept(noexcept(*dest++ = utf_char_of<Enc>())) -> OutIt
}

template<utf_encoding Enc, class OutIt>
static auto write(const char16_t * begin, const char16_t * end, OutIt dest) noexcept(noexcept(*dest++ = utf_char_of<Enc>())) -> OutIt
{
if constexpr (Enc == utf16)
{
if constexpr (Enc == utf16)
{
return std::copy(begin ,end, dest);
}
else
return std::copy(begin ,end, dest);
}
else
{
utf_codepoint_decoder<utf16> decoder;
while(begin != end)
{
utf_codepoint_decoder<utf16> decoder;
while(begin != end)
{
decoder.put(*begin++);
if (!decoder.done())
decoder.put(m_mapped[*begin++]); //no need to bounds check, we know end is good
dest = write<Enc>(decoder.value(), dest);
}
return dest;
decoder.put(*begin++);
if (!decoder.done())
decoder.put(*begin++); //no need to bounds check, we know end is good
dest = write<Enc>(decoder.value(), dest);
}
return dest;
}
private:
const char_lookup * m_lookup_start;
const char_lookup * m_lookup_end;
const char16_t * m_mapped;
};
}
};

const inline mapper mapper::case_fold{case_fold_data::source_chars, case_fold_data::source_chars_len, case_fold_data::chars};
const inline mapper mapper::to_lower_case(to_lower_case_data::source_chars, to_lower_case_data::source_chars_len, to_lower_case_data::chars);
const inline mapper mapper::to_upper_case(to_upper_case_data::source_chars, to_upper_case_data::source_chars_len, to_upper_case_data::chars);

// Table holder for case folding. Derives from mapper<case_fold_mapper> (CRTP) so
// that the inherited static map_char() operates on the tables declared here.
class case_fold_mapper : public mapper<case_fold_mapper>
{
// mapper<> reaches the private tables below through Derived::, hence the friend.
friend mapper<case_fold_mapper>;
private:
// Lookup entries searched by code point. map_char() leaves the last element out
// of the search range and reads it only to obtain the end offset of the entry before it.
static const std::array<char_lookup, 1558> source_chars;
// UTF-16 code units of the replacements, addressed through char_lookup::offset.
// Only declared here; the definition lives outside this header.
static const char16_t mapped_chars[1960];
public:
// Exposed by algorithms.h as casefold::max_output_length.
// NOTE(review): the unit (code points vs. UTF-16 code units) is not visible here - confirm with the generator.
static constexpr size_t max_mapped_length = 3;
// Combined size in bytes of the two tables above.
static constexpr size_t data_size = sizeof(source_chars) + sizeof(mapped_chars);
};


// Table holder for lower-casing. Derives from mapper<to_lower_case_mapper> (CRTP) so
// that the inherited static map_char() operates on the tables declared here.
class to_lower_case_mapper : public mapper<to_lower_case_mapper>
{
// mapper<> reaches the private tables below through Derived::, hence the friend.
friend mapper<to_lower_case_mapper>;
private:
// Lookup entries searched by code point. map_char() leaves the last element out
// of the search range and reads it only to obtain the end offset of the entry before it.
static const std::array<char_lookup, 1461> source_chars;
// UTF-16 code units of the replacements, addressed through char_lookup::offset.
// Only declared here; the definition lives outside this header.
static const char16_t mapped_chars[1744];
public:
// Presumably the longest replacement produced for a single character.
// NOTE(review): the unit (code points vs. UTF-16 code units) is not visible here - confirm with the generator.
static constexpr size_t max_mapped_length = 2;
// Combined size in bytes of the two tables above.
static constexpr size_t data_size = sizeof(source_chars) + sizeof(mapped_chars);
};

inline bool isspace(char32_t c) noexcept

// Table holder for upper-casing. Derives from mapper<to_upper_case_mapper> (CRTP) so
// that the inherited static map_char() operates on the tables declared here.
class to_upper_case_mapper : public mapper<to_upper_case_mapper>
{
// mapper<> reaches the private tables below through Derived::, hence the friend.
friend mapper<to_upper_case_mapper>;
private:
// Lookup entries searched by code point. map_char() leaves the last element out
// of the search range and reads it only to obtain the end offset of the entry before it.
static const std::array<char_lookup, 1553> source_chars;
// UTF-16 code units of the replacements, addressed through char_lookup::offset.
// Only declared here; the definition lives outside this header.
static const char16_t mapped_chars[1953];
public:
// Presumably the longest replacement produced for a single character.
// NOTE(review): the unit (code points vs. UTF-16 code units) is not visible here - confirm with the generator.
static constexpr size_t max_mapped_length = 3;
// Combined size in bytes of the two tables above.
static constexpr size_t data_size = sizeof(source_chars) + sizeof(mapped_chars);
};


template<class Derived>
class lookup
{
public:
static bool test(char32_t c) noexcept
{
extern const char16_t whitespaces[];
if (c > 0xFFFFu)
if (c > Derived::max_char)
return false;
for(auto p = whitespaces; *p; ++p)
for(auto p = Derived::chars; *p; ++p)
if (*p == c)
return true;
return false;
}
};



using case_prop = case_prop_lookup;
using grapheme_cluster_break_prop = grapheme_cluster_break_lookup;
}
}
// Whitespace membership test. Derives from lookup<is_whitespace> so that the
// inherited static test() scans the table declared here.
class is_whitespace : public lookup<is_whitespace>
{
// lookup<> reads the private table below through Derived::, hence the friend.
friend lookup<is_whitespace>;
private:
// Characters that test() reports as matching. test() walks the array until it
// meets a zero element, so the data is expected to end with a 0 terminator.
// Only declared here; the definition lives outside this header.
static const char16_t chars[26];
public:
// Code points above this value are rejected by test() without scanning the table.
static constexpr char32_t max_char = U'\u3000';

// Size in bytes of the table above.
static constexpr size_t data_size = sizeof(chars);
};


// Table-driven property lookup shared by the property classes.
//
// Derived is expected to provide:
//   - static `entries`: an array of rows, each row indexable by a value in 0..15
//   - static `values` : an array of final property values
//   - a type `value`  : the type returned to the caller
template<class Derived>
class prop_lookup
{
public:
    // Returns the property value recorded for code point `c`.
    //
    // The code point is consumed four bits at a time, from bits 20-23 down to
    // bits 0-3 (six steps). Each step picks a slot in the current row and the
    // slot's content becomes the index for the next step. The walk starts at
    // the row whose index equals values.size(). Bits of `c` above bit 23 do
    // not take part in the lookup.
    static auto get(char32_t c) noexcept
    {
        constexpr int bits_per_step = 4;
        constexpr int top_shift = 20;

        size_t node = Derived::values.size();
        for (int shift = top_shift; shift >= 0; shift -= bits_per_step)
        {
            const int slot = (c >> shift) & 0xF;
            node = Derived::entries[node][slot];
        }

        // After the last step the index must designate an element of `values`.
        assert(node < Derived::values.size());
        return typename Derived::value(Derived::values[node]);
    }
};


// Case property lookup. Derives from prop_lookup<case_prop> so that the
// inherited static get() walks the tables declared here.
class case_prop : public prop_lookup<case_prop>
{
// prop_lookup<> reads the private tables below through Derived::, hence the friend.
friend prop_lookup<case_prop>;
private:
// One table row: 16 slots, selected in get() by a 4-bit slice of the code point.
using entry_type = std::array<uint16_t, 16>;
// Storage type of a property value; also the underlying type of `value`.
using value_type = uint8_t;

// Rows followed by get(); each slot holds the index used for the next step.
// Only declared here; the definition lives outside this header.
static const std::array<entry_type, 373> entries;

// Final property values; get() indexes this array with the result of its walk.
static const std::array<value_type, 4> values;

public:
// Type returned by get().
// NOTE(review): `values` has 4 elements while only 3 enumerators are named -
// presumably combined values (e.g. cased | case_ignorable) occur; confirm with the generator.
enum value : value_type
{
none = 0,
cased = 1,
case_ignorable = 2
};

// Combined size in bytes of the two tables above.
static constexpr size_t data_size = sizeof(entries) + sizeof(values);
};


// Grapheme cluster break property lookup. Derives from
// prop_lookup<grapheme_cluster_break_prop> so that the inherited static get()
// walks the tables declared here.
class grapheme_cluster_break_prop : public prop_lookup<grapheme_cluster_break_prop>
{
// prop_lookup<> reads the private tables below through Derived::, hence the friend.
friend prop_lookup<grapheme_cluster_break_prop>;
private:
// One table row: 16 slots, selected in get() by a 4-bit slice of the code point.
using entry_type = std::array<uint16_t, 16>;
// Storage type of a property value; also the underlying type of `value`.
using value_type = uint8_t;

// Rows followed by get(); each slot holds the index used for the next step.
// Only declared here; the definition lives outside this header.
static const std::array<entry_type, 417> entries;

// Final property values; get() indexes this array with the result of its walk.
static const std::array<value_type, 16> values;

public:
// Type returned by get(). The enumerators fall into two groups that occupy
// disjoint bits: none..extended_pictographic (0-11) fit in the low four bits
// selected by basic_mask, while in_cb_* (16, 32, 48) use bits 4-5 selected by in_cb_mask.
// NOTE(review): presumably a returned value can carry one item of each group
// at once (Grapheme_Cluster_Break plus Indic_Conjunct_Break) - confirm with the generator.
enum value : value_type
{
none = 0,
control = 1,
extend = 2,
regional_indicator = 3,
prepend = 4,
spacing_mark = 5,
hangul_l = 6,
hangul_v = 7,
hangul_t = 8,
hangul_lv = 9,
hangul_lvt = 10,
extended_pictographic = 11,
in_cb_consonant = 16,
in_cb_extend = 32,
in_cb_linker = 48,

// Mask selecting the low four bits (values 0-15).
basic_mask = 15,
// Mask selecting bits 4-5 (the in_cb_* group).
in_cb_mask = 48
};

// Combined size in bytes of the two tables above.
static constexpr size_t data_size = sizeof(entries) + sizeof(values);
};

}

#endif

Loading

0 comments on commit 4c92b74

Please sign in to comment.