From 8bae05c46f5dd03cf645a5bba806362de306c7b0 Mon Sep 17 00:00:00 2001 From: actboy168 Date: Wed, 10 Apr 2024 23:34:17 +0800 Subject: [PATCH] support wtf-8 --- 3rd/lua/utf8_crt.c | 131 ++++++++++++++----------- bee/error.cpp | 4 +- bee/filewatch/filewatch_win.cpp | 6 +- bee/net/socket.cpp | 6 +- bee/platform/win/wtf8.h | 31 ++++++ bee/platform/win/wtf8_c.h | 153 ++++++++++++++++++++++++++++++ bee/subprocess/subprocess_win.cpp | 4 +- bee/thread/atomic_semaphore.h | 2 +- bee/thread/setname.cpp | 4 +- binding/lua_filesystem.cpp | 23 +++-- binding/lua_subprocess.cpp | 4 +- binding/port/lua_windows.cpp | 3 +- bootstrap/main.cpp | 17 ++-- compile/common.lua | 1 + compile/lua.lua | 1 + 15 files changed, 301 insertions(+), 89 deletions(-) create mode 100644 bee/platform/win/wtf8.h create mode 100644 bee/platform/win/wtf8_c.h diff --git a/3rd/lua/utf8_crt.c b/3rd/lua/utf8_crt.c index 491decce..143cf8b6 100644 --- a/3rd/lua/utf8_crt.c +++ b/3rd/lua/utf8_crt.c @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -19,50 +20,58 @@ # error Cannot define thread_local #endif -wchar_t* u2w(const char* str) { - int len = 0; - int out_len = 0; - wchar_t* buf = NULL; +struct u2w_result { + wchar_t* wstr; + size_t wlen; +}; + +static struct u2w_result u2w_r(const char* str, size_t len) { + struct u2w_result res = { NULL, 0 }; if (!str) { - return NULL; - } - len = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0); - if (!len) { - return NULL; + return res; } - buf = (wchar_t*)calloc(len, sizeof(wchar_t)); - if (!buf) { - return NULL; + size_t wlen = wtf8_to_utf16_length(str, len); + if (wlen == (size_t)-1) { + return res; } - out_len = MultiByteToWideChar(CP_UTF8, 0, str, -1, buf, len); - if (out_len < 0) { - free(buf); - return NULL; + res.wstr = (wchar_t*)calloc(wlen + 1, sizeof(wchar_t)); + if (!res.wstr) { + return res; } - return buf; + res.wlen = wlen; + wtf8_to_utf16(str, len, res.wstr, res.wlen); + return res; } -char* w2u(const wchar_t* str) { - int len = 0; - int out_len = 0; - char* buf = NULL; +wchar_t* u2w(const char* str) { if (!str) { return NULL; } - len = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL); - if (!len) { + size_t len = strlen(str); // TODO + size_t wlen = wtf8_to_utf16_length(str, len); + if (wlen == (size_t)-1) { + return NULL; + } + wchar_t* wresult = (wchar_t*)calloc(wlen + 1, sizeof(wchar_t)); + if (!wresult) { return NULL; } - buf = (char*)calloc(len, sizeof(char)); - if (!buf) { + wtf8_to_utf16(str, len, wresult, wlen); + return wresult; +} + +char* w2u(const wchar_t* wstr) { + if (!wstr) { return NULL; } - out_len = WideCharToMultiByte(CP_UTF8, 0, str, -1, buf, len, NULL, NULL); - if (out_len < 0) { - free(buf); + size_t wlen = wcslen(wstr); // TODO + size_t len = wtf8_from_utf16_length(wstr, wlen); + char* result = (char*)calloc(len + 1, sizeof(char)); + if (!result) { return NULL; } - return buf; + wtf8_from_utf16(wstr, wlen, result); + return result; } FILE* __cdecl utf8_fopen(const char* filename, const char* mode) { @@ -140,10 +149,8 @@ char* __cdecl utf8_tmpnam(char* buffer) { if (!_wtmpnam(tmp)) { return NULL; } - unsigned long ret = WideCharToMultiByte(CP_UTF8, 0, tmp, -1, buffer, L_tmpnam, NULL, NULL); - if (ret == 0) { - return NULL; - } + size_t wlen = wcslen(tmp); + wtf8_from_utf16(tmp, wlen, buffer); return buffer; } @@ -160,10 +167,20 @@ unsigned long __stdcall utf8_GetModuleFileNameA(void* module, char* filename, un SetLastError(ERROR_NOT_ENOUGH_MEMORY); return 0; } - unsigned long tmplen = GetModuleFileNameW(module, tmp, size); - unsigned long ret = WideCharToMultiByte(CP_UTF8, 0, tmp, tmplen + 1, filename, size, NULL, NULL); + DWORD tmplen = GetModuleFileNameW(module, tmp, size); + if (tmplen == 0) { + free(tmp); + return 0; + } + size_t len = wtf8_from_utf16_length(tmp, tmplen); + if (len > size) { + free(tmp); + SetLastError(ERROR_NOT_ENOUGH_MEMORY); + return 0; + } + wtf8_from_utf16(tmp, tmplen, filename); free(tmp); - return ret - 1; + return (unsigned long)len; } unsigned long __stdcall utf8_FormatMessageA( @@ -180,14 +197,20 @@ unsigned long __stdcall utf8_FormatMessageA( SetLastError(ERROR_NOT_ENOUGH_MEMORY); return 0; } - int res = FormatMessageW(dwFlags, lpSource, dwMessageId, dwLanguageId, tmp, nSize, Arguments); - if (!res) { + DWORD tmplen = FormatMessageW(dwFlags, lpSource, dwMessageId, dwLanguageId, tmp, nSize, Arguments); + if (tmplen == 0) { free(tmp); - return res; + return 0; + } + size_t len = wtf8_from_utf16_length(tmp, tmplen); + if (len > nSize) { + free(tmp); + SetLastError(ERROR_NOT_ENOUGH_MEMORY); + return 0; } - int ret = WideCharToMultiByte(CP_UTF8, 0, tmp, -1, lpBuffer, nSize, NULL, NULL); + wtf8_from_utf16(tmp, tmplen, lpBuffer); free(tmp); - return ret; + return (unsigned long)len; } static void ConsoleWrite(FILE* stream, const char* s, int l) { @@ -196,21 +219,17 @@ static void ConsoleWrite(FILE* stream, const char* s, int l) { fwrite(s, sizeof(char), l, stream); return; } - int wsz = MultiByteToWideChar(CP_UTF8, 0, s, l, NULL, 0); - if (wsz > 0) { - wchar_t* wmsg = (wchar_t*)calloc(wsz, sizeof(wchar_t)); - if (wmsg) { - wsz = MultiByteToWideChar(CP_UTF8, 0, s, l, wmsg, wsz); - if (wsz > 0) { - if (WriteConsoleW(handle, wmsg, wsz, NULL, NULL)) { - free(wmsg); - return; - } - } - free(wmsg); - } - } - fwrite(s, sizeof(char), l, stream); + struct u2w_result r = u2w_r(s, l); + if (!r.wstr) { + fwrite(s, sizeof(char), l, stream); + return; + } + if (!WriteConsoleW(handle, r.wstr, (DWORD)r.wlen, NULL, NULL)) { + free(r.wstr); + fwrite(s, sizeof(char), l, stream); + return; + } + free(r.wstr); } void utf8_ConsoleWrite(const char* s, int l) { diff --git a/bee/error.cpp b/bee/error.cpp index 96d300a6..faf0348d 100644 --- a/bee/error.cpp +++ b/bee/error.cpp @@ -3,7 +3,7 @@ #if defined(_WIN32) # include -# include +# include #else # include #endif @@ -47,7 +47,7 @@ namespace bee { return "Windows"; } std::string message(int error_code) const override { - return win::w2u(error_message(error_code)); + return wtf8::w2u(error_message(error_code)); } std::error_condition default_error_condition(int error_code) const noexcept override { const std::error_condition cond = std::system_category().default_error_condition(error_code); diff --git a/bee/filewatch/filewatch_win.cpp b/bee/filewatch/filewatch_win.cpp index 6d4a1b0a..a0efaed2 100644 --- a/bee/filewatch/filewatch_win.cpp +++ b/bee/filewatch/filewatch_win.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include @@ -189,13 +189,13 @@ namespace bee::filewatch { path = task.path() + path; switch (fni.Action) { case FILE_ACTION_MODIFIED: - m_notify.emplace(notify::flag::modify, win::w2u(path)); + m_notify.emplace(notify::flag::modify, wtf8::w2u(path)); break; case FILE_ACTION_ADDED: case FILE_ACTION_REMOVED: case FILE_ACTION_RENAMED_OLD_NAME: case FILE_ACTION_RENAMED_NEW_NAME: - m_notify.emplace(notify::flag::rename, win::w2u(path)); + m_notify.emplace(notify::flag::rename, wtf8::w2u(path)); break; default: std::unreachable(); diff --git a/bee/net/socket.cpp b/bee/net/socket.cpp index 5328a5f1..8231b4f6 100644 --- a/bee/net/socket.cpp +++ b/bee/net/socket.cpp @@ -3,8 +3,8 @@ # include // clang-format on # include -# include # include +# include # include # include # include @@ -43,7 +43,7 @@ namespace bee::net::socket { static_assert(sizeof(SOCKET) == sizeof(fd_t)); # if defined(_MSC_VER) -# define FILENAME(n) win::u2w(n) +# define FILENAME(n) wtf8::u2w(n) # else # define FILENAME(n) (n) # endif @@ -556,7 +556,7 @@ namespace bee::net::socket { return false; } #if defined(_WIN32) - return win::unlink(win::u2w(path).c_str()); + return win::unlink(wtf8::u2w(path).c_str()); #else return 0 == ::unlink(path.c_str()); #endif diff --git a/bee/platform/win/wtf8.h b/bee/platform/win/wtf8.h new file mode 100644 index 00000000..7ae2b5f9 --- /dev/null +++ b/bee/platform/win/wtf8.h @@ -0,0 +1,31 @@ +#pragma once + +extern "C" { +#include +} +#include + +namespace bee::wtf8 { + inline std::wstring u2w(zstring_view str) noexcept { + if (str.empty()) { + return L""; + } + size_t wlen = wtf8_to_utf16_length(str.data(), str.size()); + if (wlen == (size_t)-1) { + return L""; + } + std::wstring wresult(wlen, L'\0'); + wtf8_to_utf16(str.data(), str.size(), wresult.data(), wlen); + return wresult; + } + + inline std::string w2u(wzstring_view wstr) noexcept { + if (wstr.empty()) { + return ""; + } + size_t len = wtf8_from_utf16_length(wstr.data(), wstr.size()); + std::string result(len, '\0'); + wtf8_from_utf16(wstr.data(), wstr.size(), result.data()); + return result; + } +} diff --git a/bee/platform/win/wtf8_c.h b/bee/platform/win/wtf8_c.h new file mode 100644 index 00000000..a10d8a72 --- /dev/null +++ b/bee/platform/win/wtf8_c.h @@ -0,0 +1,153 @@ +#pragma once + +#include +#include +#include + +inline uint8_t wtf8_decode(const char* input, uint32_t* res) { + uint8_t b1 = input[0]; + if (b1 <= 0x7F) { + *res = b1; + return 1; + } + if (b1 < 0xC2) { + return 0; + } + uint32_t code_point = b1; + uint8_t b2 = input[1]; + if ((b2 & 0xC0) != 0x80) { + return 0; + } + code_point = (code_point << 6) | (b2 & 0x3F); + if (b1 <= 0xDF) { + *res = 0x7FF & code_point; + return 2; + } + + uint8_t b3 = input[2]; + if ((b3 & 0xC0) != 0x80) { + return 0; + } + code_point = (code_point << 6) | (b3 & 0x3F); + if (b1 <= 0xEF) { + *res = 0xFFFF & code_point; + return 3; + } + + uint8_t b4 = input[3]; + if ((b4 & 0xC0) != 0x80) { + return 0; + } + code_point = (code_point << 6) | (b4 & 0x3F); + if (b1 <= 0xF4) { + code_point &= 0x1FFFFF; + if (code_point <= 0x10FFFF) { + *res = code_point; + return 4; + } + } + return 0; +} + +inline size_t wtf8_to_utf16_length(const char* input, size_t length) { + size_t output_len = 0; + uint32_t code_point; + for (size_t i = 0; i < length;) { + uint8_t n = wtf8_decode(&input[i], &code_point); + if (n == 0) { + return (size_t)-1; + } + if (code_point > 0xFFFF) { + output_len += 2; + } + else { + output_len += 1; + } + i += n; + } + return output_len; +} + +inline void wtf8_to_utf16(const char* input, size_t length, wchar_t* output, size_t output_len) { + uint32_t code_point; + for (size_t i = 0; i < length;) { + uint8_t n = wtf8_decode(&input[i], &code_point); + assert(n > 0); + if (code_point > 0x10000) { + assert(code_point < 0x10FFFF); + *output++ = (((code_point - 0x10000) >> 10) + 0xD800); + *output++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00; + output_len -= 2; + } + else { + *output++ = code_point; + output_len -= 1; + } + i += n; + } + (void)output_len; + assert(output_len == 0); +} + +inline uint32_t wtf8_surrogate(const wchar_t* input, bool eof) { + uint32_t u = input[0]; + if (u >= 0xD800 && u <= 0xDBFF && !eof) { + uint32_t next = input[1]; + if (next >= 0xDC00 && next <= 0xDFFF) { + return 0x10000 + ((u - 0xD800) << 10) + (next - 0xDC00); + } + } + return u; +} + +inline size_t wtf8_from_utf16_length(const wchar_t* input, size_t length) { + size_t output_len = 0; + for (size_t i = 0; i < length; ++i) { + uint32_t code_point = wtf8_surrogate(&input[i], length == i + 1); + if (code_point == 0) { + break; + } + if (code_point < 0x80) { + output_len += 1; + } + else if (code_point < 0x800) { + output_len += 2; + } + else if (code_point < 0x10000) { + output_len += 3; + } + else { + output_len += 4; + i++; + } + } + return output_len; +} + +inline void wtf8_from_utf16(const wchar_t* input, size_t length, char* output) { + for (size_t i = 0; i < length; ++i) { + uint32_t code_point = wtf8_surrogate(&input[i], length == i + 1); + if (code_point == 0) { + break; + } + if (code_point < 0x80) { + *output++ = code_point; + } + else if (code_point < 0x800) { + *output++ = 0xC0 | (code_point >> 6); + *output++ = 0x80 | (code_point & 0x3F); + } + else if (code_point < 0x10000) { + *output++ = 0xE0 | (code_point >> 12); + *output++ = 0x80 | ((code_point >> 6) & 0x3F); + *output++ = 0x80 | (code_point & 0x3F); + } + else { + *output++ = 0xF0 | (code_point >> 18); + *output++ = 0x80 | ((code_point >> 12) & 0x3F); + *output++ = 0x80 | ((code_point >> 6) & 0x3F); + *output++ = 0x80 | (code_point & 0x3F); + i++; + } + } +} diff --git a/bee/subprocess/subprocess_win.cpp b/bee/subprocess/subprocess_win.cpp index c51cf757..fc9630df 100644 --- a/bee/subprocess/subprocess_win.cpp +++ b/bee/subprocess/subprocess_win.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include @@ -17,7 +17,7 @@ namespace bee::subprocess { void args_t::push(zstring_view v) noexcept { - data_.emplace_back(win::u2w(v)); + data_.emplace_back(wtf8::u2w(v)); } void args_t::push(const std::wstring& v) noexcept { data_.emplace_back(v); diff --git a/bee/thread/atomic_semaphore.h b/bee/thread/atomic_semaphore.h index 147ac579..c423d4d3 100644 --- a/bee/thread/atomic_semaphore.h +++ b/bee/thread/atomic_semaphore.h @@ -131,7 +131,7 @@ namespace bee { # elif defined(BEE_USE_ULOCK) using value_type = uint64_t; # else - using value_type = uint8_t; + using value_type = uint8_t; # endif public: constexpr explicit atomic_semaphore(const ptrdiff_t desired) noexcept diff --git a/bee/thread/setname.cpp b/bee/thread/setname.cpp index c0cb98b5..2b68ebdc 100644 --- a/bee/thread/setname.cpp +++ b/bee/thread/setname.cpp @@ -2,7 +2,7 @@ #if defined(_WIN32) # include -# include +# include #else # include # if defined(__linux__) @@ -47,7 +47,7 @@ namespace bee { using SetThreadDescriptionProc = HRESULT(WINAPI*)(HANDLE, PCWSTR); if (HMODULE kernel32 = GetModuleHandleW(L"kernel32.dll")) { if (SetThreadDescriptionProc SetThreadDescription = (SetThreadDescriptionProc)GetProcAddress(kernel32, "SetThreadDescription")) { - SetThreadDescription(GetCurrentThread(), win::u2w(name).c_str()); + SetThreadDescription(GetCurrentThread(), wtf8::u2w(name).c_str()); } } # if defined(_MSC_VER) diff --git a/binding/lua_filesystem.cpp b/binding/lua_filesystem.cpp index 9a03a1a2..8c6ffa33 100644 --- a/binding/lua_filesystem.cpp +++ b/binding/lua_filesystem.cpp @@ -14,7 +14,7 @@ #include #if defined(_WIN32) -# include +# include #endif #if defined(__NetBSD__) || defined(__FreeBSD__) || defined(__OpenBSD__) @@ -88,6 +88,15 @@ namespace bee::lua_filesystem { #endif namespace bee::lua_filesystem { + static std::string tostring(const fs::path& path) { +#if defined(_WIN32) + auto wstr = path.generic_wstring(); + return wtf8::w2u(wstr); +#else + return path.generic_string(); +#endif + } + template static std::string_view u8tostrview(const std::basic_string& u8str) { static_assert(sizeof(CharT) == sizeof(char)); @@ -106,12 +115,12 @@ namespace bee::lua_filesystem { } [[nodiscard]] static lua::cxx::status pusherror(lua_State* L, std::string_view op, std::error_code ec, const fs::path& path1) { - lua_pushfmtstring(L, "{}: {}: \"{}\"", op, ec.message(), u8tostrview(path1.generic_u8string())); + lua_pushfmtstring(L, "{}: {}: \"{}\"", op, ec.message(), tostring(path1)); return lua::cxx::error; } [[nodiscard]] static lua::cxx::status pusherror(lua_State* L, std::string_view op, std::error_code ec, const fs::path& path1, const fs::path& path2) { - lua_pushfmtstring(L, "{}: {}: \"{}\", \"{}\"", op, ec.message(), u8tostrview(path1.generic_u8string()), u8tostrview(path2.generic_u8string())); + lua_pushfmtstring(L, "{}: {}: \"{}\", \"{}\"", op, ec.message(), tostring(path1), tostring(path2)); return lua::cxx::error; } @@ -191,7 +200,7 @@ namespace bee::lua_filesystem { private: void conv_val() { #if defined(_WIN32) - new (&val) fs::path { win::u2w(str) }; + new (&val) fs::path { wtf8::u2w(str) }; #else new (&val) fs::path { std::string { str.data(), str.size() } }; #endif @@ -258,8 +267,7 @@ namespace bee::lua_filesystem { static int extension(lua_State* L) { const auto& self = getpath(L, 1); - auto u8str = self.extension().generic_u8string(); - auto str = u8tostrview(u8str); + auto str = tostring(self.extension()); lua_pushlstring(L, str.data(), str.size()); return 1; } @@ -327,8 +335,7 @@ namespace bee::lua_filesystem { static int mt_tostring(lua_State* L) { const auto& self = getpath(L, 1); - auto u8str = self.generic_u8string(); - auto str = u8tostrview(u8str); + auto str = tostring(self); lua_pushlstring(L, str.data(), str.size()); return 1; } diff --git a/binding/lua_subprocess.cpp b/binding/lua_subprocess.cpp index f2be8be7..7473bf1f 100644 --- a/binding/lua_subprocess.cpp +++ b/binding/lua_subprocess.cpp @@ -12,7 +12,7 @@ #if defined(_WIN32) # include -# include +# include #else # include #endif @@ -26,7 +26,7 @@ namespace bee::lua_subprocess { static string_type checkstring(lua_State* L, int idx) { auto str = lua::checkstrview(L, idx); #if defined(_WIN32) - return win::u2w(str); + return wtf8::u2w(str); #else return string_type { str.data(), str.size() }; #endif diff --git a/binding/port/lua_windows.cpp b/binding/port/lua_windows.cpp index b0b75ed4..da07400c 100644 --- a/binding/port/lua_windows.cpp +++ b/binding/port/lua_windows.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -51,7 +52,7 @@ namespace bee::lua_windows { static int write_console(lua_State* L) { luaL_Stream* p = lua::tofile(L, 1); - auto msg = win::u2w(lua::checkstrview(L, 2)); + auto msg = wtf8::u2w(lua::checkstrview(L, 2)); if (!p || !p->closef || !p->f) { lua_pushnil(L); lua_pushstring(L, make_error(std::make_error_code(std::errc::bad_file_descriptor), "write_console").c_str()); diff --git a/bootstrap/main.cpp b/bootstrap/main.cpp index 45dca7cc..8f755112 100644 --- a/bootstrap/main.cpp +++ b/bootstrap/main.cpp @@ -2,6 +2,10 @@ # define _CRT_SECURE_NO_WARNINGS #endif +#if defined(_WIN32) +# include +#endif + #include #include #include @@ -130,12 +134,6 @@ static int pushargs(lua_State *L) { return n; } -template -static std::string_view tostrview(const std::basic_string &u8str) { - static_assert(sizeof(CharT) == sizeof(char)); - return { reinterpret_cast(u8str.data()), u8str.size() }; -} - static fs::path pushprogdir(lua_State *L) { auto r = bee::path_helper::exe_path(); if (!r) { @@ -150,12 +148,13 @@ static void init_cpath(lua_State *L) { auto progdir = pushprogdir(L); #if defined(_WIN32) progdir /= L"?.dll"; + auto wstr = progdir.generic_wstring(); + auto str = bee::wtf8::w2u(wstr); #else progdir /= L"?.so"; + auto str = progdir.generic_string(); #endif - auto str = progdir.generic_u8string(); - auto strview = tostrview(str); - lua_pushlstring(L, strview.data(), strview.size()); + lua_pushlstring(L, str.data(), str.size()); lua_setfield(L, -2, "cpath"); lua_pop(L, 1); } diff --git a/compile/common.lua b/compile/common.lua index 0428b1d0..6005dc91 100644 --- a/compile/common.lua +++ b/compile/common.lua @@ -178,6 +178,7 @@ lm:lua_source "source_bee" { } lm:source_set "source_lua" { + includes = ".", sources = "3rd/lua/utf8_crt.c", } diff --git a/compile/lua.lua b/compile/lua.lua index 6d213d4d..94b35f04 100644 --- a/compile/lua.lua +++ b/compile/lua.lua @@ -13,6 +13,7 @@ lm:lua_dll "bee" { if lm.os == "windows" then lm:source_set "lua54" { + includes = ".", sources = "3rd/lua/utf8_crt.c", } lm:shared_library "lua54" {