From 45efcc1743cc167df4d86a5e9cda7de543952e23 Mon Sep 17 00:00:00 2001 From: Alex Alabuzhev Date: Sun, 6 Jun 2021 21:02:32 +0100 Subject: [PATCH] Basic support for UTF-16 surrogates --- far/changelog | 5 +++ far/console.cpp | 97 +++++++++++++++++++++++++++++++++++++----------- far/edit.cpp | 30 ++++++++++++--- far/edit.hpp | 2 + far/editor.cpp | 57 ++++++++++++++-------------- far/encoding.cpp | 53 +++++++++++++++----------- far/encoding.hpp | 6 ++- far/interf.cpp | 13 +++++++ far/interf.hpp | 3 ++ far/scrbuf.cpp | 37 ++++++++++-------- far/vbuild.m4 | 2 +- 11 files changed, 211 insertions(+), 94 deletions(-) diff --git a/far/changelog b/far/changelog index da30e6f9b9..d49d3c5956 100644 --- a/far/changelog +++ b/far/changelog @@ -1,3 +1,8 @@ +-------------------------------------------------------------------------------- +drkns 06.06.2021 21:01:06 +0100 - build 5814 + +1. Basic support for UTF-16 surrogates. + -------------------------------------------------------------------------------- drkns 05.06.2021 23:26:24 +0100 - build 5813 diff --git a/far/console.cpp b/far/console.cpp index 6fa7a78839..040866958b 100644 --- a/far/console.cpp +++ b/far/console.cpp @@ -69,6 +69,7 @@ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static bool sWindowMode; static bool sEnableVirtualTerminal; +constexpr auto bad_char_replacement = L' '; wchar_t ReplaceControlCharacter(wchar_t const Char) { @@ -149,18 +150,18 @@ wchar_t ReplaceControlCharacter(wchar_t const Char) } } -static void sanitise_dbsc_pair(FAR_CHAR_INFO& First, FAR_CHAR_INFO& Second) +static bool sanitise_dbsc_pair(FAR_CHAR_INFO& First, FAR_CHAR_INFO& Second) { - if (!(First.Attributes.Flags & COMMON_LVB_LEADING_BYTE) && !(Second.Attributes.Flags & COMMON_LVB_TRAILING_BYTE)) - { - // Not DBSC, awesome - return; - } - const auto IsFirst = flags::check_any(First.Attributes.Flags, COMMON_LVB_LEADING_BYTE), IsSecond = flags::check_any(Second.Attributes.Flags, COMMON_LVB_TRAILING_BYTE); + if (!IsFirst && !IsSecond) + { + // Not DBSC, awesome + return false; + } + flags::clear(First.Attributes.Flags, COMMON_LVB_LEADING_BYTE); flags::clear(Second.Attributes.Flags, COMMON_LVB_TRAILING_BYTE); @@ -170,14 +171,48 @@ static void sanitise_dbsc_pair(FAR_CHAR_INFO& First, FAR_CHAR_INFO& Second) flags::set(First.Attributes.Flags, COMMON_LVB_LEADING_BYTE); flags::set(Second.Attributes.Flags, COMMON_LVB_TRAILING_BYTE); - return; + return false; } if (IsFirst) - First.Char = L' '; + First.Char = bad_char_replacement; if (IsSecond) - Second.Char = L' '; + Second.Char = bad_char_replacement; + + return true; +} + +static bool sanitise_surrogate_pair(FAR_CHAR_INFO& First, FAR_CHAR_INFO& Second) +{ + const auto + IsFirst = encoding::utf16::is_high_surrogate(First.Char), + IsSecond = encoding::utf16::is_low_surrogate(Second.Char); + + if (!IsFirst && !IsSecond) + { + // Not surrogate, awesome + return false; + } + + if (encoding::utf16::is_valid_surrogate_pair(First.Char, Second.Char) && First.Attributes == Second.Attributes) + { + // Valid surrogate, awesome + return false; + } + + if (IsFirst) + First.Char = bad_char_replacement; + + if (IsSecond) + Second.Char = bad_char_replacement; + + return true; +} + +void sanitise_pair(FAR_CHAR_INFO& First, FAR_CHAR_INFO& Second) +{ + sanitise_dbsc_pair(First, Second) || sanitise_surrogate_pair(First, Second); } static COORD make_coord(point const& Point) @@ -804,14 +839,14 @@ namespace console_detail { if (n != Input.size() - 1) { - sanitise_dbsc_pair(Cell, Input[n + 1]); + sanitise_pair(Cell, Input[n + 1]); } if (Cell.Attributes.Flags & COMMON_LVB_TRAILING_BYTE) { if (!LeadingChar) { - Cell.Char = L' '; + Cell.Char = bad_char_replacement; flags::clear(Cell.Attributes.Flags, COMMON_LVB_TRAILING_BYTE); } else if (Cell.Char == *LeadingChar) @@ -820,6 +855,10 @@ namespace console_detail continue; } } + else if (!n && encoding::utf16::is_low_surrogate(Cell.Char)) + { + Cell.Char = bad_char_replacement; + } LeadingChar.reset(); @@ -828,13 +867,17 @@ namespace console_detail if (n == Input.size() - 1) { flags::clear(Cell.Attributes.Flags, COMMON_LVB_LEADING_BYTE); - Cell.Char = L' '; + Cell.Char = bad_char_replacement; } else { LeadingChar = Cell.Char; } } + else if (n == Input.size() - 1 && encoding::utf16::is_high_surrogate(Cell.Char)) + { + Cell.Char = bad_char_replacement; + } } if (!LastColor.has_value() || Cell.Attributes != *LastColor) @@ -846,7 +889,7 @@ namespace console_detail if (CharWidthEnabled && Cell.Char == encoding::replace_char && Cell.Attributes.Reserved[0] > std::numeric_limits::max()) { const auto Pair = encoding::utf16::to_surrogate(Cell.Attributes.Reserved[0]); - Str.append(ALL_CONST_RANGE(Pair)); + append(Str, Pair.first, Pair.second); } else { @@ -999,18 +1042,29 @@ namespace console_detail if (Cell.Attributes.Flags & COMMON_LVB_TRAILING_BYTE) { flags::clear(Cell.Attributes.Flags, COMMON_LVB_TRAILING_BYTE); - Cell.Char = L' '; + Cell.Char = bad_char_replacement; + } + else if (encoding::utf16::is_low_surrogate(Cell.Char)) + { + Cell.Char = bad_char_replacement; } } if (Col != SubRect.width() - 1) { - sanitise_dbsc_pair(Cell, Buffer[SubRect.top + Row][SubRect.left + Col + 1]); + sanitise_pair(Cell, Buffer[SubRect.top + Row][SubRect.left + Col + 1]); } - else if (Cell.Attributes.Flags & COMMON_LVB_LEADING_BYTE) + else { - flags::clear(Cell.Attributes.Flags, COMMON_LVB_LEADING_BYTE); - Cell.Char = L' '; + if (Cell.Attributes.Flags & COMMON_LVB_LEADING_BYTE) + { + flags::clear(Cell.Attributes.Flags, COMMON_LVB_LEADING_BYTE); + Cell.Char = bad_char_replacement; + } + else if (encoding::utf16::is_high_surrogate(Cell.Char)) + { + Cell.Char = bad_char_replacement; + } } ConsoleBuffer.emplace_back(CHAR_INFO{ { ReplaceControlCharacter(Cell.Char) }, colors::FarColorToConsoleColor(Cell.Attributes) }); @@ -1618,8 +1672,9 @@ namespace console_detail return false; DWORD Written; - auto Pair = encoding::utf16::to_surrogate(Codepoint); - if (!WriteConsole(m_WidthTestScreen.native_handle(), Pair.data(), Pair[1]? 2 : 1, &Written, {})) + const auto Pair = encoding::utf16::to_surrogate(Codepoint); + std::array Chars = { Pair.first, Pair.second }; + if (!WriteConsole(m_WidthTestScreen.native_handle(), Chars.data(), Pair.second? 2 : 1, &Written, {})) return false; CONSOLE_SCREEN_BUFFER_INFO Info; diff --git a/far/edit.cpp b/far/edit.cpp index b5a1f089fb..ab877a2082 100644 --- a/far/edit.cpp +++ b/far/edit.cpp @@ -670,6 +670,7 @@ bool Edit::ProcessKey(const Manager::Key& Key) { AdjustPersistentMark(); + const auto SavedCurPos = m_CurPos; RecurseProcessKey(KEY_LEFT); if (!m_Flags.Check(FEDITLINE_MARKINGBLOCK)) @@ -682,7 +683,7 @@ bool Edit::ProcessKey(const Manager::Key& Key) Select(m_SelStart,m_CurPos); else { - int EndPos=m_CurPos+1; + int EndPos = SavedCurPos; int NewStartPos=m_CurPos; if (EndPos>m_Str.size()) @@ -703,23 +704,27 @@ bool Edit::ProcessKey(const Manager::Key& Key) { AdjustPersistentMark(); + const auto SavedCurPos = m_CurPos; + RecurseProcessKey(KEY_RIGHT); + if (!m_Flags.Check(FEDITLINE_MARKINGBLOCK)) { RemoveSelection(); m_Flags.Set(FEDITLINE_MARKINGBLOCK); } - if ((m_SelStart!=-1 && m_SelEnd==-1) || m_SelEnd>m_CurPos) + if ((m_SelStart != -1 && m_SelEnd == -1) || m_SelEnd > SavedCurPos) { - if (m_CurPos+1==m_SelEnd) + if (m_CurPos == m_SelEnd) RemoveSelection(); else - Select(m_CurPos+1,m_SelEnd); + Select(m_CurPos, m_SelEnd); } else - AddSelect(m_CurPos,m_CurPos+1); + AddSelect(SavedCurPos, m_CurPos); + + Show(); - RecurseProcessKey(KEY_RIGHT); return true; } case KEY_CTRLSHIFTLEFT: case KEY_CTRLSHIFTNUMPAD4: @@ -1044,6 +1049,10 @@ bool Edit::ProcessKey(const Manager::Key& Key) { SetPrevCurPos(m_CurPos); m_CurPos--; + + if (m_CurPos && is_valid_surrogate_pair_at(m_CurPos - 1)) + --m_CurPos; + Show(); } @@ -1064,6 +1073,9 @@ bool Edit::ProcessKey(const Manager::Key& Key) else m_CurPos++; + if (m_CurPos && is_valid_surrogate_pair_at(m_CurPos - 1)) + ++m_CurPos; + Show(); return true; } @@ -2386,6 +2398,12 @@ Editor* Edit::GetEditor() const return nullptr; } +bool Edit::is_valid_surrogate_pair_at(size_t const Position) const +{ + string_view const Str(m_Str); + return Position < Str.size() && is_valid_surrogate_pair(Str.substr(Position)); +} + #ifdef ENABLE_TESTS #include "testing.hpp" diff --git a/far/edit.hpp b/far/edit.hpp index 4d84f1a53f..5b7eb8aa4d 100644 --- a/far/edit.hpp +++ b/far/edit.hpp @@ -231,6 +231,8 @@ class Edit: public SimpleScreenObject void SetRightCoord(int Value) { SetPosition({ m_Where.left, m_Where.top, Value, m_Where.bottom }); } Editor* GetEditor() const; + bool is_valid_surrogate_pair_at(size_t Position) const; + protected: // BUGBUG: the whole purpose of this class is to avoid zillions of casts in existing code by returning size() as int // Remove it after fixing all signed/unsigned mess diff --git a/far/editor.cpp b/far/editor.cpp index f8600e19be..62c55f1396 100644 --- a/far/editor.cpp +++ b/far/editor.cpp @@ -1069,37 +1069,38 @@ bool Editor::ProcessKeyInternal(const Manager::Key& Key, bool& Refresh) if (!CurPos && m_it_CurLine == Lines.begin()) return true; - if (!CurPos) //курсор в начале строки + const auto OldCur = m_it_CurLine; + Pasting++; + ProcessKeyInternal(Manager::Key(KEY_LEFT), Refresh); + Pasting--; + + if (OldCur == m_it_CurLine) { - const auto PrevLine = std::prev(m_it_CurLine); - if (SelAtBeginning) //курсор в начале блока + if (SelAtBeginning || SelFirst) { - m_it_AnyBlockStart = PrevLine; - PrevLine->Select(PrevLine->GetLength(), -1); + m_it_CurLine->Select(m_it_CurLine->GetCurPos(), SelEnd); } - else // курсор в конце блока + else { - m_it_CurLine->RemoveSelection(); - PrevLine->GetRealSelection(SelStart, SelEnd); - PrevLine->Select(SelStart, PrevLine->GetLength()); + m_it_CurLine->Select(SelStart, m_it_CurLine->GetCurPos()); } } else { - if (SelAtBeginning || SelFirst) + const auto PrevLine = std::prev(m_it_CurLine); + if (SelAtBeginning) //курсор в начале блока { - m_it_CurLine->Select(SelStart-1,SelEnd); + m_it_AnyBlockStart = m_it_CurLine; + m_it_CurLine->Select(m_it_CurLine->GetLength(), -1); } - else + else // курсор в конце блока { - m_it_CurLine->Select(SelStart,SelEnd-1); + OldCur->RemoveSelection(); + m_it_CurLine->GetRealSelection(SelStart, SelEnd); + m_it_CurLine->Select(SelStart, PrevLine->GetLength()); } } - Pasting++; - ProcessKeyInternal(Manager::Key(KEY_LEFT), Refresh); - Pasting--; - Refresh = true; return true; } @@ -1110,21 +1111,23 @@ bool Editor::ProcessKeyInternal(const Manager::Key& Key, bool& Refresh) return true; } - if (SelAtBeginning) - { - m_it_CurLine->Select(SelStart+1,SelEnd); - } - else - { - m_it_CurLine->Select(SelStart,SelEnd+1); - } - const auto OldCur = m_it_CurLine; Pasting++; ProcessKeyInternal(Manager::Key(KEY_RIGHT), Refresh); Pasting--; - if (OldCur != m_it_CurLine) + if (OldCur == m_it_CurLine) + { + if (SelAtBeginning) + { + m_it_CurLine->Select(m_it_CurLine->GetCurPos(), SelEnd); + } + else + { + m_it_CurLine->Select(SelStart, m_it_CurLine->GetCurPos()); + } + } + else { if (SelAtBeginning) { diff --git a/far/encoding.cpp b/far/encoding.cpp index 0e1b87dab6..93881df35e 100644 --- a/far/encoding.cpp +++ b/far/encoding.cpp @@ -1179,10 +1179,7 @@ size_t Utf8::get_char(std::string_view::const_iterator& StrIterator, std::string else { // legal 4-byte (produces 2 WCHARs) - const auto FullChar = utf8::extract(c1, c2, c3, c4) - 0b1'00000000'00000000; - - First = utf16::surrogate_high_first + (FullChar >> 10); - Second = utf16::surrogate_low_first + (FullChar & 0b00000011'11111111); + std::tie(First, Second) = encoding::utf16::to_surrogate(utf8::extract(c1, c2, c3, c4)); NumberOfChars = 2; StrIterator += 3; } @@ -1274,13 +1271,11 @@ static size_t utf8_get_bytes(string_view const Str, span const Buffer) BytesNumber = 1; Char &= 0b11111111; } - else if (StrIterator != StrEnd && - in_closed_range(utf16::surrogate_high_first, Char, utf16::surrogate_high_last) && - in_closed_range(utf16::surrogate_low_first, *StrIterator, utf16::surrogate_low_last)) + else if (StrIterator != StrEnd && encoding::utf16::is_valid_surrogate_pair(Char, *StrIterator)) { // valid surrogate pair BytesNumber = 4; - Char = 0b1'00000000'00000000u + ((Char - utf16::surrogate_high_first) << 10) + (*StrIterator++ - utf16::surrogate_low_first); + Char = encoding::utf16::extract_codepoint(Char, *StrIterator++); } else { @@ -1313,27 +1308,40 @@ static size_t utf8_get_bytes(string_view const Str, span const Buffer) return RequiredCapacity; } -unsigned int encoding::utf16::extract_codepoint(string_view const Str) +bool encoding::utf16::is_high_surrogate(wchar_t const Char) +{ + return in_closed_range(::utf16::surrogate_high_first, Char, ::utf16::surrogate_high_last); +} + +bool encoding::utf16::is_low_surrogate(wchar_t const Char) +{ + return in_closed_range(::utf16::surrogate_low_first, Char, ::utf16::surrogate_low_last); +} + +bool encoding::utf16::is_valid_surrogate_pair(wchar_t const First, wchar_t const Second) +{ + return is_high_surrogate(First) && is_low_surrogate(Second); +} + +unsigned int encoding::utf16::extract_codepoint(wchar_t const First, wchar_t const Second) { static_assert(sizeof(wchar_t) == 2); + return 0b1'00000000'00000000u + ((First - ::utf16::surrogate_high_first) << 10) + (Second - ::utf16::surrogate_low_first); +} - if ( - Str.size() > 1 && - in_closed_range(::utf16::surrogate_high_first, Str[0], ::utf16::surrogate_high_last) && - in_closed_range(::utf16::surrogate_low_first, Str[1], ::utf16::surrogate_low_last) - ) - { - // valid surrogate pair - return 0b1'00000000'00000000u + ((Str[0] - ::utf16::surrogate_high_first) << 10) + (Str[1] - ::utf16::surrogate_low_first); - } +unsigned int encoding::utf16::extract_codepoint(string_view const Str) +{ + static_assert(sizeof(wchar_t) == 2); - return Str.front(); + return Str.size() > 1 && is_valid_surrogate_pair(Str[0], Str[1])? + extract_codepoint(Str[0], Str[1]) : + Str.front(); } -std::array encoding::utf16::to_surrogate(unsigned int const Codepoint) +std::pair encoding::utf16::to_surrogate(unsigned int const Codepoint) { if (Codepoint <= std::numeric_limits::max()) - return { static_cast(Codepoint) }; + return { static_cast(Codepoint), 0 }; const auto TwentyBits = Codepoint - 0b1'00000000'00000000u; const auto TenBitsMask = 0b11'11111111; @@ -1826,7 +1834,8 @@ TEST_CASE("encoding.utf16.surrogate") REQUIRE(i.Codepoint == Codepoint); const auto Pair = encoding::utf16::to_surrogate(i.Codepoint); - REQUIRE(i.Pair == Pair); + REQUIRE(i.Pair[0] == Pair.first); + REQUIRE(i.Pair[1] == Pair.second); } } diff --git a/far/encoding.hpp b/far/encoding.hpp index c262549030..986a55fb7f 100644 --- a/far/encoding.hpp +++ b/far/encoding.hpp @@ -196,8 +196,12 @@ namespace encoding namespace utf16 { + bool is_high_surrogate(wchar_t Char); + bool is_low_surrogate(wchar_t Char); + bool is_valid_surrogate_pair(wchar_t First, wchar_t Second); + unsigned int extract_codepoint(wchar_t First, wchar_t Second); unsigned int extract_codepoint(string_view Str); - std::array to_surrogate(unsigned int Codepoint); + std::pair to_surrogate(unsigned int Codepoint); } } diff --git a/far/interf.cpp b/far/interf.cpp index e8bd474b3e..92df8001cd 100644 --- a/far/interf.cpp +++ b/far/interf.cpp @@ -1194,6 +1194,19 @@ size_t visual_pos_to_string_pos(string_view Str, size_t const Pos) return StrSize - Str.size() + (Pos > Size? Pos - Size : 0); } +bool is_valid_surrogate_pair(string_view const Str) +{ + if (Str.size() < 2) + return false; + + return encoding::utf16::is_valid_surrogate_pair(Str[0], Str[1]); +} + +bool is_valid_surrogate_pair(wchar_t First, wchar_t Second) +{ + return encoding::utf16::is_valid_surrogate_pair(First, Second); +} + void GetText(rectangle Where, matrix& Dest) { Global->ScrBuf->Read(Where, Dest); diff --git a/far/interf.hpp b/far/interf.hpp index 972da2e6d1..95cc62f9b7 100644 --- a/far/interf.hpp +++ b/far/interf.hpp @@ -161,6 +161,9 @@ bool DoWeReallyHaveToScroll(short Rows); size_t string_length_to_visual(string_view Str); size_t visual_pos_to_string_pos(string_view Str, size_t Pos); +bool is_valid_surrogate_pair(string_view Str); +bool is_valid_surrogate_pair(wchar_t First, wchar_t Second); + void Text(point Where, const FarColor& Color, string_view Str); size_t Text(string_view Str, size_t MaxWidth); diff --git a/far/scrbuf.cpp b/far/scrbuf.cpp index 24f93df530..0dde9cd45a 100644 --- a/far/scrbuf.cpp +++ b/far/scrbuf.cpp @@ -446,7 +446,20 @@ void ScreenBuf::Flush(flush_type FlushType) // Skip setting cursor position if it's not in the viewport to prevent Windows from repositioning the console window if (!SBFlags.Check(SBFLAGS_FLUSHEDCURPOS) && IsCursorInBuffer && console.IsPositionVisible(m_CurPos)) { - console.SetCursorPosition(m_CurPos); + auto CorrectedPosition = m_CurPos; + + if (m_CurPos.x > 0) + { + const auto& Cell = Buf[m_CurPos.y][m_CurPos.x]; + const auto& PrevCell = Buf[m_CurPos.y][m_CurPos.x - 1]; + + if (is_valid_surrogate_pair(PrevCell.Char, Cell.Char) || (char_width::is_enabled() && Cell.Attributes.Flags & COMMON_LVB_TRAILING_BYTE)) + { + --CorrectedPosition.x; + } + } + + console.SetCursorPosition(CorrectedPosition); SBFlags.Set(SBFLAGS_FLUSHEDCURPOS); } @@ -478,25 +491,17 @@ void ScreenBuf::MoveCursor(point const Point) { SCOPED_ACTION(std::lock_guard)(CS); - const auto IsNewPositionVisible = is_visible(Point); + if (Point == m_CurPos) + return; - auto CorrectedPoint = Point; - if (char_width::is_enabled() && IsNewPositionVisible && Point.x > 0 && Buf[Point.y][Point.x].Attributes.Flags & COMMON_LVB_TRAILING_BYTE) - { - --CorrectedPoint.x; - } + m_CurPos = Point; - if(Point != m_CurPos) + if (!is_visible(m_CurPos)) { - m_CurPos = CorrectedPoint; - - if (!IsNewPositionVisible) - { - CurVisible = false; - } - - SBFlags.Clear(SBFLAGS_FLUSHEDCURPOS); + CurVisible = false; } + + SBFlags.Clear(SBFLAGS_FLUSHEDCURPOS); } point ScreenBuf::GetCursorPos() const diff --git a/far/vbuild.m4 b/far/vbuild.m4 index 8244d81e95..5c22f3f453 100644 --- a/far/vbuild.m4 +++ b/far/vbuild.m4 @@ -1 +1 @@ -5813 +5814