From 2a62f8e08fe192ca6dad6b1dbf180fbee6da546e Mon Sep 17 00:00:00 2001 From: Arnaud Bouchez Date: Mon, 25 Jan 2021 12:54:17 +0100 Subject: [PATCH] properly implement and document the Unicode Replacement Character --- src/core/mormot.core.search.pas | 4 +-- src/core/mormot.core.unicode.pas | 56 +++++++++++++++++++------------- src/orm/mormot.orm.core.pas | 3 +- 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/core/mormot.core.search.pas b/src/core/mormot.core.search.pas index 4d5ce68db..b2458e5b0 100644 --- a/src/core/mormot.core.search.pas +++ b/src/core/mormot.core.search.pas @@ -1105,7 +1105,7 @@ TSynValidateText = class(TSynValidate) read fProps[9] write fProps[9]; /// defines if lengths parameters expects UTF-8 or UTF-16 codepoints number // - with default FALSE, the length is calculated with UTF-16 Unicode - // codepoints - MaxLength may not match the Ucs4 glyphs number, in case of + // codepoints - MaxLength may not match the UCS4 CodePoint, in case of // UTF-16 surrogates // - you can set this property to TRUE so that the UTF-8 byte count would // be used for truncation againts the MaxLength parameter @@ -1222,7 +1222,7 @@ TSynFilterTruncate = class(TSynFilter) read fMaxLength write fMaxLength; /// defines if MaxLength is stored as UTF-8 or UTF-16 codepoints number // - with default FALSE, the length is calculated with UTF-16 Unicode - // codepoints - MaxLength may not match the Ucs4 glyphs number, in case of + // codepoints - MaxLength may not match the UCS4 CodePoint, in case of // UTF-16 surrogates // - you can set this property to TRUE so that the UTF-8 byte count would // be used for truncation againts the MaxLength parameter diff --git a/src/core/mormot.core.unicode.pas b/src/core/mormot.core.unicode.pas index fa9a05f1e..e6a854166 100644 --- a/src/core/mormot.core.unicode.pas +++ b/src/core/mormot.core.unicode.pas @@ -68,8 +68,14 @@ TUtf8Table = record UTF16_LOSURROGATE_MIN = $dc00; UTF16_LOSURROGATE_MAX = $dfff; + /// replace any incoming character whose value is unrepresentable in Unicode + // - set e.g. by GetUtf8WideChar(), Utf8UpperReference() or + // RawUnicodeToUtf8() when ccfReplacementCharacterForUnmatchedSurrogate is set + // - encoded as $ef $bf $bd bytes in UTF-8 + UNICODE_REPLACEMENT_CHARACTER = $fffd; -/// internal function, used to retrieve a Ucs4 codepoint (>127) from UTF-8 + +/// internal function, used to retrieve a UCS4 CodePoint (>127) from UTF-8 // - not to be called directly, but from inlined higher-level functions // - here U^ shall be always >= #80 // - typical use is as such: @@ -79,12 +85,12 @@ TUtf8Table = record // ! ch := GetHighUtf8Ucs4(P); function GetHighUtf8Ucs4(var U: PUtf8Char): PtrUInt; -/// get the WideChar stored in P^ (decode UTF-8 if necessary) -// - any surrogate (Ucs4>$ffff) will be returned as '?' -function GetUtf8Char(P: PUtf8Char): cardinal; +/// decode UTF-16 WideChar from UTF-8 input buffer +// - any surrogate (Ucs4>$ffff) is returned as UNICODE_REPLACEMENT_CHARACTER=$fffd +function GetUtf8WideChar(P: PUtf8Char): cardinal; {$ifdef HASINLINE}inline;{$endif} -/// get the Ucs4 char stored in P^ (decode UTF-8 if necessary) +/// get the UCS4 CodePoint stored in P^ (decode UTF-8 if necessary) function NextUtf8Ucs4(var P: PUtf8Char): cardinal; {$ifdef HASINLINE}inline;{$endif} @@ -94,13 +100,13 @@ function NextUtf8Ucs4(var P: PUtf8Char): cardinal; function WideCharToUtf8(Dest: PUtf8Char; aWideChar: PtrUInt): integer; {$ifdef HASINLINE}inline;{$endif} - /// UTF-8 encode one UTF-16 encoded Ucs4 character into Dest + /// UTF-8 encode one UTF-16 encoded UCS4 CodePoint into Dest // - return the number of bytes written into Dest (i.e. from 1 up to 6) // - Source will contain the next UTF-16 character // - this method DOES handle UTF-16 surrogate pairs function Utf16CharToUtf8(Dest: PUtf8Char; var Source: PWord): integer; -/// UTF-8 encode one Ucs4 character into Dest +/// UTF-8 encode one UCS4 CodePoint into Dest // - return the number of bytes written into Dest (i.e. from 1 up to 6) // - this method DOES handle UTF-16 surrogate pairs function Ucs4ToUtf8(ucs4: cardinal; Dest: PUtf8Char): PtrInt; @@ -126,8 +132,8 @@ function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; // since Delphi 2009+ // - append a trailing #0 to the ending PUtf8Char, unless ccfNoTrailingZero is set // - if ccfReplacementCharacterForUnmatchedSurrogate is set, this function will identify -// unmatched surrogate pairs and replace them with EF BF BD / FFFD Unicode -// Replacement character - see https://en.wikipedia.org/wiki/Specials_(Unicode_block) +// unmatched surrogate pairs and replace them with UNICODE_REPLACEMENT_CHARACTER - +// see https://en.wikipedia.org/wiki/Specials_(Unicode_block) function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar; SourceLen: PtrInt; Flags: TCharConversionFlags): PtrInt; overload; @@ -161,10 +167,11 @@ function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char; MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean = false): PtrInt; overload; /// direct conversion of a UTF-8 encoded buffer into a WinAnsi shortstring buffer +// - non WinAnsi chars are replaced by '?' placeholders procedure Utf8ToShortString(var dest: shortstring; source: PUtf8Char); /// calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^ -// - count may not match the Ucs4 glyphs number, in case of UTF-16 surrogates +// - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates // - faster than System.Utf8ToUnicode with dest=nil function Utf8ToUnicodeLength(source: PUtf8Char): PtrUInt; @@ -192,7 +199,7 @@ function IsValidUtf8WithoutControlChars(const source: RawUtf8): boolean; overloa /// will truncate the supplied UTF-8 value if its length exceeds the specified // UTF-16 Unicode characters count -// - count may not match the Ucs4 glyphs number, in case of UTF-16 surrogates +// - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates // - returns FALSE if text was not truncated, TRUE otherwise function Utf8TruncateToUnicodeLength(var text: RawUtf8; maxUtf16: integer): boolean; @@ -219,7 +226,7 @@ function Utf8TruncatedLength(text: PAnsiChar; textlen, maxBytes: PtrUInt): PtrInt; overload; /// calculate the UTF-16 Unicode characters count of the UTF-8 encoded first line -// - count may not match the Ucs4 glyphs number, in case of UTF-16 surrogates +// - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates // - end the parsing at first #13 or #10 character function Utf8FirstLineToUnicodeLength(source: PUtf8Char): PtrInt; @@ -377,6 +384,7 @@ TSynAnsiFixedWidth = class(TSynAnsiConvert) /// direct conversion of an UTF-8 encoded buffer into a PAnsiChar buffer // - Dest^ buffer must be reserved with at least SourceChars bytes // - no trailing #0 is appended to the buffer + // - non Ansi compatible characters are replaced as '?' function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; override; /// conversion of a wide char into the corresponding Ansi character @@ -1151,9 +1159,9 @@ function StrCompIL(P1, P2: pointer; L: PtrInt; Default: PtrInt = 0): PtrInt; function StrIComp(Str1, Str2: pointer): PtrInt; {$ifdef HASINLINE}inline;{$endif} -/// retrieve the next Ucs4 value stored in U, then update the U pointer +/// retrieve the next UCS4 CodePoint stored in U, then update the U pointer // - this function will decode the UTF-8 content before using NormToUpper[] -// - will return '?' if the Ucs4 value is higher than #255: so use this function +// - will return '?' if the UCS4 CodePoint is higher than #255: so use this function // only if you need to deal with ASCII characters (e.g. it's used for Soundex // and for ContainsUTF8 function) function GetNextUtf8Upper(var U: PUtf8Char): PtrUInt; @@ -1384,6 +1392,7 @@ function AnsiIComp(Str1, Str2: pointer): PtrInt; // - won't call the Operating System, so is consistent on all platforms, // whereas UpperCaseUnicode() may vary depending on each library implementation // - some codepoints enhance in length, so D^ should be at least twice than S^ +// - any invalid input is replaced by UNICODE_REPLACEMENT_CHARACTER=$fffd // - won't use temporary UTF-16 decoding, and optimized for plain ASCII content function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char; @@ -1453,7 +1462,7 @@ function GetHighUtf8Ucs4(var U: PUtf8Char): PtrUInt; result := c; end; -function GetUtf8Char(P: PUtf8Char): cardinal; +function GetUtf8WideChar(P: PUtf8Char): cardinal; begin if P <> nil then begin @@ -1462,7 +1471,8 @@ function GetUtf8Char(P: PUtf8Char): cardinal; begin result := GetHighUtf8Ucs4(P); if result > $ffff then - result := ord('?'); // do not handle surrogates now + // surrogates can't be stored in a single UTF-16 WideChar + result := UNICODE_REPLACEMENT_CHARACTER; end; end else @@ -1483,7 +1493,7 @@ function NextUtf8Ucs4(var P: PUtf8Char): cardinal; inc(P, 2); end else - result := GetHighUtf8Ucs4(P); // handle even surrogates + result := GetHighUtf8Ucs4(P); // handle even UTF-16 surrogates end else result := 0; @@ -1626,7 +1636,7 @@ function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar; inc(Dest, 2); until (Source > Tail) or (PtrInt(PtrUInt(Dest)) >= DestLen); - // generic loop, handling one Ucs4 char per iteration + // generic loop, handling one UCS4 CodePoint per iteration if (PtrInt(PtrUInt(Dest)) < DestLen) and (PtrInt(PtrUInt(Source)) < SourceLen) then repeat @@ -1652,7 +1662,7 @@ function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar; unmatch: if (PtrInt(PtrUInt(@Dest[3])) > DestLen) or not (ccfReplacementCharacterForUnmatchedSurrogate in Flags) then break; - PWord(Dest)^ := $BFEF; // store Unicode Replacement Char + PWord(Dest)^ := $BFEF; // UTF-8 UNICODE_REPLACEMENT_CHARACTER Dest[2] := AnsiChar($BD); inc(Dest, 3); if (PtrInt(PtrUInt(Dest)) < DestLen) and @@ -4986,7 +4996,7 @@ function Utf8IComp(u1, u2: PUtf8Char): PtrInt; else begin result := GetHighUtf8Ucs4(u1); - if result and $ffffff00 = 0 then + if result <= 255 then result := table[result]; // 8 bits to upper, 32-bit as is end; if c2 <= 127 then @@ -5476,7 +5486,7 @@ function Utf8UpperCopy(Dest, Source: PUtf8Char; SourceChars: cardinal): PUtf8Cha Dest[3] := up[ToByte(c shr 24)]; inc(Dest, 4); until Source > endSourceBy4; - // generic loop, handling one Ucs4 char per iteration + // generic loop, handling one UCS4 CodePoint per iteration if Source < endSource then repeat By1: c := byte(Source^); @@ -6091,7 +6101,7 @@ function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char; c := GetHighUtf8Ucs4(S2); // handle even surrogates S := S2; if c = 0 then - c := ord('?'); // PlaceHolder for invalid UTF-8 input + c := UNICODE_REPLACEMENT_CHARACTER; // =$fffd for invalid input end; if c <= UU_MAX then c := tab.Ucs4Upper(c); @@ -6239,9 +6249,9 @@ function Utf8ILCompReference(u1, u2: PUtf8Char; L1, L2: integer): PtrInt; if c2 <= 127 then begin inc(c2, tab.Block[0, c2]); - dec(result, c2); dec(L2); inc(u2); + dec(result, c2); if result <> 0 then // found unmatching char exit diff --git a/src/orm/mormot.orm.core.pas b/src/orm/mormot.orm.core.pas index 18d5a0bde..a6299c927 100644 --- a/src/orm/mormot.orm.core.pas +++ b/src/orm/mormot.orm.core.pas @@ -10171,7 +10171,8 @@ procedure TOrmPropInfoRttiChar.SetValue(Instance: TObject; Value: PUtf8Char; if (Value = nil) or (PInteger(Value)^ = NULL_LOW) then i := 0 else - i := GetUtf8Char(Value); + // decode one UTF-16 or return UNICODE_REPLACEMENT_CHARACTER + i := GetUtf8WideChar(Value); fPropInfo.SetOrdProp(Instance, i); end;