diff --git a/source/numem/unicode/package.d b/source/numem/unicode/package.d
index 63fc047..3e09e82 100644
--- a/source/numem/unicode/package.d
+++ b/source/numem/unicode/package.d
@@ -12,8 +12,16 @@ alias codepoint = uint;
     Validates whether the codepoint is within spec
 */
 bool validate(codepoint code) {
-    return code <= 0x10FFFF;
+    return code <= 0x10FFFF && !hasSurrogatePairs(code);
 }
+
+/**
+    Gets whether the codepoint lies in the range reserved for UTF-16 surrogates (0xD800 to 0xDFFF).
+*/
+bool hasSurrogatePairs(codepoint code) {
+    return (code >= 0x0000D800 && code <= 0x0000DFFF);
+}
+
 /**
     Validates whether the codepoint is within spec
 */
diff --git a/source/numem/unicode/utf16.d b/source/numem/unicode/utf16.d
new file mode 100644
index 0000000..22ed84b
--- /dev/null
+++ b/source/numem/unicode/utf16.d
@@ -0,0 +1,201 @@
+module numem.unicode.utf16;
+import numem.unicode;
+import numem.mem.string;
+import numem.mem.vector;
+
+nothrow @nogc:
+
+private {
+
+    // Surrogate mask: the top 6 bits tell lead and trail surrogates apart
+    enum ushort utf16_smask = 0b11111100_00000000;
+
+    // Data mask: the low 10 payload bits carried by each surrogate
+    enum ushort utf16_dmask = cast(ushort)(~utf16_smask);
+
+    /// Leading surrogate
+    enum wchar utf16_lead = 0b11011000_00000000;
+
+    /// Trailing surrogate
+    enum wchar utf16_trail = 0b11011100_00000000;
+}
+
+/**
+    Validates whether the given character is a valid UTF-16 sequence
+
+    The second unit is only inspected when the first unit is a
+    leading surrogate.
+*/
+bool validate(wchar[2] c) {
+
+    // Any unit outside the surrogate range is a complete character.
+    if (c[0] <= 0xD7FF || c[0] >= 0xE000) return true;
+
+    // Otherwise a leading surrogate must be followed by a trailing one.
+    return (c[0] & utf16_smask) == utf16_lead && (c[1] & utf16_smask) == utf16_trail;
+}
+
+/**
+    Gets how many utf-16 units are in the specified character
+
+    Returns 0 for a trailing surrogate, which can't start a character.
+*/
+size_t getLength(wchar c) {
+    if (c <= 0xD7FF || c >= 0xE000) return 1;
+    if ((c & utf16_smask) == utf16_lead) return 2;
+    return 0;
+}
+
+@("UTF-16 char len")
+unittest {
+    assert('a'.getLength == 1);
+    assert('あ'.getLength == 1);
+    assert(utf16_lead.getLength() == 2); // Start of a surrogate pair
+    assert(utf16_trail.getLength() == 0); // Lone trailing surrogate
+}
+
+/**
+    Gets how many utf-16 units are in the specified codepoint
+
+    Returns 0 if the codepoint can't be represented.
+*/
+size_t getUTF16Length(codepoint code) {
+    if (code <= 0xD7FF || (code >= 0xE000 && code <= 0xFFFF)) return 1;
+    else if (code >= 0x010000 && code <= 0x10FFFF) return 2;
+    return 0;
+}
+
+@("UTF-16 codepoint len")
+unittest {
+    assert(0xF4.getUTF16Length == 1);
+    assert(0x10FFFF.getUTF16Length == 2);
+    assert(0x11FFFF.getUTF16Length == 0);
+    assert(0xD800.getUTF16Length == 0); // Surrogates are not codepoints
+}
+
+/**
+    Decodes a single utf-16 character
+
+    Params:
+        chr =  The units to decode, the second unit is only used
+               when the first unit is a leading surrogate.
+        read = Set to the number of units which were consumed.
+
+    Returns:
+        The decoded codepoint, or unicodeReplacementCharacter with
+        read set to 1 if the units are malformed.
+*/
+codepoint decode(wchar[2] chr, ref size_t read) {
+    read = chr[0].getLength();
+    switch(read) {
+        default:
+            read = 1;
+            return unicodeReplacementCharacter;
+
+        case 1:
+            return cast(codepoint)chr[0];
+
+        case 2:
+
+            // A leading surrogate without a trailing one is malformed,
+            // only the lead is consumed so the caller can resynchronize.
+            if ((chr[1] & utf16_smask) != utf16_trail) {
+                read = 1;
+                return unicodeReplacementCharacter;
+            }
+
+            // The lead holds the high 10 bits, the trail the low 10 bits.
+            codepoint hi = chr[0] & utf16_dmask;
+            codepoint lo = chr[1] & utf16_dmask;
+            return ((hi << 10) | lo) + 0x10000;
+    }
+}
+
+/**
+    Decodes a utf-16 string
+
+    Lone trailing surrogates and leading surrogates which are not
+    followed by a trailing surrogate are replaced with
+    unicodeReplacementCharacter.
+*/
+UnicodeSequence decode(nwstring str) {
+    UnicodeSequence code;
+
+    size_t i = 0;
+    while(i < str.size()) {
+        wchar[2] txt;
+
+        // Validate length, add FFFD if invalid.
+        size_t clen = str[i].getLength();
+        if (clen == 0 || i+clen > str.size()) {
+            code ~= unicodeReplacementCharacter;
+            i++;
+            continue;
+        }
+
+        txt[0..clen] = str[i..i+clen];
+        code ~= txt.decode(clen);
+        i += clen;
+    }
+
+    return code;
+}
+
+@("UTF-16 decode string")
+unittest {
+    codepoint[8] seq1 = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
+    codepoint[8] seq2 = [0x3053, unicodeReplacementCharacter, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
+    codepoint[2] seq3 = [0x1F600, 0x3053];
+    assert(decode(nwstring("こんにちは世界！"w))[0..$] == seq1);
+    assert(decode(nwstring("こ\uFFFDにちは世界！"w))[0..$] == seq2);
+    assert(decode(nwstring("\U0001F600こ"w))[0..$] == seq3); // Surrogate pair
+}
+
+/**
+    Encodes a unicode sequence to UTF-16
+
+    Codepoints which can't be represented in UTF-16 are skipped.
+*/
+nwstring encode(UnicodeSlice slice) {
+    nwstring out_;
+
+    size_t i = 0;
+    while(i < slice.length) {
+        size_t clen = slice[i].getUTF16Length();
+        if (clen == 1) {
+
+            // Basic Multilingual Plane, stored as a single unit.
+            out_ ~= cast(wchar)slice[i];
+        } else if (clen == 2) {
+
+            // Supplementary planes, split the 20-bit offset in to a surrogate pair.
+            codepoint c = slice[i] - 0x10000;
+
+            wchar[2] txt;
+            txt[0] = cast(wchar)(utf16_lead + (c >> 10));
+            txt[1] = cast(wchar)(utf16_trail + (c & utf16_dmask));
+            out_ ~= cast(wstring)txt[0..$];
+        }
+
+        i++;
+    }
+
+    return out_;
+}
+
+/**
+    Encodes a series of unicode codepoints to UTF-16
+*/
+nwstring encode(UnicodeSequence sequence) {
+    return encode(sequence[0..$]);
+}
+
+@("UTF-16 encode")
+unittest {
+    codepoint[8] seq1 = [0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
+    codepoint[8] seq2 = [0x3053, unicodeReplacementCharacter, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c, 0xff01];
+    codepoint[2] seq3 = [0x1F600, 0x3053];
+    assert(encode(seq1) == "こんにちは世界！"w);
+    assert(encode(seq2) == "こ\uFFFDにちは世界！"w);
+    assert(encode(seq3) == "\U0001F600こ"w); // Surrogate pair
+}
\ No newline at end of file
diff --git a/source/numem/unicode/utf8.d b/source/numem/unicode/utf8.d
index 1b0f2a5..dc6b85d 100644
--- a/source/numem/unicode/utf8.d
+++ b/source/numem/unicode/utf8.d
@@ -3,6 +3,9 @@ import numem.unicode;
 import numem.mem.string;
 import numem.mem.vector;
 
+// For some reason D really wants this import.
+import numem.unicode : validate;
+
 @nogc nothrow:
 
 private {
@@ -170,6 +173,29 @@ unittest {
     assert((0b10010101).getLength() == 0); // Malformed leading byte
 }
 
+/**
+    Gets how many utf-8 units (bytes) are in the specified codepoint
+
+    Returns 0 if the codepoint can't be represented.
+*/
+size_t getUTF8Length(codepoint code) {
+    if (code <= 0x7F) return 1;
+    else if (code >= 0x0080 && code <= 0x07FF) return 2;
+    else if ((code >= 0x0800 && code <= 0xD7FF) || (code >= 0xE000 && code <= 0xFFFF)) return 3;
+    else if (code >= 0x010000 && code <= 0x10FFFF) return 4;
+    return 0;
+}
+
+@("UTF-8 codepoint len")
+unittest {
+    assert(0x1.getUTF8Length == 1);
+    assert(0xF4.getUTF8Length == 2);
+    assert(0x0801.getUTF8Length == 3);
+    assert(0x010001.getUTF8Length == 4);
+    assert(0x11FFFF.getUTF8Length == 0);
+    assert(0xD800.getUTF8Length == 0); // Surrogates can't be encoded
+}
+
 /**
     Decodes a UTF-8 character
 
@@ -282,43 +308,50 @@ unittest {
 }
 
 /**
-    Encodes a series of unicode sequences to UTF-8
+    Encodes a series of unicode codepoints to UTF-8
 */
-nstring encode(UnicodeSlice sequence) {
+nstring encode(UnicodeSlice slice) {
     nstring out_;
 
     size_t i = 0;
-    while(i < sequence.length) {
+    while(i < slice.length) {
         ptrdiff_t count = 0;
         ptrdiff_t offset = 0;
-        if (sequence[i] <= utf8_ascii) {
+
+        // Skip invalid codepoints.
+        if (!slice[i].validate()) {
+            i++;
+            continue;
+        }
+
+        if (slice[i] <= utf8_ascii) {
 
             // Single-byte ascii
-            out_ ~= cast(char)sequence[i++];
+            out_ ~= cast(char)slice[i++];
             continue;
-        } else if (sequence[i] >= 0x0080 && sequence[i] <= 0x07FF) {
+        } else if (slice[i] >= 0x0080 && slice[i] <= 0x07FF) {
 
             // 2 byte
             count = 1;
             offset = 0xC0;
-        } else if (sequence[i] >= 0x0800 && sequence[i] <= 0xFFFF) {
+        } else if (slice[i] >= 0x0800 && slice[i] <= 0xFFFF) {
 
-            // 2 byte
+            // 3 byte
             count = 2;
             offset = 0xE0;
-        } else if (sequence[i] >= 0x10000 && sequence[i] <= 0x10FFFF) {
+        } else if (slice[i] >= 0x10000 && slice[i] <= 0x10FFFF) {
 
-            // 2 byte
+            // 4 byte
             count = 3;
             offset = 0xF0;
         }
-        
+
         // The magic where things get stitched back together.
         char[4] bytes;
-        bytes[0] = cast(ubyte)((sequence[i] >> (6 * count)) + offset);
+        bytes[0] = cast(ubyte)((slice[i] >> (6 * count)) + offset);
         size_t ix = 1;
         while (count > 0) {
-            size_t temp = sequence[i] >> (6 * (count - 1));
+            size_t temp = slice[i] >> (6 * (count - 1));
             bytes[ix++] = 0x80 | (temp & 0x3F);
             count--;
         }
@@ -331,7 +364,7 @@ nstring encode(UnicodeSlice sequence) {
 }
 
 /**
-    Encodes a series of unicode sequences to UTF-8
+    Encodes a series of unicode codepoints to UTF-8
 */
 nstring encode(UnicodeSequence sequence) {
     return encode(sequence[0..$]);