From 001e5a6b0f282196208ba908fa0850ca8b5778ff Mon Sep 17 00:00:00 2001
From: Luna
Date: Wed, 9 Oct 2024 11:01:11 +0200
Subject: [PATCH] Add UTF-32, BOM and string conversion support

---
Review notes (text in this section is ignored by git-am):
 - utf32: validate(dchar) no longer recurses into itself; decode() returns a
   UnicodeSequence and encode() returns an ndstring instead of going through
   (and truncating to) nwstring.
 - unicode/package: decode()/encode() dispatch is a proper else-static-if
   chain; getEndianFromBOM() is relative to the host byte order.
 - utf8/utf16: sequence bounds checks use `i + clen > length`.
 - utf16: decodeOne() advances through the string instead of spinning.
 - conv: toUTF16() returns a value.
 - The blob hashes on the `index` lines predate these edits.

 source/numem/conv.d                 |   8 +-
 source/numem/format.d               |  19 +--
 source/numem/io/endian.d            |  22 +++-
 source/numem/string.d               |  67 +++++++++-
 source/numem/text/encoding.d        | 101 +++++++++++++++
 source/numem/text/package.d         |   5 +-
 source/numem/text/unicode/package.d |  73 ++++++++++-
 source/numem/text/unicode/utf16.d   | 184 ++++++++++++++++++++++++++--
 source/numem/text/unicode/utf32.d   | 123 ++++++++++++++++++
 source/numem/text/unicode/utf8.d    |  35 ++++--
 10 files changed, 592 insertions(+), 45 deletions(-)
 create mode 100644 source/numem/text/encoding.d
 create mode 100644 source/numem/text/unicode/utf32.d

diff --git a/source/numem/conv.d b/source/numem/conv.d
index 76651de..a18432a 100644
--- a/source/numem/conv.d
+++ b/source/numem/conv.d
@@ -9,11 +9,9 @@ Utilities for converting between some basic types

 */
 module numem.conv;
-import numem.all;
 import core.stdc.stdlib;
 import std.traits;
-import numem.core.exception;
-import numem.format;
+import numem.all;

 @nogc:

@@ -148,6 +146,10 @@ nstring toString(T)(T item) if (__traits(hasMember, T, "toNString")) {
     return item.toNString();
 }

+nwstring toUTF16(T)(ref auto T str) if (isSomeString!T) {
+    return nwstring(str);
+}
+
 @("toString")
 unittest {
     assert((32u).toString() == "32");
diff --git a/source/numem/format.d b/source/numem/format.d
index 1f8e422..4acc4f7 100644
--- a/source/numem/format.d
+++ b/source/numem/format.d
@@ -4,27 +4,14 @@ import numem.text.ascii;
 import numem.conv;
 import numem.collections;

-import std.traits;
+import std.traits : isBasicType;

 private {
-    enum CanConvertToNString(T) =
-        __traits(hasMember, T, "toNString") &&
-        is(T.toNString : nstring function()) &&
-        hasUDA(T.toNString, nogc);
-
-    enum CanConvertToDString(T) =
-        __traits(hasMember, T, "toString") &&
-        is(T.toNString : string function()) &&
-        hasUDA(T.toNString, nogc);
-
     nstring _formatSingle(T)(T element) {
-        static if(CanConvertToNString!T) {
-
-            return element.toNString();
-        } else static if(CanConvertToDString!T) {
+        static if(isStringable!T) {

             return nstring(element.toString());
-        } else static if (is(T : string)) {
+        } else static if (isSomeString!T) {

             return nstring(element);
         } else static if (isBasicType!T) {
diff --git a/source/numem/io/endian.d b/source/numem/io/endian.d
index 8b33564..45206ed 100644
--- a/source/numem/io/endian.d
+++ b/source/numem/io/endian.d
@@ -56,7 +56,7 @@ private {

     Is no-op if provided endianness is the same as the system's
 */
-ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) if (isNumeric!T) {
+ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) {

     // Get bytes from value
     ubyte[T.sizeof] output;
@@ -74,6 +74,26 @@ ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) if (isNumeric!T) {
     return output;
 }

+/**
+    Flips the bytes in the provided value to be in the specified endianness.
+
+    Is no-op if provided endianness is the same as the system's
+*/
+T toEndianReinterpret(T)(T in_, Endianess endianness) {
+    if (endianness != NATIVE_ENDIAN) {
+        union tmp {
+            T value;
+            ubyte[T.sizeof] bytes;
+        }
+
+        tmp toConvert;
+        toConvert.bytes = toEndian!T(in_, endianness);
+        return toConvert.value;
+    }
+
+    return in_;
+}
+
 /**
     Gets a value from a different endianness.

diff --git a/source/numem/string.d b/source/numem/string.d
index d5c884e..36fbc2b 100644
--- a/source/numem/string.d
+++ b/source/numem/string.d
@@ -8,23 +8,67 @@ module numem.string;
 import numem.collections.vector;
 import numem.core;
 import std.string;
+import std.traits;
+import core.stdcpp.string;
+
+/// Gets whether the provided type is some type of string.
+enum isSomeString(T) =
+    isSomeSafeString!T ||
+    isSomeCString!T;
+
+/**
+    Gets whether the provided type is some type of string
+    which is length denoted and therefore "safe"
+*/
+enum isSomeSafeString(T) =
+    isSomeNString!T ||
+    isSomeDString!T;
+

 /// Gets whether the provided type is some type of nstring.
 enum isSomeNString(T) =
-    is(T == nstring) || is (T == nwstring) || is(T == ndstring);
+    is(inout(T) == inout(basic_string!C), C) && isSomeChar!C;

 /// Gets whether the provided type is some type of null terminated C string.
 enum isSomeCString(T) =
-    is(T : inout(char)*) || is(T : inout(wchar)*)|| is(T : inout(dchar)*);
+    is(inout(T) == inout(C)*, C) && isSomeChar!C;

 /// Gets whether the provided type is some type of D string slice.
 enum isSomeDString(T) =
-    is(T : inout(char)[]) || is(T : inout(wchar)[])|| is(T : inout(dchar)[]);
+    is(immutable(T) == immutable(C[]), C) && isSomeChar!C;

 /// Gets whether the provided type is a character
 enum isSomeChar(T) =
     is(T == char) || is(T == wchar) || is(T == dchar);

+/**
+    Gets whether [T] is convertible to any form of [nstring]
+*/
+enum isStringable(T) =
+    __traits(hasMember, T, "toString") &&
+    isSomeString!(ReturnType!(T.toString)) &&
+    hasUDA(T.toString, nogc);
+
+/**
+    Gets the size of the element in a string-ish type in bytes.
+*/
+enum StringCharSize(T) =
+    StringCharType!T.sizeof;
+
+/**
+    Gets the type of the element in a string-ish type.
+*/
+template StringCharType(T) {
+    static if (isSomeString!T) {
+        static if(isSomeNString!T)
+            alias StringCharType = T.valueType;
+        else
+            alias StringCharType = typeof(T.init[0].init);
+    } else {
+        alias StringCharType = void;
+    }
+}
+
 /**
     Basic string type.

@@ -98,6 +142,15 @@ public:
         this.set_(text);
     }

+    /**
+        Creates a string from a string with a different
+        encoding.
+    */
+    this(T)(ref auto T rhs) if (isSomeSafeString!T) {
+        import numem.text.unicode : decode, encode;
+        this = encode!selfType(decode!T(rhs));
+    }
+
     /**
         Makes a copy of a string
     */
@@ -521,4 +574,12 @@ unittest {
     assert(struct_[1].str == "b");

     assert(copy.size() == 0);
+}
+
+@("string: encoding-conversion")
+unittest {
+    nwstring wstr = "Hello, world!"w;
+    nstring str = wstr;
+
+    assert(str == "Hello, world!");
 }
\ No newline at end of file
diff --git a/source/numem/text/encoding.d b/source/numem/text/encoding.d
new file mode 100644
index 0000000..1d56dc6
--- /dev/null
+++ b/source/numem/text/encoding.d
@@ -0,0 +1,101 @@
+module numem.text.encoding;
+import numem.string;
+import numem.text.ascii;
+import numem.text.unicode;
+import numem.text.unicode.utf8;
+import numem.text.unicode.utf16;
+
+/**
+    Currently supported encodings
+*/
+enum Encoding {
+
+    /**
+        Unknown encoding
+    */
+    unknown,
+
+    /**
+        ASCII
+    */
+    ascii,
+
+    /**
+        UTF-8
+    */
+    utf8,
+
+    /**
+        UTF-16
+    */
+    utf16,
+
+    /**
+        UTF-16 w/ little endian BOM
+    */
+    utf16LE,
+
+    /**
+        UTF-16 w/ big endian BOM
+    */
+    utf16BE,
+
+    /**
+        UTF-32
+    */
+    utf32
+}
+
+/**
+    Gets the encoding of a run of text.
+*/
+Encoding getEncoding(T)(auto ref T str) @nogc if (isSomeString!T) {
+    static if (StringCharSize!T == 1) {
+        nstring nstr = str;
+
+        foreach(char c; str[]) {
+            if (!isASCII(c)) {
+                if (validate(nstr))
+                    return Encoding.utf8;
+                else
+                    return Encoding.unknown;
+            }
+        }
+        return Encoding.ascii;
+
+    } else static if (StringCharSize!T == 2) {
+
+        nwstring nstr = str;
+        auto bom = getBOM(nstr);
+        if (bom != 0) {
+            return bom == 0x0000FEFF ?
+                Encoding.utf16BE :
+                Encoding.utf16LE;
+        } else if (validate(nstr)) {
+
+            return Encoding.utf16;
+        }
+        return Encoding.unknown;
+
+    } else static if (StringCharSize!T == 4) {
+
+        return validate(str) ?
+            Encoding.utf32 :
+            Encoding.unknown;
+    } else {
+
+        return Encoding.unknown;
+    }
+}
+
+@("Get encoding")
+unittest {
+    import std.stdio : writeln;
+
+    assert("Hello, world!".getEncoding() == Encoding.ascii);
+    assert("あえおう".getEncoding() == Encoding.utf8);
+
+    assert("Hello, world!"w.getEncoding() == Encoding.utf16);
+    assert("\uFEFFHello, world!"w.getEncoding() == Encoding.utf16BE);
+    assert("\uFFFEHello, world!"w.getEncoding() == Encoding.utf16LE);
+}
\ No newline at end of file
diff --git a/source/numem/text/package.d b/source/numem/text/package.d
index a7bf380..967ecdf 100644
--- a/source/numem/text/package.d
+++ b/source/numem/text/package.d
@@ -8,4 +8,7 @@
 /**
     Numem text transformation utilities
 */
-module numem.text;
\ No newline at end of file
+module numem.text;
+
+public import numem.text.encoding;
+public import numem.text.ascii;
\ No newline at end of file
diff --git a/source/numem/text/unicode/package.d b/source/numem/text/unicode/package.d
index 96f9b48..5d6e2b6 100644
--- a/source/numem/text/unicode/package.d
+++ b/source/numem/text/unicode/package.d
@@ -5,8 +5,19 @@
     Authors: Luna the Foxgirl
 */

-module numem.unicode;
+module numem.text.unicode;
 import numem.collections.vector;
+import numem.io.endian;
+import numem.string;
+
+public import numem.text.unicode.utf8;
+public import numem.text.unicode.utf16;
+public import numem.text.unicode.utf32;
+
+// For encoding dispatch
+import utf8 = numem.text.unicode.utf8;
+import utf16 = numem.text.unicode.utf16;
+import utf32 = numem.text.unicode.utf32;

 @nogc nothrow:

@@ -29,6 +40,66 @@ bool hasSurrogatePairs(codepoint code) {
     return (code >= 0x0000D800 && code <= 0x0000DFFF);
 }

+/**
+    Gets whether the character is a BOM
+*/
+bool isBOM(codepoint c) {
+    return isLittleEndianBOM(c) || isBigEndianBOM(c);
+}
+
+/**
+    Gets whether the byte order mark is little endian
+*/
+pragma(inline, true)
+bool isLittleEndianBOM(codepoint c) {
+    return (c == 0xFFFE0000 || c == 0x0000FFFE);
+}
+
+/**
+    Gets whether the byte order mark is big endian
+*/
+pragma(inline, true)
+bool isBigEndianBOM(codepoint c) {
+    return (c == 0xFEFF0000 || c == 0x0000FEFF);
+}
+
+/**
+    Gets the endianess from a BOM
+*/
+Endianess getEndianFromBOM(codepoint c) {
+    // U+FEFF read in machine order is 0x0000FEFF; the 0xFFFE forms mean the units are byte-swapped.
+    const Endianess swapped = NATIVE_ENDIAN == Endianess.bigEndian ? Endianess.littleEndian : Endianess.bigEndian;
+    return (c == 0xFFFE0000 || c == 0x0000FFFE) ? swapped : NATIVE_ENDIAN;
+}
+
+/**
+    Decodes a string
+*/
+UnicodeSequence decode(T)(ref auto T str) if (isSomeSafeString!T) {
+    static if (StringCharSize!T == 1)
+        return utf8.decode(str);
+    else static if (StringCharSize!T == 2)
+        return utf16.decode(str);
+    else static if (StringCharSize!T == 4)
+        return utf32.decode(str[]);
+    else
+        assert(0, "String type not supported.");
+}
+
+/**
+    Encodes a string
+*/
+T encode(T)(ref auto UnicodeSequence seq) if (isSomeNString!T) {
+    static if (StringCharSize!T == 1)
+        return utf8.encode(seq);
+    else static if (StringCharSize!T == 2)
+        return utf16.encode(seq);
+    else static if (StringCharSize!T == 4)
+        return utf32.encode(seq);
+    else
+        assert(0, "String type not supported.");
+}
+
 /**
     Validates whether the codepoint is within spec
 */
diff --git a/source/numem/text/unicode/utf16.d b/source/numem/text/unicode/utf16.d
index 0740545..6afb964 100644
--- a/source/numem/text/unicode/utf16.d
+++ b/source/numem/text/unicode/utf16.d
@@ -5,10 +5,12 @@
     Authors: Luna the Foxgirl
 */

-module numem.unicode.utf16;
-import numem.unicode;
+module numem.text.unicode.utf16;
+import numem.text.unicode.utf32;
+import numem.text.unicode;
 import numem.collections.vector;
 import numem.string;
+import numem.io.endian;

 nothrow @nogc:

@@ -36,6 +38,82 @@ bool validate(wchar[2] c) {
         ((c[0] & utf16_smask) == utf16_lead && ((c[1] & utf16_smask) == utf16_trail));
 }

+/**
+    Validates whether the given nwstring is a valid UTF-16 string.
+
+    Strings without a BOM are assumed to be in machine-native
+    endianess.
+*/
+bool validate(nwstring str) {
+    return validate(str[]);
+}
+
+
+/**
+    Validates whether the given slice is a valid UTF-16 string.
+
+    Strings without a BOM are assumed to be in machine-native
+    endianess.
+*/
+bool validate(inout(wchar)[] str) {
+    nwstring tmp = str;
+
+    // Handle endianess.
+    codepoint bom = getBOM(str);
+    if (bom != 0 && getEndianFromBOM(bom) != NATIVE_ENDIAN) {
+        tmp = toMachineOrder(str);
+    }
+
+    size_t i = 0;
+    while(i < tmp.length) {
+        wchar[2] txt;
+
+        // Validate length
+        size_t clen = getLength(tmp[i]);
+        if (clen == 0) return false;
+        if (i + clen > tmp.length) return false;
+
+        txt[0..clen] = tmp[i..i+clen];
+        if (!validate(txt)) return false;
+
+        i += clen;
+    }
+
+    return true;
+}
+
+/**
+    Gets the BOM of the nwstring if it has one.
+
+    Otherwise returns a NUL character.
+*/
+codepoint getBOM(inout(wchar)[] str) {
+    if (str.length == 0)
+        return 0;
+
+    union tmp {
+        wchar c;
+        ubyte[2] bytes;
+    }
+    tmp tmp_;
+    tmp_.c = str[0];
+
+    if (isBOM(cast(codepoint)tmp_.c)) {
+        return cast(codepoint)tmp_.c;
+    }
+
+    return 0;
+}
+
+/**
+    Gets the BOM of the nwstring if it has one.
+
+    Otherwise returns a NUL character.
+*/
+codepoint getBOM(nwstring str) {
+    return getBOM(str[]);
+}
+
 /**
     Gets how many utf-16 units are in the specified character
 */
@@ -71,10 +149,52 @@ unittest {
 }

 /**
-    Decodes a single utf-16 character
+    Returns a string which is [str] converted to machine order.
+
+    If the string has no BOM it is assumed it's already in
+    machine order.
+*/
+nwstring toMachineOrder(inout(wchar)[] str) {
+
+    if (str.length == 0)
+        return nwstring.init;
+
+    codepoint bom = getBOM(str);
+    Endianess endian = getEndianFromBOM(bom);
+    if (bom != 0 && endian != NATIVE_ENDIAN) {
+
+        // Flip all the bytes around.
+        nwstring tmp;
+        foreach(i, ref const(wchar) c; str) {
+            tmp ~= c.toEndianReinterpret(endian);
+        }
+        return tmp;
+    }
+
+    // Already local order.
+    return nwstring(str);
+}
+
+/**
+    Returns a string which is [str] converted to machine order.
+
+    If the string has no BOM it is assumed it's already in
+    machine order.
+*/
+nwstring toMachineOrder(nwstring str) {
+    return toMachineOrder(str[]);
+}
+
+/**
+    Decodes a single utf-16 character,
+
+    Character is assumed to be in the same
+    endianness as the system!
 */
 codepoint decode(wchar[2] chr, ref size_t read) {
+    // Number of UTF-16 units this character spans.
     read = chr[0].getLength();
+
     switch(read) {
         default:
             read = 1;
@@ -93,18 +213,59 @@ codepoint decode(wchar[2] chr, ref size_t read) {
 }

 /**
-    Decodes a utf-16 string
+    Decodes a single utf-16 character from a
+    nwstring.
 */
-UnicodeSequence decode(nwstring str) {
+codepoint decodeOne(nwstring str, size_t offset = 0) {
+    if (str.length == 0)
+        return unicodeReplacementCharacter;
+
+    // Gets the string in the current machine order.
+    str = str.toMachineOrder();
+
+    // Walk forward [offset] characters to find the first unit to decode.
+    size_t pos = 0;
+    size_t read = getLength(str[0]);
+    for (size_t i = 0; i < offset; i++) {
+        pos += (read == 0) ? 1 : read;
+        // We're out of characters to read.
+        if (pos >= str.length)
+            return unicodeReplacementCharacter;
+
+        read = getLength(str[pos]);
+    }
+
+    // Truncated or invalid sequences decode to the replacement character.
+    if (read == 0 || pos + read > str.length) return unicodeReplacementCharacter;
+    wchar[2] tmp;
+    tmp[0..read] = str[pos..pos+read];
+    return decode(tmp, read);
+}
+
+/**
+    Decodes a UTF-16 string.
+
+    This function will automatically detect BOMs
+    and handle endianness where applicable.
+*/
+UnicodeSequence decode(nwstring str, bool stripBOM = false) {
     UnicodeSequence code;

+    // Gets the string in the current machine order.
+    nwstring tmp = str.toMachineOrder();
     size_t i = 0;
-    while(i < str.size()) {
+
+    // Strip BOM if there is one.
+    if (stripBOM && getBOM(tmp)) {
+        i++;
+    }
+
+    while(i < tmp.size()) {
         wchar[2] txt;

         // Validate length, add FFFD if invalid.
-        size_t clen = str[i].getLength();
-        if (clen >= i+str.size() || clen == 0) {
+        size_t clen = tmp[i].getLength();
+        if (clen == 0 || i + clen > tmp.size()) {
             code ~= unicodeReplacementCharacter;
             i++;
         }
@@ -128,9 +289,14 @@ unittest {
 /**
     Encodes a unicode sequence to UTF-16
 */
-nwstring encode(UnicodeSlice slice) {
+nwstring encode(UnicodeSlice slice, bool addBOM = false) {
     nwstring out_;

+    // Add BOM if requested.
+    if (addBOM) {
+        out_ ~= cast(wchar)0xFEFF;
+    }
+
     size_t i = 0;
     while(i < slice.length) {
         wchar[2] txt;
diff --git a/source/numem/text/unicode/utf32.d b/source/numem/text/unicode/utf32.d
new file mode 100644
index 0000000..fc521f3
--- /dev/null
+++ b/source/numem/text/unicode/utf32.d
@@ -0,0 +1,123 @@
+/*
+    Copyright © 2024, Inochi2D Project
+    Distributed under the 2-Clause BSD License, see LICENSE file.
+
+    Authors: Luna the Foxgirl
+*/
+
+module numem.text.unicode.utf32;
+import numem.text.unicode;
+import numem.string;
+import numem.io.endian;
+
+@nogc nothrow:
+
+/**
+    Validates a UTF32 codepoint
+*/
+bool validate(dchar c) {
+    import numem.text.unicode : isValidCodepoint = validate;
+    return isValidCodepoint(cast(codepoint)c);
+}
+
+/**
+    Validates a UTF32 string
+*/
+bool validate(ndstring str) {
+    return validate(str[]);
+}
+
+/**
+    Validates a UTF32 string
+*/
+bool validate(inout(dchar)[] str) {
+    ndstring tmp = str;
+
+    // Handle endianess.
+    codepoint bom = getBOM(str);
+    if (bom != 0 && getEndianFromBOM(bom) != NATIVE_ENDIAN) {
+        tmp = toMachineOrder(str);
+    }
+
+    foreach(dchar c; tmp) {
+        if (!validate(c))
+            return false;
+    }
+
+    return true;
+}
+
+/**
+    Gets the BOM
+*/
+codepoint getBOM(inout(dchar)[] str) {
+    if (str.length == 0)
+        return 0;
+
+    // This is UTF32.
+    if (isBOM(str[0]))
+        return str[0];
+
+    return 0;
+}
+
+/**
+    Returns a string which is [str] converted to machine order.
+
+    If the string has no BOM it is assumed it's already in
+    machine order.
+*/
+ndstring toMachineOrder(inout(dchar)[] str) {
+
+    // Empty string early escape.
+    if (str.length == 0)
+        return ndstring.init;
+
+    codepoint bom = getBOM(str);
+    Endianess endian = getEndianFromBOM(bom);
+    if (bom != 0 && endian != NATIVE_ENDIAN) {
+
+        // Flip all the bytes around
+        ndstring tmp;
+        foreach(i, ref const(dchar) c; str) {
+            tmp ~= c.toEndianReinterpret(endian);
+        }
+
+        return tmp;
+    }
+
+    return ndstring(str);
+}
+
+/**
+    Decodes a single UTF-32 character
+*/
+codepoint decode(dchar c) {
+    if (!validate(c))
+        return unicodeReplacementCharacter;
+    return c;
+}
+
+/**
+    Decodes a UTF-32 string
+*/
+UnicodeSequence decode(inout(dchar)[] str) {
+    UnicodeSequence tmp;
+
+    foreach(ref c; str) {
+        tmp ~= decode(c);
+    }
+
+    return tmp;
+}
+
+/**
+    Encodes a UTF-32 string.
+
+    Since UnicodeSequence is already technically
+    UTF-32 this doesn't do much other than
+    throw the data into a ndstring.
+*/
+ndstring encode(UnicodeSequence sequence) {
+    return ndstring(cast(dchar[])sequence[0..$]);
+}
\ No newline at end of file
diff --git a/source/numem/text/unicode/utf8.d b/source/numem/text/unicode/utf8.d
index fadc298..cd55e8d 100644
--- a/source/numem/text/unicode/utf8.d
+++ b/source/numem/text/unicode/utf8.d
@@ -5,13 +5,11 @@
     Authors: Luna the Foxgirl
 */

-module numem.unicode.utf8;
-import numem.unicode;
+module numem.text.unicode.utf8;
+import numem.text.unicode;
 import numem.collections.vector;
 import numem.string;
-
-// For some reason D really wants this import.
-import numem.unicode : validate;
+import numem.text.unicode : validate;

 @nogc nothrow:

@@ -112,16 +110,23 @@ unittest {
 }

 /**
-    Returns whether the specified string is a valid UTF-8 string
+    Returns whether the given nstring is a valid UTF-8 string
 */
 bool validate(nstring str) {
+    return validate(str[]);
+}
+
+/**
+    Returns whether the given slice is a valid UTF-8 string
+*/
+bool validate(inout(char)[] str) {
     size_t i = 0;
-    while(i < str.size) {
+    while(i < str.length) {
         char[4] txt;

         // Validate length
         size_t clen = getLength(str[i]);
-        if (clen >= i+str.size()) return false;
+        if (i + clen > str.length) return false;
         if (clen == 0) return false;

         // Validate sequence
@@ -276,16 +281,16 @@ unittest {
     Decodes a string to a vector of codepoints.
     Invalid codes will be replaced with unicodeReplacementCharacter
 */
-UnicodeSequence decode(nstring str) {
+UnicodeSequence decode(inout(char)[] str) {
     UnicodeSequence code;

     size_t i = 0;
-    while(i < str.size()) {
+    while(i < str.length) {
         char[4] txt;

         // Validate length, add FFFD if invalid.
         size_t clen = str[i].getLength();
-        if (clen >= i+str.size() || clen == 0) {
+        if (clen == 0 || i + clen > str.length) {
             code ~= unicodeReplacementCharacter;
             i++;
         }
@@ -298,6 +303,14 @@ UnicodeSequence decode(nstring str) {
     return code;
 }

+/**
+    Decodes a string to a vector of codepoints.
+    Invalid codes will be replaced with unicodeReplacementCharacter
+*/
+UnicodeSequence decode(nstring str) {
+    return decode(str[]);
+}
+
 @("decode: UTF-8 string")
 unittest {
     import std.stdio : writeln;