From 225a6938197ea6873ea7fc885c3f9350d86ef5ff Mon Sep 17 00:00:00 2001 From: Luna Date: Wed, 9 Oct 2024 18:37:51 +0200 Subject: [PATCH] More UTF handling for streams --- source/numem/io/endian.d | 26 ++++++++++++++++--- source/numem/io/stream/reader.d | 42 +++++++++++++++++++++++-------- source/numem/io/stream/writer.d | 29 ++++++++++++++++++++- source/numem/text/unicode/utf16.d | 10 +++++--- source/numem/text/unicode/utf32.d | 10 +++++--- 5 files changed, 94 insertions(+), 23 deletions(-) diff --git a/source/numem/io/endian.d b/source/numem/io/endian.d index 15f641f..b9c2dd2 100644 --- a/source/numem/io/endian.d +++ b/source/numem/io/endian.d @@ -7,9 +7,9 @@ module numem.io.endian; import numem.core; import numem.collections.vector; +import numem.string; import std.traits : isNumeric, isIntegral, isBasicType; import std.traits : Unqual; -import std.bitmanip; @nogc nothrow: @@ -53,13 +53,33 @@ private { } } +/** + Converts the endianness of a string: when the character type is wider than one byte and [endianess] is not NATIVE_ENDIAN, returns a new string with every element passed through toEndianReinterpret; otherwise returns a basic_string constructed from [str]. +*/ +@trusted +basic_string!(StringCharType!T) toEndian(T)(T str, Endianess endianess) if (isSomeSafeString!T) { + static if (StringCharSize!T > 1) { + if (endianess != NATIVE_ENDIAN) { + + // Build the result by appending each element after toEndianReinterpret has processed it + basic_string!(StringCharType!T) tmp; + foreach(i, ref c; str) { + tmp ~= c.toEndianReinterpret(endianess); + } + + return tmp; + } + } + return basic_string!(StringCharType!T)(str); +} + /** Converts a value to an array of the specified endianness. 
Is no-op if provided endianness is the same as the system's */ @trusted -ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) { +ubyte[T.sizeof] toEndian(T)(T value, Endianess endianess) if (isBasicType!T) { union tmp { Unqual!T value; ubyte[T.sizeof] bytes; @@ -69,7 +89,7 @@ ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) { tmp_.value = value; // Swap endianness if neccesary - if (endianness != NATIVE_ENDIAN) { + if (endianess != NATIVE_ENDIAN) { ubyte[] slice = tmp_.bytes[0..$]; swapEndian(slice); } diff --git a/source/numem/io/stream/reader.d b/source/numem/io/stream/reader.d index 603cc30..9f4fccf 100644 --- a/source/numem/io/stream/reader.d +++ b/source/numem/io/stream/reader.d @@ -53,31 +53,51 @@ public: /// Ditto @trusted - int read(T)(ref T val, size_t length) if (isSomeSafeString!T && StringCharSize!T == 1) { - if (length > val.length) - return -1; + int read(T)(ref T val, size_t length = 0) if (isSomeSafeString!T) { + import numem.text.unicode; - alias type = StringCharType!(T)*; + alias CharType = StringCharType!(T); + + // A length of 0 (the default) means: read val.length elements + if (length == 0) + length = val.length; - // TODO: Handle encoding schemes other than UTF8 + // Refuse to read more elements than val holds; -1 is returned in that case. + if (length > val.length) + return -1; - // Size of a single unit - vector!ubyte tmp = vector!ubyte(length*StringCharSize!T); + // Temporary byte buffer sized for `length` elements of CharType. + vector!ubyte tmp = vector!ubyte(CharType.sizeof*length); // Attempt reading data - int r = cast(int)stream.read(tmp, 0, length); + int r = cast(int)stream.read(tmp, 0, tmp.length); // pass the buffer's byte length rather than the element count if (r < 0) return r; + // View the raw bytes as `length` elements of CharType (pointer cast, no conversion yet). + CharType[] reinterpreted = (cast(CharType*)tmp.ptr)[0..length]; + + static if (CharType.sizeof > 1) { + + // Run the elements through toEndian with the reader's `endian`. NOTE(review): the string toEndian added in numem.io.endian does no BOM detection, and this assignment replaces val as a whole whereas the single-byte branch below fills val[0..length] in place; confirm both are intended. + val = reinterpreted.toEndian(endian); + } else { + + (cast(CharType*)val.ptr)[0..length] = reinterpreted[0..length]; + } - // "Convert" the data via type punning. 
- (cast(type)val.ptr)[0..length] = (cast(type)tmp.ptr)[0..length]; return r; } /// Ditto @trusted - int read(T)(ref T val, size_t length) if (isSomeVector!T) { + int read(T)(ref T val, size_t length = 0) if (isSomeVector!T) { + + // A length of 0 (the default) means: read val.length elements + if (length == 0) + length = val.length; + + // Refuse to read more elements than val holds; -1 is returned in that case. if (length > val.length) return -1; diff --git a/source/numem/io/stream/writer.d b/source/numem/io/stream/writer.d index c84e804..dbafbec 100644 --- a/source/numem/io/stream/writer.d +++ b/source/numem/io/stream/writer.d @@ -46,7 +46,12 @@ public: /// Ditto @trusted void write(T)(T val) if (isSomeSafeString!T) { - stream.write(cast(ubyte[])val[0..$]); + static if (StringCharSize!T > 1) { + auto data = val.toEndian(endian); // multi-byte elements go through toEndian with the writer's `endian` first + stream.write((cast(ubyte*)data.ptr)[0..data.length*StringCharSize!T]); // byte count = element count * element size + } else { + stream.write(cast(ubyte[])val[0..$]); + } } /// Ditto @@ -129,4 +134,26 @@ unittest { ulong val; reader.read!ulong(val); assert(val == MAGIC); +} + +@("RW: UTF-32") +unittest { + import numem.io.stream.memstream : MemoryStream; + import numem.io.stream.reader : StreamReader; + alias TestReader = StreamReader!(Endianess.bigEndian); + alias TestWriter = StreamWriter!(Endianess.bigEndian); + + ubyte[128] buffer; + auto stream = new MemoryStream(buffer.ptr, buffer.length); + auto writer = new TestWriter(stream); + auto reader = new TestReader(stream); + + enum MAGIC = "Hello, world!"d; + ndstring val = ndstring(MAGIC.length); + + writer.write(MAGIC); + stream.seek(0); + + reader.read(val); + assert(val == MAGIC); } \ No newline at end of file diff --git a/source/numem/text/unicode/utf16.d b/source/numem/text/unicode/utf16.d index 07cab1c..332ff18 100644 --- a/source/numem/text/unicode/utf16.d +++ b/source/numem/text/unicode/utf16.d @@ -151,17 +151,19 @@ unittest { /** Returns a string which is [str] converted to machine order. - If the string has no BOM it is assumed it's already in - machine order. 
+ If the string has no BOM, [fallbackEndian] (NATIVE_ENDIAN by default) is taken as its byte order. */ -nwstring toMachineOrder(inout(wchar)[] str) { +nwstring toMachineOrder(inout(wchar)[] str, Endianess fallbackEndian = NATIVE_ENDIAN) { if (str.length == 0) return nwstring.init; codepoint bom = getBOM(str); Endianess endian = getEndianFromBOM(bom); - if (bom != 0 && endian != NATIVE_ENDIAN) { + if (bom == 0) + endian = fallbackEndian; // no BOM: use the caller-supplied byte order instead + + if (endian != NATIVE_ENDIAN) { // Flip all the bytes around. nwstring tmp; diff --git a/source/numem/text/unicode/utf32.d b/source/numem/text/unicode/utf32.d index 1204b3c..6f2ea1e 100644 --- a/source/numem/text/unicode/utf32.d +++ b/source/numem/text/unicode/utf32.d @@ -66,10 +66,9 @@ codepoint getBOM(inout(dchar)[] str) { /** Returns a string which is [str] converted to machine order. - If the string has no BOM it is assumed it's already in - machine order. + If the string has no BOM, [fallbackEndian] (NATIVE_ENDIAN by default) is taken as its byte order. */ -ndstring toMachineOrder(inout(dchar)[] str) { +ndstring toMachineOrder(inout(dchar)[] str, Endianess fallbackEndian = NATIVE_ENDIAN) { // Empty string early escape. if (str.length == 0) @@ -77,7 +76,10 @@ ndstring toMachineOrder(inout(dchar)[] str) { codepoint bom = getBOM(str); Endianess endian = getEndianFromBOM(bom); - if (bom != 0 && endian != NATIVE_ENDIAN) { + if (bom == 0) + endian = fallbackEndian; // no BOM: use the caller-supplied byte order instead + + if (endian != NATIVE_ENDIAN) { // Flip all the bytes around ndstring tmp;