Add UTF-32, BOM and string conversion support

Inochi2D · Oct 9, 2024 · 001e5a6 · 001e5a6
1 parent 5821b1a
commit 001e5a6
Show file tree

Hide file tree

Showing 10 changed files with 591 additions and 45 deletions.
diff --git a/source/numem/conv.d b/source/numem/conv.d
@@ -9,11 +9,9 @@
     Utilities for converting between some basic types
 */
 module numem.conv;
-import numem.all;
 import core.stdc.stdlib;
 import std.traits;
-import numem.core.exception;
-import numem.format;
+import numem.all;
 
 @nogc:
 
@@ -148,6 +146,10 @@ nstring toString(T)(T item) if (__traits(hasMember, T, "toNString")) {
     return item.toNString();
 }
 
+nwstring toUTF16(T)(ref auto T str) if (isSomeString!T) {
+    // NOTE(review): body is empty although a nwstring result is declared; the conversion still has to be implemented.
+}
+
 @("toString")
 unittest {
     assert((32u).toString() == "32");

diff --git a/source/numem/format.d b/source/numem/format.d
@@ -4,27 +4,14 @@ import numem.text.ascii;
 import numem.conv;
 import numem.collections;
 
-import std.traits;
+import std.traits : isBasicType;
 
 private {
-    enum CanConvertToNString(T) =
-        __traits(hasMember, T, "toNString") &&
-        is(T.toNString : nstring function()) &&
-        hasUDA(T.toNString, nogc);
-
-    enum CanConvertToDString(T) =
-        __traits(hasMember, T, "toString") &&
-        is(T.toNString : string function()) &&
-        hasUDA(T.toNString, nogc);
-
     nstring _formatSingle(T)(T element) {
-        static if(CanConvertToNString!T) {
-
-            return element.toNString();
-        } else static if(CanConvertToDString!T) {
+        static if(isStringable!T) {
 
             return nstring(element.toString());
-        } else static if (is(T : string)) {
+        } else static if (isSomeString!T) {
 
             return nstring(element);
         } else static if (isBasicType!T) {

diff --git a/source/numem/io/endian.d b/source/numem/io/endian.d
@@ -56,7 +56,7 @@ private {
 
     Is no-op if provided endianness is the same as the system's
 */
-ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) if (isNumeric!T) {
+ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) {
 
     // Get bytes from value
     ubyte[T.sizeof] output;
@@ -74,6 +74,26 @@ ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) if (isNumeric!T) {
     return output;
 }
 
+/**
+    Returns the provided value with its bytes laid out in the
+    specified endianness, reinterpreted as the same type.
+
+    The value is returned untouched when the requested endianness
+    is NATIVE_ENDIAN.
+*/
+T toEndianReinterpret(T)(T in_, Endianess endianness) {
+
+    // Nothing to do when the requested order is the native one.
+    if (endianness == NATIVE_ENDIAN)
+        return in_;
+
+    // Overlay of a T and its raw bytes, used to read the
+    // converted bytes back as a value of type T.
+    union Overlay {
+        T value;
+        ubyte[T.sizeof] bytes;
+    }
+
+    Overlay converted;
+    converted.bytes = toEndian!T(in_, endianness);
+    return converted.value;
+}
+
 /**
     Gets a value from a different endianness.
 

diff --git a/source/numem/string.d b/source/numem/string.d
@@ -8,23 +8,67 @@ module numem.string;
 import numem.collections.vector;
 import numem.core;
 import std.string;
+import std.traits;
+import core.stdcpp.string;
+
+/// Gets whether the provided type is any kind of string:
+/// either a length denoted ("safe") string or a C string.
+template isSomeString(T) {
+    enum bool isSomeString = isSomeSafeString!T || isSomeCString!T;
+}
+
+/**
+    Gets whether the provided type is a string that carries its
+    own length and is therefore "safe": either some nstring or
+    some D string slice.
+*/
+template isSomeSafeString(T) {
+    enum bool isSomeSafeString = isSomeNString!T || isSomeDString!T;
+}
+
 
 /// Gets whether the provided type is some type of nstring.
 enum isSomeNString(T) = 
-    is(T == nstring) || is (T == nwstring) || is(T == ndstring);
+    is(inout(T) == inout(basic_string!C), C) && isSomeChar!C;
 
 /// Gets whether the provided type is some type of null terminated C string.
 enum isSomeCString(T) =
-    is(T : inout(char)*) || is(T : inout(wchar)*)|| is(T : inout(dchar)*);
+    is(inout(T) == inout(C)*, C) && isSomeChar!C;
 
 /// Gets whether the provided type is some type of D string slice.
 enum isSomeDString(T) =
-    is(T : inout(char)[]) || is(T : inout(wchar)[])|| is(T : inout(dchar)[]);
+    is(immutable(T) == immutable(C[]), C) && isSomeChar!C;
 
 /// Gets whether the provided type is a character
 enum isSomeChar(T) =
     is(T == char) || is(T == wchar) || is(T == dchar);
 
+/**
+    Gets whether [T] is convertible to any form of [nstring].
+
+    This is the case when [T] has a `toString` member which
+    returns some string type and is marked `@nogc`.
+*/
+template isStringable(T) {
+    // Only inspect T.toString once it is known to exist; types without
+    // the member (e.g. basic types) must evaluate to false, not fail to compile.
+    static if (__traits(hasMember, T, "toString")) {
+        // @nogc is a function attribute rather than a UDA, so it is
+        // queried through functionAttributes.
+        enum bool isStringable =
+            isSomeString!(ReturnType!(T.toString)) &&
+            (functionAttributes!(T.toString) & FunctionAttribute.nogc) != 0;
+    } else {
+        enum bool isStringable = false;
+    }
+}
+
+/**
+    Gets the size in bytes of a single element of a string-ish type,
+    as given by the `sizeof` of its [StringCharType].
+*/
+template StringCharSize(T) {
+    enum StringCharSize = StringCharType!T.sizeof;
+}
+
+/**
+    Gets the element type of a string-ish type.
+
+    Resolves to `void` when [T] is not a string type.
+*/
+template StringCharType(T) {
+    static if (!isSomeString!T) {
+        // Not a string at all.
+        alias StringCharType = void;
+    } else static if (isSomeNString!T) {
+        // nstring variants expose their element type directly.
+        alias StringCharType = T.valueType;
+    } else {
+        // Slices and pointers: take the type of an indexed element.
+        alias StringCharType = typeof(T.init[0].init);
+    }
+}
+
 /**
     Basic string type.
 
@@ -98,6 +142,15 @@ public:
         this.set_(text);
     }
 
+    /**
+        Creates a string from a length denoted string with a
+        different encoding.
+
+        The source is decoded into a unicode sequence, which is
+        then encoded as `selfType` and assigned to this string.
+    */
+    this(T)(ref auto T rhs) if (isSomeSafeString!T) {
+        import numem.text.unicode : decode, encode;
+        this = encode!selfType(decode!T(rhs));
+    }
+
     /**
         Makes a copy of a string
     */
@@ -521,4 +574,12 @@ unittest {
     assert(struct_[1].str == "b");
 
     assert(copy.size() == 0);
+}
+
+@("string: encoding-conversion")
+unittest {
+    nwstring wstr = "Hello, world!"w;
+    nstring str = wstr;
+
+    assert(str == "Hello, world!");
 }
diff --git a/source/numem/text/encoding.d b/source/numem/text/encoding.d
@@ -0,0 +1,101 @@
+module numem.text.encoding;
+import numem.string;
+import numem.text.ascii;
+import numem.text.unicode;
+import numem.text.unicode.utf8;
+import numem.text.unicode.utf16;
+
+/**
+    Currently supported encodings
+*/
+enum Encoding {
+
+    /**
+        Unknown encoding
+    */
+    unknown,
+
+    /**
+        ASCII
+    */
+    ascii,
+
+    /**
+        UTF-8
+    */
+    utf8,
+
+    /**
+        UTF-16, no BOM
+    */
+    utf16,
+
+    /**
+        UTF-16 w/ little endian BOM
+    */
+    utf16LE,
+
+    /**
+        UTF-16 w/ big endian BOM
+    */
+    utf16BE,
+
+    /**
+        UTF-32
+    */
+    utf32
+}
+
+/**
+    Determines which of the supported encodings a run of text uses,
+    based on the size of its code units and on its contents.
+*/
+Encoding getEncoding(T)(auto ref T str) @nogc if (isSomeString!T) {
+    static if (StringCharSize!T == 1) {
+        nstring nstr = str;
+
+        foreach(char c; str[]) {
+            if (isASCII(c))
+                continue;
+
+            // First non-ASCII unit found: the text is UTF-8 only if it validates.
+            return validate(nstr) ? Encoding.utf8 : Encoding.unknown;
+        }
+
+        // Every unit was ASCII.
+        return Encoding.ascii;
+
+    } else static if (StringCharSize!T == 2) {
+        nwstring nstr = str;
+        auto bom = getBOM(nstr);
+
+        // No BOM reported: plain UTF-16 if the text validates.
+        if (bom == 0)
+            return validate(nstr) ? Encoding.utf16 : Encoding.unknown;
+
+        // A BOM was reported: its value selects the byte order.
+        if (bom == 0x0000FEFF)
+            return Encoding.utf16BE;
+        return Encoding.utf16LE;
+
+    } else static if (StringCharSize!T == 4) {
+        if (validate(str))
+            return Encoding.utf32;
+        return Encoding.unknown;
+
+    } else {
+        return Encoding.unknown;
+    }
+}
+
+@("Get encoding")
+unittest {
+
+    // 8-bit text: pure ASCII and multi-byte UTF-8.
+    assert("Hello, world!".getEncoding() == Encoding.ascii);
+    assert("あえおう".getEncoding() == Encoding.utf8);
+
+    // 16-bit text: without and with a leading byte order mark.
+    assert("Hello, world!"w.getEncoding() == Encoding.utf16);
+    assert("\uFEFFHello, world!"w.getEncoding() == Encoding.utf16BE);
+    assert("\uFFFEHello, world!"w.getEncoding() == Encoding.utf16LE);
+}
diff --git a/source/numem/text/package.d b/source/numem/text/package.d
@@ -8,4 +8,7 @@
 /**
     Numem text transformation utilities
 */
-module numem.text;
+module numem.text;
+
+public import numem.text.encoding;
+public import numem.text.ascii;
diff --git a/source/numem/text/unicode/package.d b/source/numem/text/unicode/package.d
@@ -5,8 +5,19 @@
     Authors: Luna the Foxgirl
 */
 
-module numem.unicode;
+module numem.text.unicode;
 import numem.collections.vector;
+import numem.io.endian;
+import numem.string;
+
+public import numem.text.unicode.utf8;
+public import numem.text.unicode.utf16;
+public import numem.text.unicode.utf32;
+
+// For encoding dispatch
+import utf8 = numem.text.unicode.utf8;
+import utf16 = numem.text.unicode.utf16;
+import utf32 = numem.text.unicode.utf32;
 
 @nogc nothrow:
 
@@ -29,6 +40,66 @@ bool hasSurrogatePairs(codepoint code) {
     return (code >= 0x0000D800 && code <= 0x0000DFFF);
 }
 
+/**
+    Gets whether the character is a byte order mark,
+    be it a little endian or a big endian one.
+*/
+bool isBOM(codepoint c) {
+    if (isLittleEndianBOM(c))
+        return true;
+
+    return isBigEndianBOM(c);
+}
+
+/**
+    Gets whether the byte order mark is little endian,
+    i.e. whether it equals 0xFFFE0000 or 0x0000FFFE.
+*/
+pragma(inline, true)
+bool isLittleEndianBOM(codepoint c) {
+    const bool inHighHalf = (c == 0xFFFE0000);
+    const bool inLowHalf = (c == 0x0000FFFE);
+    return inHighHalf || inLowHalf;
+}
+
+/**
+    Gets whether the byte order mark is big endian,
+    i.e. whether it equals 0xFEFF0000 or 0x0000FEFF.
+*/
+pragma(inline, true)
+bool isBigEndianBOM(codepoint c) {
+    const bool inHighHalf = (c == 0xFEFF0000);
+    const bool inLowHalf = (c == 0x0000FEFF);
+    return inHighHalf || inLowHalf;
+}
+
+/**
+    Gets the endianess from a BOM.
+
+    Any value which is not a big endian BOM is reported
+    as little endian.
+*/
+Endianess getEndianFromBOM(codepoint c) {
+    if (isBigEndianBOM(c))
+        return Endianess.bigEndian;
+
+    return Endianess.littleEndian;
+}
+
+/**
+    Decodes a length denoted string into a UnicodeSequence.
+
+    The decoder is picked at compile time from the size of the
+    string's element type: 1 byte uses utf8, 2 bytes utf16 and
+    4 bytes utf32.
+*/
+UnicodeSequence decode(T)(ref auto T str) if (isSomeSafeString!T) {
+    // The branches form a single else-chain so that exactly one of them
+    // ends up in the instantiated function body.
+    static if (StringCharSize!T == 1)
+        return utf8.decode(str);
+    else static if (StringCharSize!T == 2)
+        return utf16.decode(str);
+    else static if (StringCharSize!T == 4)
+        return utf32.decode(str);
+    else
+        assert(0, "String type not supported.");
+}
+
+/**
+    Encodes a UnicodeSequence into the nstring type [T].
+
+    The encoder is picked at compile time from the size of the
+    target's element type: 1 byte uses utf8, 2 bytes utf16 and
+    4 bytes utf32.
+*/
+T encode(T)(ref auto UnicodeSequence seq) if (isSomeNString!T) {
+    // The branches form a single else-chain so that exactly one of them
+    // ends up in the instantiated function body.
+    static if (StringCharSize!T == 1)
+        return utf8.encode(seq);
+    else static if (StringCharSize!T == 2)
+        return utf16.encode(seq);
+    else static if (StringCharSize!T == 4)
+        return utf32.encode(seq);
+    else
+        assert(0, "String type not supported.");
+}
+
 /**
     Validates whether the codepoint is within spec
 */