Skip to content

Commit

Permalink
Add UTF-32, BOM and string conversion support
Browse files Browse the repository at this point in the history
  • Loading branch information
LunaTheFoxgirl committed Oct 9, 2024
1 parent 5821b1a commit 001e5a6
Show file tree
Hide file tree
Showing 10 changed files with 591 additions and 45 deletions.
8 changes: 5 additions & 3 deletions source/numem/conv.d
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@
Utilities for converting between some basic types
*/
module numem.conv;
import numem.all;
import core.stdc.stdlib;
import std.traits;
import numem.core.exception;
import numem.format;
import numem.all;

@nogc:

Expand Down Expand Up @@ -148,6 +146,10 @@ nstring toString(T)(T item) if (__traits(hasMember, T, "toNString")) {
return item.toNString();
}

/**
    Converts a string to a UTF-16 encoded nwstring.

    The input is decoded with the generic `decode` from numem.text.unicode
    and re-encoded with `encode!nwstring`.

    Params:
        str = The string to convert.

    Returns:
        The contents of `str` re-encoded as a nwstring.

    Note:
        Only length-denoted strings (nstrings and D slices) are handled;
        null-terminated C strings are rejected at compile time because the
        generic decoder is constrained to isSomeSafeString.
*/
nwstring toUTF16(T)(ref auto T str) if (isSomeString!T) {
    static if (isSomeSafeString!T) {
        import numem.text.unicode : decode, encode;

        return encode!nwstring(decode(str));
    } else {
        static assert(0, "toUTF16 does not support null-terminated C strings.");
    }
}

@("toString")
unittest {
assert((32u).toString() == "32");
Expand Down
19 changes: 3 additions & 16 deletions source/numem/format.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,14 @@ import numem.text.ascii;
import numem.conv;
import numem.collections;

import std.traits;
import std.traits : isBasicType;

private {
enum CanConvertToNString(T) =
__traits(hasMember, T, "toNString") &&
is(T.toNString : nstring function()) &&
hasUDA(T.toNString, nogc);

enum CanConvertToDString(T) =
__traits(hasMember, T, "toString") &&
is(T.toNString : string function()) &&
hasUDA(T.toNString, nogc);

nstring _formatSingle(T)(T element) {
static if(CanConvertToNString!T) {

return element.toNString();
} else static if(CanConvertToDString!T) {
static if(isStringable!T) {

return nstring(element.toString());
} else static if (is(T : string)) {
} else static if (isSomeString!T) {

return nstring(element);
} else static if (isBasicType!T) {
Expand Down
22 changes: 21 additions & 1 deletion source/numem/io/endian.d
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ private {
Is no-op if provided endianness is the same as the system's
*/
ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) if (isNumeric!T) {
ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) {

// Get bytes from value
ubyte[T.sizeof] output;
Expand All @@ -74,6 +74,26 @@ ubyte[T.sizeof] toEndian(T)(T value, Endianess endianness) if (isNumeric!T) {
return output;
}

/**
    Returns the provided value with its bytes arranged for the specified
    endianness, reinterpreted as the same type.

    When the requested endianness equals NATIVE_ENDIAN the value is
    returned untouched.
*/
T toEndianReinterpret(T)(T in_, Endianess endianness) {

    // Nothing to do when the target byte order is the native one.
    if (endianness == NATIVE_ENDIAN)
        return in_;

    // Overlay the value with its raw bytes so the converted byte array
    // can be read back as a T.
    union Overlay {
        T value;
        ubyte[T.sizeof] bytes;
    }

    Overlay view;
    view.bytes = toEndian!T(in_, endianness);
    return view.value;
}

/**
Gets a value from a different endianness.
Expand Down
67 changes: 64 additions & 3 deletions source/numem/string.d
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,67 @@ module numem.string;
import numem.collections.vector;
import numem.core;
import std.string;
import std.traits;
import core.stdcpp.string;

/**
    Evaluates to true when T is any kind of string: either a
    length-denoted ("safe") string or a null-terminated C string.
*/
template isSomeString(T) {
    enum isSomeString = isSomeSafeString!T || isSomeCString!T;
}

/**
    Evaluates to true when T is a length-denoted (and therefore "safe")
    string type, i.e. some nstring or some D string slice.
*/
template isSomeSafeString(T) {
    enum isSomeSafeString = isSomeNString!T || isSomeDString!T;
}


/// Gets whether the provided type is some type of nstring.
enum isSomeNString(T) =
is(T == nstring) || is (T == nwstring) || is(T == ndstring);
is(inout(T) == inout(basic_string!C), C) && isSomeChar!C;

/// Gets whether the provided type is some type of null terminated C string.
enum isSomeCString(T) =
is(T : inout(char)*) || is(T : inout(wchar)*)|| is(T : inout(dchar)*);
is(inout(T) == inout(C)*, C) && isSomeChar!C;

/// Gets whether the provided type is some type of D string slice.
enum isSomeDString(T) =
is(T : inout(char)[]) || is(T : inout(wchar)[])|| is(T : inout(dchar)[]);
is(immutable(T) == immutable(C[]), C) && isSomeChar!C;

/**
    Evaluates to true when T is exactly one of the character types
    char, wchar or dchar.
*/
template isSomeChar(T) {
    enum isSomeChar =
        is(T == char) ||
        is(T == wchar) ||
        is(T == dchar);
}

/**
    Gets whether [T] is convertible to any form of [nstring].

    A type qualifies when it has a `toString` member which returns some
    string type and which is marked `@nogc`.

    The member lookup is guarded by a `static if` so that types without a
    `toString` member evaluate to `false` instead of failing to compile,
    and `@nogc` is queried as a function attribute (it is not a
    user-defined attribute, so `hasUDA` cannot detect it).
*/
template isStringable(T) {
    static if (__traits(hasMember, T, "toString")) {
        enum isStringable =
            isSomeString!(ReturnType!(T.toString)) &&
            hasFunctionAttributes!(T.toString, "@nogc");
    } else {
        enum isStringable = false;
    }
}

/**
    Size, in bytes, of a single element of the string-ish type T.
*/
template StringCharSize(T) {
    enum StringCharSize = StringCharType!T.sizeof;
}

/**
    Resolves to the element type of a string-ish type.

    nstrings report their `valueType`; other string types report the type
    obtained by indexing them. Anything that is not a string resolves
    to `void`.
*/
template StringCharType(T) {
    static if (!isSomeString!T) {

        // Not a string at all.
        alias StringCharType = void;
    } else static if (isSomeNString!T) {

        alias StringCharType = T.valueType;
    } else {

        alias StringCharType = typeof(T.init[0].init);
    }
}

/**
Basic string type.
Expand Down Expand Up @@ -98,6 +142,15 @@ public:
this.set_(text);
}

/**
Creates a string from a string with a different
encoding.

The source string is decoded with `decode` from numem.text.unicode
and the result is re-encoded to this string's own type (`selfType`)
with `encode`, then assigned to this instance.

Params:
    rhs = Length-denoted string (some nstring or D string slice)
          to convert from.
*/
this(T)(ref auto T rhs) if (isSomeSafeString!T) {
import numem.text.unicode : decode, encode;
this = encode!selfType(decode!T(rhs));
}

/**
Makes a copy of a string
*/
Expand Down Expand Up @@ -521,4 +574,12 @@ unittest {
assert(struct_[1].str == "b");

assert(copy.size() == 0);
}

// Initializing an nstring from an nwstring must yield the same text.
@("string: encoding-conversion")
unittest {
nwstring wstr = "Hello, world!"w;
nstring str = wstr;

assert(str == "Hello, world!");
}
101 changes: 101 additions & 0 deletions source/numem/text/encoding.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
module numem.text.encoding;
import numem.string;
import numem.text.ascii;
import numem.text.unicode;
import numem.text.unicode.utf8;
import numem.text.unicode.utf16;

/**
Currently supported encodings
*/
enum Encoding {

/**
Unknown encoding; reported when the text fails validation
or has an unsupported character size.
*/
unknown,

/**
ASCII; every character of a single-byte string is ASCII.
*/
ascii,

/**
UTF-8
*/
utf8,

/**
UTF-16 without a byte order mark.
*/
utf16,

/**
UTF-16 w/ BOM, little endian
*/
utf16LE,

/**
UTF-16 w/ BOM, big endian
*/
utf16BE,

/**
UTF-32
*/
utf32
}

/**
    Gets the encoding of a run of text.

    The check performed depends on the character size of the string:

    $(UL
        $(LI 1 byte: `ascii` when every character is ASCII, otherwise
             `utf8` when the text validates and `unknown` when it does not.)
        $(LI 2 bytes: `utf16BE` or `utf16LE` when a byte order mark is
             found, otherwise `utf16` when the text validates.)
        $(LI 4 bytes: `utf32` when the text validates.)
    )

    Params:
        str = The text to inspect.

    Returns:
        The detected encoding, or `Encoding.unknown` when none matched.
*/
Encoding getEncoding(T)(auto ref T str) @nogc if (isSomeString!T) {
    static if (StringCharSize!T == 1) {

        foreach(char c; str[]) {
            if (!isASCII(c)) {

                // Only build the nstring copy once a non-ASCII character
                // is found; pure ASCII input never needs validation.
                nstring nstr = str;
                return validate(nstr) ?
                    Encoding.utf8 :
                    Encoding.unknown;
            }
        }
        return Encoding.ascii;

    } else static if (StringCharSize!T == 2) {

        nwstring nstr = str;

        // A non-zero result is treated as "a BOM was found".
        auto bom = getBOM(nstr);
        if (bom != 0) {
            return bom == 0x0000FEFF ?
                Encoding.utf16BE :
                Encoding.utf16LE;
        } else if (validate(nstr)) {

            return Encoding.utf16;
        }
        return Encoding.unknown;

    } else static if (StringCharSize!T == 4) {

        return validate(str) ?
            Encoding.utf32 :
            Encoding.unknown;
    } else {

        return Encoding.unknown;
    }
}

// Verifies encoding detection for single-byte and UTF-16 strings,
// with and without a byte order mark.
@("Get encoding")
unittest {

    // Single-byte strings
    assert("Hello, world!".getEncoding() == Encoding.ascii);
    assert("あえおう".getEncoding() == Encoding.utf8);

    // UTF-16 strings, without and with a BOM
    assert("Hello, world!"w.getEncoding() == Encoding.utf16);
    assert("\uFEFFHello, world!"w.getEncoding() == Encoding.utf16BE);
    assert("\uFFFEHello, world!"w.getEncoding() == Encoding.utf16LE);
}
5 changes: 4 additions & 1 deletion source/numem/text/package.d
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,7 @@
/**
Numem text transformation utilities
*/
module numem.text;
module numem.text;

public import numem.text.encoding;
public import numem.text.ascii;
73 changes: 72 additions & 1 deletion source/numem/text/unicode/package.d
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,19 @@
Authors: Luna the Foxgirl
*/

module numem.unicode;
module numem.text.unicode;
import numem.collections.vector;
import numem.io.endian;
import numem.string;

public import numem.text.unicode.utf8;
public import numem.text.unicode.utf16;
public import numem.text.unicode.utf32;

// For encoding dispatch
import utf8 = numem.text.unicode.utf8;
import utf16 = numem.text.unicode.utf16;
import utf32 = numem.text.unicode.utf32;

@nogc nothrow:

Expand All @@ -29,6 +40,66 @@ bool hasSurrogatePairs(codepoint code) {
return (code >= 0x0000D800 && code <= 0x0000DFFF);
}

/**
    Gets whether the codepoint is a byte order mark, in either of the
    two recognised byte orders.
*/
bool isBOM(codepoint c) {
    if (isLittleEndianBOM(c))
        return true;

    return isBigEndianBOM(c);
}

/**
Gets whether the byte order mark is little endian

Returns:
    true when c is exactly 0xFFFE0000 or 0x0000FFFE.
*/
pragma(inline, true)
bool isLittleEndianBOM(codepoint c) {
return (c == 0xFFFE0000 || c == 0x0000FFFE);
}

/**
Gets whether the byte order mark is big endian

Returns:
    true when c is exactly 0xFEFF0000 or 0x0000FEFF.
*/
pragma(inline, true)
bool isBigEndianBOM(codepoint c) {
return (c == 0xFEFF0000 || c == 0x0000FEFF);
}

/**
    Gets the endianess indicated by a byte order mark.

    Any value that is not a big endian BOM is reported as little endian.
*/
Endianess getEndianFromBOM(codepoint c) {
    if (isBigEndianBOM(c))
        return Endianess.bigEndian;

    return Endianess.littleEndian;
}

/**
    Decodes a string.

    Dispatches on the character size of T to the decoder in the matching
    module: 1 byte to utf8, 2 bytes to utf16 and 4 bytes to utf32.

    Params:
        str = The length-denoted string to decode.

    Returns:
        The UnicodeSequence produced by the selected decoder.
*/
UnicodeSequence decode(T)(ref auto T str) if (isSomeSafeString!T) {

    // The branches are chained with `else` so that exactly one of them is
    // compiled in; unsupported sizes are rejected at compile time.
    static if (StringCharSize!T == 1)
        return utf8.decode(str);
    else static if (StringCharSize!T == 2)
        return utf16.decode(str);
    else static if (StringCharSize!T == 4)
        return utf32.decode(str);
    else
        static assert(0, "String type not supported.");
}

/**
    Encodes a string.

    Dispatches on the character size of the requested nstring type T to
    the encoder in the matching module: 1 byte to utf8, 2 bytes to utf16
    and 4 bytes to utf32.

    Params:
        seq = The unicode sequence to encode.

    Returns:
        The string produced by the selected encoder.
*/
T encode(T)(ref auto UnicodeSequence seq) if (isSomeNString!T) {

    // The branches are chained with `else` so that exactly one of them is
    // compiled in; unsupported sizes are rejected at compile time.
    static if (StringCharSize!T == 1)
        return utf8.encode(seq);
    else static if (StringCharSize!T == 2)
        return utf16.encode(seq);
    else static if (StringCharSize!T == 4)
        return utf32.encode(seq);
    else
        static assert(0, "String type not supported.");
}

/**
Validates whether the codepoint is within spec
*/
Expand Down
Loading

0 comments on commit 001e5a6

Please sign in to comment.