-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a047fdb
commit bfa754d
Showing
1 changed file
with
166 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
module numem.unicode.utf8; | ||
import numem.unicode; | ||
import numem.mem.string; | ||
|
||
private {

    /// 0xFF shifted right by `offset` bits.
    template utf8_datamask(uint offset) {
        enum utf8_datamask = 0xFF >> offset;
    }

    /// Bitwise complement of `utf8_datamask!offset`.
    template utf8_leadmask(uint offset) {
        enum utf8_leadmask = ~utf8_datamask!offset;
    }

    /// Bit layout of one kind of UTF-8 byte.
    struct utf8_t {

        /// Bits of the byte which are set in the `mask` column of the table below.
        ubyte mask;

        /// Fixed marker pattern found in the remaining (high) bits.
        ubyte lead;

        /// Number of bits set in `mask`.
        int bits;
    }

    // Byte layouts; entry 0 describes a continuation byte, entries 1 to 4
    // describe the lead byte of a sequence of that many bytes.
    const utf8_t[5] utf8_lookup = [
        utf8_t(0b0011_1111, 0b1000_0000, 6), // [0] continuation byte  10xxxxxx
        utf8_t(0b0111_1111, 0b0000_0000, 7), // [1] lead, 1 byte       0xxxxxxx
        utf8_t(0b0001_1111, 0b1100_0000, 5), // [2] lead, 2 bytes      110xxxxx
        utf8_t(0b0000_1111, 0b1110_0000, 4), // [3] lead, 3 bytes      1110xxxx
        utf8_t(0b0000_0111, 0b1111_0000, 3), // [4] lead, 4 bytes      11110xxx
    ];

    // UTF-8 Well-Formed Byte Sequence Table
    // A translation of Table 3-7 in the unicode conformance documents.
    //
    // Each row holds an inclusive [low, high] range for byte 1 to byte 4.
    // Positions past the end of a sequence are padded with [0x00, 0xFF].
    const ubyte[2][4][9] utf8_wfbseqtable = [

        // 1 byte
        [[0x00, 0x7F], [0x00, 0xFF], [0x00, 0xFF], [0x00, 0xFF]],

        // 2 bytes
        [[0xC2, 0xDF], [0x80, 0xBF], [0x00, 0xFF], [0x00, 0xFF]],

        // 3 bytes
        [[0xE0, 0xE0], [0xA0, 0xBF], [0x80, 0xBF], [0x00, 0xFF]],
        [[0xE1, 0xEC], [0x80, 0xBF], [0x80, 0xBF], [0x00, 0xFF]],
        [[0xED, 0xED], [0x80, 0x9F], [0x80, 0xBF], [0x00, 0xFF]],
        [[0xEE, 0xEF], [0x80, 0xBF], [0x80, 0xBF], [0x00, 0xFF]],

        // 4 bytes
        [[0xF0, 0xF0], [0x90, 0xBF], [0x80, 0xBF], [0x80, 0xBF]],
        [[0xF1, 0xF3], [0x80, 0xBF], [0x80, 0xBF], [0x80, 0xBF]],
        [[0xF4, 0xF4], [0x80, 0x8F], [0x80, 0xBF], [0x80, 0xBF]],
    ];
}
|
||
/**
    Validates a utf-8 character sequence.

    The length of the sequence is taken from its first byte; only that
    many bytes of `seq` are inspected, the rest is ignored.

    Params:
        seq = Buffer holding one encoded character, starting at index 0.

    Returns:
        true if the inspected bytes match a row of the well-formed
        byte sequence table, false otherwise (including when the first
        byte is not a usable lead byte).
*/
bool validate(const(char)[4] seq) {

    // Byte count announced by the lead byte; 0 for a malformed lead.
    const size_t byteCount = getLength(seq[0]);

    // Pick the rows of the table which describe sequences of this
    // length, as the half-open range [firstRow, endRow).
    size_t firstRow, endRow;
    switch(byteCount) {
        case 1:  firstRow = 0; endRow = 1; break;
        case 2:  firstRow = 1; endRow = 2; break;
        case 3:  firstRow = 2; endRow = 6; break;
        case 4:  firstRow = 6; endRow = 9; break;
        default: return false;
    }

    // The sequence is accepted as soon as one row contains every byte.
    rows: foreach(row; firstRow..endRow) {
        foreach(idx; 0..byteCount) {
            const bool outside =
                seq[idx] < utf8_wfbseqtable[row][idx][0] ||
                seq[idx] > utf8_wfbseqtable[row][idx][1];
            if (outside) continue rows;
        }
        return true;
    }

    return false;
}
|
||
@("UTF-8 byte seq validation")
unittest {

    // Accepted: a one byte sequence and a four byte sequence led by 0xF4.
    assert(validate([0x24, 0x00, 0x00, 0x00]) == true);
    assert(validate([0xF4, 0x80, 0x83, 0x92]) == true);

    // Rejected: 0xC0 is below the lowest two byte lead (0xC2), and
    // 0xE0 requires a second byte of at least 0xA0.
    assert(validate([0xC0, 0xAF, 0x00, 0x00]) == false);
    assert(validate([0xE0, 0x9F, 0x80, 0x00]) == false);
}
|
||
/**
    Returns whether the specified string is a valid UTF-8 string

    The string is walked one encoded character at a time. For each
    character the length is read from its lead byte, the sequence is
    checked to fit inside the remaining bytes of the string and is
    then validated as a single sequence.

    Params:
        str = The string to validate.

    Returns:
        true if every sequence in the string is complete and
        well-formed, false otherwise. A string without any bytes
        is reported as valid.
*/
bool validate(nstring str) {
    const size_t total = str.size();

    size_t i = 0;
    while(i < total) {

        // Validate length; 0 means this byte can not start a sequence.
        const size_t clen = getLength(str[i]);
        if (clen == 0) return false;

        // The whole sequence has to fit in what is left of the string,
        // otherwise it was cut off and the slice below would run past
        // the end. (i < total holds here, so total - i can not wrap.)
        if (clen > total - i) return false;

        // Validate sequence; only the first clen bytes of the buffer
        // are inspected by the sequence overload.
        char[4] txt;
        txt[0..clen] = str[i..i+clen];
        if (!validate(txt)) return false;

        // iteration
        i += clen;
    }

    return true;
}

@("UTF-8 string validation")
unittest {

    assert( validate(nstring("Hello, world!")));
    assert( validate(nstring("こんにちは世界!")));

    // A string consisting of a single one byte character is valid.
    assert( validate(nstring("a")));

    // Invalid sequence test
    assert(!validate(nstring([0xC1, 0xBF, 0xCC])));
    assert(!validate(nstring([0xF4, 0x9F, 0xBF, 0xBF])));
    assert(!validate(nstring([0xF4, 0x80]))); // Sequence is cut off

    // Sequence is cut off after a run of valid characters.
    assert(!validate(nstring([0x41, 0x41, 0x41, 0x41, 0x41, 0xF4])));
    assert(!validate(nstring([0x41, 0x41, 0x41, 0xE3, 0x81])));
}
|
||
/**
    Gets the expected byte-size of the specified character
    Returns 0 on malformed leading byte

    A byte is matched against the lead byte layouts in utf8_lookup
    (entries 1 to 4). The bits outside of an entry's payload mask must
    equal the entry's marker pattern, so the full marker, including
    its terminating zero bit, is compared.

    Params:
        c = First byte of an encoded character.

    Returns:
        1 to 4 for a byte of the form 0xxxxxxx, 110xxxxx, 1110xxxx or
        11110xxx respectively; 0 for anything else, which covers
        continuation bytes (10xxxxxx) and the bytes 0xF8 to 0xFF.
*/
size_t getLength(char c) {
    static foreach_reverse(i; 1..utf8_lookup.length) {

        // Everything that is not payload has to match the marker exactly.
        if ((c & cast(ubyte)(~utf8_lookup[i].mask)) == utf8_lookup[i].lead) {
            return i;
        }
    }

    // Malformed leading byte
    return 0;
}

@("UTF-8 char len")
unittest {
    assert('a'.getLength == 1);
    assert((0b11110000).getLength == 4);
    assert((0xC0).getLength() == 2);
    assert((0xE2).getLength() == 3);
    assert((0b10010101).getLength() == 0); // Malformed leading byte

    // 11111xxx is not a lead byte of any sequence length.
    assert((0xF8).getLength() == 0);
    assert((0xFF).getLength() == 0);
}