diff --git a/config/libtm.gyp b/config/libtm.gyp index eae4061c..523b963a 100644 --- a/config/libtm.gyp +++ b/config/libtm.gyp @@ -502,7 +502,9 @@ '<(runtime_path)/tm_random.c', '<(runtime_path)/tm_deflate.c', '<(runtime_path)/tm_str.c', - '<(runtime_path)/tm_utf8.c' + '<(runtime_path)/tm_utf7.c', + '<(runtime_path)/tm_utf8.c', + '<(runtime_path)/tm_utf16.c', ], "include_dirs": [ '<(runtime_path)/', diff --git a/src/colony/lua/colony-node.lua b/src/colony/lua/colony-node.lua index 1274d9a4..a5559240 100644 --- a/src/colony/lua/colony-node.lua +++ b/src/colony/lua/colony-node.lua @@ -186,7 +186,7 @@ function from_base64(to_decode) local char = string.sub(to_decode, i, i) local offset, _ = string.find(index_table, char) if offset == nil then - error(js_new(global.Error, "Invalid character '" .. char .. "' found.")) + return '' end bit_pattern = bit_pattern .. string.sub(to_binary(offset-1), 3) @@ -298,32 +298,24 @@ local buffer_proto = js_obj({ if encoding == nil then encoding = 'utf8' end - encoding = string.lower(encoding); + encoding = string.lower(encoding) - local buf = tm.buffer_tobytestring(getmetatable(this).buffer, offset, endOffset); - + local buf = tm.buffer_tobytestring(getmetatable(this).buffer, offset, endOffset) if encoding == 'binary' then - return string.gsub(buf, '[\128-\255]', function (c) - -- original value must be converted to internal encoding - return global.String.fromCharCode(nil, string.byte(c)) - end) + return tm.str_from_binary(buf); elseif encoding == 'ascii' then - -- simply strips high bit from original value - return string.gsub(buf, '[\128-\255]', function (c) - return string.char(string.byte(c) - 128) - end) + return tm.str_from_ascii(buf); elseif encoding == 'utf8' or encoding == 'utf-8' then return tm.str_from_utf8(buf); + elseif encoding == 'ucs2' or encoding == 'ucs-2' or encoding == 'utf16le' or encoding == 'utf-16le' then + return tm.str_from_utf16le(buf); elseif encoding == 'base64' then - return to_base64(buf); + return tm.str_from_binary(to_base64(buf)); elseif encoding == 'hex' then local str = string.gsub(buf, '(.)', function (c) return string.format('%02x', string.byte(c)); end) return str; - elseif encoding == 'ucs2' or encoding == 'ucs-2' - or encoding == 'utf16le' or encoding == 'utf-16le' then - return error(js_new(global.NotImplementedError, 'Encoding not implemented yet: ' + encoding)); else error(js_new(global.TypeError, 'Unknown encoding: ' + encoding)); end @@ -493,56 +485,62 @@ function _of_buffer (this, buf, length) end local function Buffer (this, arg, encoding) - -- args - local str, length = '', 0 - if type(arg) == 'number' then - length = tonumber(arg) - elseif type(arg) == 'string' then - str = arg - length = #arg - else - str = arg or '' - length = arg and arg.length or 0 + if encoding == nil then + encoding = 'utf8' end - - -- encoding first check - if type(str) == 'string' and encoding == 'base64' then - -- "base64" string - str = from_base64(str) - length = string.len(str) - elseif type(str) == 'string' and encoding == 'hex' then - if string.len(str) % 2 ~= 0 then + encoding = string.lower(encoding) + + local raw, arr, hex, size + if type(arg) == 'number' then + size = arg + elseif type(arg) ~= 'string' then + -- assume an array + arr = arg + elseif encoding == 'binary' then + raw = tm.str_to_binary(arg) + elseif encoding == 'ascii' then + raw = tm.str_to_ascii(arg) + elseif encoding == 'utf8' or encoding == 'utf-8' then + raw = tm.str_to_utf8(arg) + elseif encoding == 'ucs2' or encoding == 'ucs-2' or encoding == 'utf16le' or encoding == 'utf-16le' then + raw = tm.str_to_utf16le(arg) + elseif encoding == 'base64' then + raw = from_base64(tm.str_to_binary(arg)) + elseif encoding == 'hex' then + if string.len(arg) % 2 ~= 0 then error(js_new(global.TypeError, 'Invalid hex string.')) end - -- Remove first occurrance of invalid char until end of string - str = string.lower(string.gsub(str, '[^a-fA-F0-9].*', '')) - length = string.len(str) / 2 + hex = string.lower(string.gsub(arg, '[^a-fA-F0-9].*', '')) + else + error(js_new(global.TypeError, 'Unknown encoding: ' + encoding)); + end + + if type(size) == 'number' then + -- all set + elseif arr then + size = arr.length + elseif hex then + size = #hex / 2 + else + size = #raw end - + this = {} - local buf = tm.buffer_create(length) - _of_buffer(this, buf, length) - - -- Lua internally uses a "binary" encoding, that is, - -- operates on (1-indexable) 8-bit values. - - if type(str) == 'string' and encoding == 'hex' then - -- "hex" string - for i = 1, #str, 2 do - this[(i - 1)/2] = tonumber(string.sub(str, i, i+1), 16) + local buf = tm.buffer_create(size) + _of_buffer(this, buf, size) + if arr then + for i = 1, size do + this[i - 1] = arr[i - 1] end - elseif type(str) == 'string' then - -- "binary" string - for i = 1, #str do - this[i - 1] = string.byte(str, i) + elseif hex then + for i = 1, #hex, 2 do + this[(i - 1)/2] = tonumber(string.sub(hex, i, i+1), 16) end - else - -- array - for i = 1, str.length do - this[i - 1] = str[i - 1] + elseif raw then + for i = 1, size do + this[i - 1] = string.byte(raw, i) end end - return this end diff --git a/src/colony/lua_http_parser.c b/src/colony/lua_http_parser.c index 14111d8d..85f7e8dd 100644 --- a/src/colony/lua_http_parser.c +++ b/src/colony/lua_http_parser.c @@ -110,7 +110,7 @@ static int lhttp_parser_on_url(http_parser *p, const char *at, size_t length) { return 0; }; /* Push the string argument */ - lua_pushlstring(L, at, length); + colony_pushbuffer(L, (const uint8_t*) at, length); lua_call(L, 1, 1); @@ -132,7 +132,7 @@ static int lhttp_parser_on_header_field(http_parser *p, const char *at, size_t l return 0; }; /* Push the string argument */ - lua_pushlstring(L, at, length); + colony_pushbuffer(L, (const uint8_t*) at, length); lua_call(L, 1, 1); @@ -154,7 +154,7 @@ static int lhttp_parser_on_header_value(http_parser *p, const char *at, size_t l return 0; }; /* Push the string argument */ - lua_pushlstring(L, at, length); + colony_pushbuffer(L, (const uint8_t*) at, length); lua_call(L, 1, 1); @@ -176,7 +176,7 @@ static int lhttp_parser_on_body(http_parser *p, const char *at, size_t length) { return 0; }; /* Push the string argument */ - lua_pushlstring(L, at, length); + colony_pushbuffer(L, (const uint8_t*) at, length); lua_call(L, 1, 1); diff --git a/src/colony/lua_tm.c b/src/colony/lua_tm.c index 32117ddb..a442e122 100644 --- a/src/colony/lua_tm.c +++ b/src/colony/lua_tm.c @@ -71,7 +71,7 @@ const char* colony_tolutf8 (lua_State* L, int index, size_t* res_len) return lua_tolstring(L, index, res_len); } -inline const char* colony_toutf8 (lua_State* L, int index) +const char* colony_toutf8 (lua_State* L, int index) { return colony_tolutf8(L, index, NULL); } @@ -85,7 +85,7 @@ void colony_pushlutf8 (lua_State* L, const char* utf8, size_t utf8_len) if (str != utf8) free((char*) str); } -inline void colony_pushutf8 (lua_State* L, const char* utf8) +void colony_pushutf8 (lua_State* L, const char* utf8) { colony_pushlutf8(L, utf8, strlen(utf8)); } @@ -922,20 +922,85 @@ static int l_tm_fs_dir_close (lua_State* L) return 1; } +static int l_tm_str_to_ascii (lua_State* L) +{ + const uint8_t* raw; + size_t str_len; + const char* str = lua_tolstring(L, 1, &str_len); + size_t raw_len = tm_str_to_ascii((const uint8_t*) str, str_len + 1, &raw) - 1; // compensate for NUL byte at end + lua_pushlstring(L, (const char*) raw, raw_len); + if ((void*) raw != (void*) str) free((uint8_t*) raw); + return 1; +} + +static int l_tm_str_from_ascii (lua_State* L) +{ + const char* str; + size_t raw_len; + const char* raw = lua_tolstring(L, 1, &raw_len); + size_t str_len = tm_str_from_ascii((const uint8_t*) raw, raw_len, (const uint8_t**) &str); + lua_pushlstring(L, str, str_len); + if ((void*) str != (void*) raw) free((char*) str); + return 1; +} + +static int l_tm_str_to_binary (lua_State* L) +{ + const uint8_t* raw; + size_t str_len; + const char* str = lua_tolstring(L, 1, &str_len); + size_t raw_len = tm_str_to_binary((const uint8_t*) str, str_len + 1, &raw) - 1; // compensate for NUL byte at end + lua_pushlstring(L, (const char*) raw, raw_len); + if ((void*) raw != (void*) str) free((uint8_t*) raw); + return 1; +} + +static int l_tm_str_from_binary (lua_State* L) +{ + const char* str; + size_t raw_len; + const char* raw = lua_tolstring(L, 1, &raw_len); + size_t str_len = tm_str_from_binary((const uint8_t*) raw, raw_len, (const uint8_t**) &str); + lua_pushlstring(L, str, str_len); + if ((void*) str != (void*) raw) free((char*) str); + return 1; +} + +static int l_tm_str_to_utf16le (lua_State* L) +{ + const uint8_t* raw; + size_t str_len; + const char* str = lua_tolstring(L, 1, &str_len); + size_t raw_len = tm_str_to_utf16((const uint8_t*) str, str_len + 1, &raw, LE) - 1; // compensate for NUL byte at end + lua_pushlstring(L, (const char*) raw, raw_len); + if ((void*) raw != (void*) str) free((uint8_t*) raw); + return 1; +} + +static int l_tm_str_from_utf16le (lua_State* L) +{ + const char* str; + size_t raw_len; + const char* raw = lua_tolstring(L, 1, &raw_len); + size_t str_len = tm_str_from_utf16((const uint8_t*) raw, raw_len, (const uint8_t**) &str, LE); + lua_pushlstring(L, str, str_len); + if ((void*) str != (void*) raw) free((char*) str); + return 1; +} static int l_tm_str_to_utf8 (lua_State* L) { - size_t utf8_len; - const char* utf8 = colony_tolutf8(L, 1, &utf8_len); - lua_pushlstring(L, utf8, utf8_len); + size_t raw_len; + const char* raw = colony_tolutf8(L, 1, &raw_len); + lua_pushlstring(L, raw, raw_len); return 1; } static int l_tm_str_from_utf8 (lua_State* L) { - size_t utf8_len; - const char* utf8 = lua_tolstring(L, 1, &utf8_len); - colony_pushlutf8(L, utf8, utf8_len); + size_t raw_len; + const char* raw = lua_tolstring(L, 1, &raw_len); + colony_pushlutf8(L, raw, raw_len); return 1; } @@ -1389,10 +1454,16 @@ LUALIB_API int luaopen_tm (lua_State *L) { "fs_dir_read", l_tm_fs_dir_read }, { "fs_dir_close", l_tm_fs_dir_close }, - // unicode + // encodings { "str_to_utf8", l_tm_str_to_utf8 }, { "str_from_utf8", l_tm_str_from_utf8 }, - + { "str_to_utf16le", l_tm_str_to_utf16le }, + { "str_from_utf16le", l_tm_str_from_utf16le }, + { "str_to_binary", l_tm_str_to_binary }, + { "str_from_binary", l_tm_str_from_binary }, + { "str_to_ascii", l_tm_str_to_ascii }, + { "str_from_ascii", l_tm_str_from_ascii }, + // internal string manipulation { "str_codeat", l_tm_str_codeat }, { "str_fromcode", l_tm_str_fromcode }, diff --git a/src/colony/modules/http.js b/src/colony/modules/http.js index 8e3441fb..c9cb6b40 100644 --- a/src/colony/modules/http.js +++ b/src/colony/modules/http.js @@ -127,11 +127,13 @@ function IncomingMessage (type, socket) { self.url = url; }), onHeaderField: parserCallback(function (field) { + field = field.toString(); var arr = (self._headersComplete) ? self.rawTrailers : self.rawHeaders; if (arr.length + 1 > self._maxRawHeaders) return; arr.push(field); }), onHeaderValue: parserCallback(function (value) { + value = value.toString(); var arr = (self._headersComplete) ? self.rawTrailers : self.rawHeaders, key = arr[arr.length - 1].toLowerCase(); if (arr.length + 1 > self._maxRawHeaders) return; diff --git a/src/tm.h b/src/tm.h index 3bd09e4d..9dacab1a 100644 --- a/src/tm.h +++ b/src/tm.h @@ -196,14 +196,27 @@ uint32_t tm_uptime_micro (); double tm_timestamp (); int tm_timestamp_update (double millis); -// BUFFER + +// ENDIANNESS + +#include "order32.h" typedef enum { BE = 0, LE } tm_endian_t; -// UNICODE +#define TM_ENDIAN_HOST (O32_HOST_ORDER == O32_BIG_ENDIAN ? BE : LE) +#define TM_ENDIAN_SWAP64(e, x) ((e != TM_ENDIAN_HOST) ? __builtin_bswap64(x) : x) +#define TM_ENDIAN_SWAP32(e, x) ((e != TM_ENDIAN_HOST) ? __builtin_bswap32(x) : x) +#define TM_ENDIAN_SWAP16(e, x) ((e != TM_ENDIAN_HOST) ? __builtin_bswap16(x) : x) + +// BUFFER + +void tm_buffer_float_write (uint8_t* buf, size_t index, float value, tm_endian_t endianness); +void tm_buffer_double_write (uint8_t* buf, size_t index, double value, tm_endian_t endianness); + +// ENCODINGS (UNICODE / ASCII / BINARY) #define TM_UTF8_DECODE_ERROR UINT32_MAX size_t tm_utf8_decode(const uint8_t* buf, size_t buf_len, uint32_t* uc); @@ -211,6 +224,16 @@ size_t tm_utf8_encode(uint8_t* buf, size_t buf_len, uint32_t uc); size_t tm_str_to_utf8 (const uint8_t* buf, size_t buf_len, const uint8_t **dstptr); size_t tm_str_from_utf8 (const uint8_t* buf, size_t buf_len, const uint8_t **dstptr); +size_t tm_str_to_utf16 (const uint8_t* buf, size_t buf_len, const uint8_t **dstptr, tm_endian_t endianness); +size_t tm_str_from_utf16 (const uint8_t* buf, size_t buf_len, const uint8_t **dstptr, tm_endian_t endianness); + +size_t tm_str_to_ascii (const uint8_t* buf, size_t buf_len, const uint8_t **dstptr); +size_t tm_str_from_ascii (const uint8_t* buf, size_t buf_len, const uint8_t **dstptr); + +size_t tm_str_to_binary (const uint8_t* buf, size_t buf_len, const uint8_t **dstptr); +size_t tm_str_from_binary (const uint8_t* buf, size_t buf_len, const uint8_t **dstptr); + + // INTERNAL STRING MANIPULATION uint32_t tm_str_codeat (const uint8_t* buf, size_t buf_len, size_t index); diff --git a/src/tm_utf16.c b/src/tm_utf16.c new file mode 100644 index 00000000..d6c7b48e --- /dev/null +++ b/src/tm_utf16.c @@ -0,0 +1,40 @@ +#include + +#include "tm.h" + +// NOTE: Ideally these would deal with native uint16_t arrays, and have separate uint16_t<->uint8_t endian helper. +// But it doesn't seem worth the extra pain and potential performance hit right now. + +size_t tm_str_to_utf16 (const uint8_t* buf, size_t buf_len, const uint8_t ** const dstptr, tm_endian_t endianness) { + uint16_t* utf16 = calloc(buf_len, 2); // NOTE: we know utf16 will be this size or less + size_t utf16_len = 0; + + size_t buf_pos = 0; + while (buf_pos < buf_len) { + uint32_t uchar; + buf_pos += tm_utf8_decode(buf + buf_pos, buf_len - buf_pos, &uchar); + assert(uchar != TM_UTF8_DECODE_ERROR); // internal strings should never be malformed, 0xFFFD replacement increases length + assert(uchar < 0x10000); // internal strings should only include BMP codepoints + utf16[utf16_len] = TM_ENDIAN_SWAP16(endianness, (uint16_t) uchar); + utf16_len += 1; + } + *dstptr = (uint8_t*) utf16; + return (utf16_len << 1) - 1; // include only single null *byte* (for consistency with others) +} + +size_t tm_str_from_utf16 (const uint8_t* _utf16, size_t _utf16_len, const uint8_t ** const dstptr, tm_endian_t endianness) { + const uint16_t* utf16 = (const uint16_t*) _utf16; + size_t utf16_len = _utf16_len >> 1; + + uint8_t* buf = calloc(utf16_len, 3); // each incoming codepoint could require up to 3 bytes to represent + + size_t buf_pos = 0; + size_t utf16_pos = 0; + while (utf16_pos < utf16_len) { + uint16_t uchar = TM_ENDIAN_SWAP16(endianness, utf16[utf16_pos]); + buf_pos += tm_utf8_encode(buf + buf_pos, 3, uchar); + utf16_pos += 1; + } + *dstptr = buf; + return buf_pos; +} diff --git a/src/tm_utf7.c b/src/tm_utf7.c new file mode 100644 index 00000000..56e59de5 --- /dev/null +++ b/src/tm_utf7.c @@ -0,0 +1,53 @@ +#include + +#include "tm.h" + +size_t _tm_str_to_8bit (const uint8_t* buf, size_t buf_len, const uint8_t ** const dstptr, uint8_t mask) { + uint8_t* ascii_buf = malloc(buf_len); // NOTE: we know ascii will be this size or less + size_t ascii_len = 0; + + size_t buf_pos = 0; + while (buf_pos < buf_len) { + uint32_t uchar; + buf_pos += tm_utf8_decode(buf + buf_pos, buf_len - buf_pos, &uchar); + assert(uchar != TM_UTF8_DECODE_ERROR); // internal strings should never be malformed, 0xFFFD replacement increases length + assert(uchar < 0x10000); // internal strings should only include BMP codepoints + ascii_buf[ascii_len] = (uint8_t) uchar & mask; + ascii_len += 1; + } + *dstptr = ascii_buf; + return ascii_len; +} + +size_t tm_str_to_ascii (const uint8_t* buf, size_t buf_len, const uint8_t ** const dstptr) { + return _tm_str_to_8bit(buf, buf_len, dstptr, 0xFF); // yes 0xFF, despite node.js doc insinuation! +} + +size_t tm_str_from_ascii (const uint8_t* ascii_buf, size_t ascii_len, const uint8_t ** const dstptr) { + uint8_t* buf = malloc(ascii_len); + + size_t pos = 0; + while (pos < ascii_len) { + buf[pos] = ascii_buf[pos] & 0x7F; + ++pos; + } + *dstptr = buf; + return pos; +} + +size_t tm_str_to_binary (const uint8_t* buf, size_t buf_len, const uint8_t ** const dstptr) { + return _tm_str_to_8bit(buf, buf_len, dstptr, 0xFF); +} + +size_t tm_str_from_binary (const uint8_t* binary, size_t binary_len, const uint8_t ** const dstptr) { + uint8_t* str = calloc(binary_len, 2); // NOTE: size could at most double if every incoming byte is > 127 + + size_t str_pos = 0; + size_t binary_pos = 0; + while (binary_pos < binary_len) { + str_pos += tm_utf8_encode(str + str_pos, 2, binary[binary_pos]); + binary_pos += 1; + } + *dstptr = str; + return str_pos; +} diff --git a/src/tm_utf8.c b/src/tm_utf8.c index 01410a3e..9de95e28 100644 --- a/src/tm_utf8.c +++ b/src/tm_utf8.c @@ -71,7 +71,7 @@ size_t tm_str_to_utf8 (const uint8_t* buf, size_t buf_len, const uint8_t ** cons size_t buf_pos = 0; while (buf_pos < buf_len) { uint32_t uchar; - buf_pos += tm_utf8_decode(buf + buf_pos, buf_len - buf_pos, &uchar); + buf_pos += tm_utf8_decode(buf + buf_pos, buf_len - buf_pos, &uchar); assert(uchar != TM_UTF8_DECODE_ERROR); // internal strings should never be malformed, 0xFFFD replacement increases length // NOTE: this follows new behavior http://blog.nodejs.org/2014/06/16/openssl-and-breaking-utf-8-change/ if (hchar) { diff --git a/test/suite/buffer.js b/test/suite/buffer.js index 852baf34..31129d05 100644 --- a/test/suite/buffer.js +++ b/test/suite/buffer.js @@ -1,6 +1,6 @@ var tap = require('../tap'); -tap.count(70); +tap.count(96); function arreq (a, b) { if (a.length != b.length) { @@ -164,10 +164,43 @@ console.log('#', new Buffer('hello world').toString('hex')) console.log('#', new Buffer(new Buffer('hello world').toString('hex'), 'hex')) var b = new Buffer([0, 0x41, 0x82, 0x104]); -tap.eq(b.toString('binary'), "\u0000\u0041\u0082\u0004"); -tap.eq(b.toString('ascii'), "\u0000\u0041\u0002\u0004"); -tap.eq(b.toString('utf8'), "\u0000\u0041\uFFFD\u0004"); -//tap.eq(b.toString('utf16le'), "\u4100\u0482"); +tap.eq(b.length, 4, "array ingested"); +tap.eq(b[0], 0x00); +tap.eq(b[1], 0x41); +tap.eq(b[2], 0x82); +tap.eq(b[3], 0x04); +tap.eq(b.toString('binary'), "\u0000\u0041\u0082\u0004", "binary toString"); +tap.eq(b.toString('binary').length, 4); +tap.eq(b.toString('ascii'), "\u0000\u0041\u0002\u0004", "ascii toString"); +tap.eq(b.toString('ascii').length, 4); +tap.eq(b.toString('utf8'), "\u0000\u0041\uFFFD\u0004", "utf8 toString"); +tap.eq(b.toString('utf8').length, 4); +tap.eq(b.toString('utf16le'), "\u4100\u0482", "utf16le toString"); +tap.eq(b.toString('utf16le').length, 2); +tap.eq(b.toString('ucs2').length, 2); +tap.eq(b.toString('base64'), "AEGCBA==", "base64 toString"); +tap.eq(b.toString('base64').length, 8); +tap.eq(b.toString('hex'), "00418204", "hex toString"); +tap.eq(b.toString('base64').length, 8); + +tap.eq(Buffer("\u8182", 'utf8')[2], 0x82, "buffer from utf8"); +tap.eq(Buffer("\u8182", 'utf8').length, 3); +tap.eq(Buffer("\u8182", 'ascii')[0], 0x82, "buffer from ascii"); +tap.eq(Buffer("\u8182", 'ascii').length, 1); +tap.eq(Buffer("\u8182", 'binary')[0], 0x82, "buffer from binary"); +tap.eq(Buffer("\u8182", 'binary').length, 1); +tap.eq(Buffer("\u8182", 'utf16le')[1], 0x81, "buffer from utf16le"); +tap.eq(Buffer("\u8182", 'utf16le').length, 2); +tap.eq(Buffer("\u8182", 'ucs2')[1], 0x81); +tap.eq(Buffer("\u8182", 'base64').length, 0, "buffer from [bad] base64"); +var threw; +try { + Buffer("\u8182", 'hex'); +} catch (e) { + threw = e; +} +tap.ok(threw, "buffer from [bad] hex"); + // write var buf = new Buffer(256);