/*
 * Review notes (documentation only; the patch text below is unchanged).
 *
 * What this is: a git diff touching CMakeLists.txt, entities.c, entities.h
 * and t-entities.c. It adds decode_html_entities_utf8_wo_unsafe_symbols()
 * and a length-bounded decode_html_entities_utf8_wo_unsafe_symbols_n(),
 * plus the static helpers strcmp_n, cmp_n, get_named_entity_n, strchr_n,
 * strtoul_n and the two parse_entity_wo_unsafe_symbols* functions. It also
 * replaces bool with _Bool in parse_entity and moves the "sup;" table row
 * after "sup3;".
 *
 * NOTE(review): this copy looks damaged in transit - line breaks are
 * collapsed, the #include lines have no header names, and what is
 * presumably "&current[1]" appears as a currency sign followed by "t[1]".
 * The test strings presumably lost their entity references the same way
 * (INPUT and SAMPLE read almost identically); confirm against the real
 * commit before applying.
 *
 * Behaviour visible in the added code:
 *  - unsafe_symbs is walked as consecutive NUL-terminated strings and the
 *    walk stops at the first empty string. It is passed to strlen without
 *    a NULL check.
 *  - Only numeric references are compared with unsafe_symbs. On a match
 *    the original reference text is copied to the output instead of the
 *    decoded character. Named entities are written out without consulting
 *    unsafe_symbs.
 *  - The non-_n decoder writes a terminating 0; the _n decoder does not.
 *
 * Points to verify before merging:
 *  - cmp_n casts value straight to const char *, whereas cmp reads it as
 *    *(const char *const *)value. Both are bsearch callbacks over
 *    NAMED_ENTITIES, so cmp_n looks like it is missing the dereference.
 *  - strcmp_n returns rhs[lhs_len] or lhs[rhs_len] whenever the lengths
 *    differ, without comparing the shared prefix; that does not look like
 *    an ordering bsearch can rely on.
 *  - parse_entity_wo_unsafe_symbols_n hands get_named_entity_n the
 *    remaining input size (*curr_size - 1), not the distance to ';', so
 *    the equal-length branch of strcmp_n can only be reached when the
 *    entity is the last thing in the input. TODO confirm intended length.
 *  - entity_len is end - current, which is 3 for "&#9;" and 4 for "&#xA;".
 *    The guards entity_len > 3 and entity_len > 4 therefore do not decode
 *    single-digit references.
 *  - In strtoul_n both continue statements bypass --nptr_len, so once
 *    overflow is set p keeps advancing while the remaining count stays
 *    fixed. The limit is LONG_MAX although the return type is unsigned
 *    long, base 0 is treated as 10, and isspace receives a plain char.
 *  - In the _n decoder, "src_size -= current - from" right after
 *    "from = current" subtracts zero.
 *  - get_named_entity_n passes name_len to cmp_n through the file-scope
 *    variable clang_is_good, so the lookup depends on shared mutable state.
 */
diff --git a/CMakeLists.txt b/CMakeLists.txt index 41e9f36..39e43cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,6 @@ ADD_LIBRARY(entities STATIC entities.c ) - # Build unit cases ADD_EXECUTABLE(test-entities t-entities.c diff --git a/entities.c b/entities.c index 30888bb..def75fa 100644 --- a/entities.c +++ b/entities.c @@ -8,9 +8,10 @@ #include "entities.h" #include -#include #include #include +#include /* LONG_MAX */ +#include /* isspace() */ #define UNICODE_MAX 0x10FFFFul @@ -239,10 +240,10 @@ static const char *const NAMED_ENTITIES[][2] = { { "sub;", "⊂" }, { "sube;", "⊆" }, { "sum;", "∑" }, - { "sup;", "⊃" }, { "sup1;", "¹" }, { "sup2;", "²" }, { "sup3;", "³" }, + { "sup;", "⊃" }, { "supe;", "⊇" }, { "szlig;", "ß" }, { "tau;", "τ" }, @@ -276,7 +277,7 @@ static const char *const NAMED_ENTITIES[][2] = { static int cmp(const void *key, const void *value) { return strncmp((const char *)key, *(const char *const *)value, - strlen(*(const char *const *)value)); + strlen(*(const char *const *)value)); // strlen? } static const char *get_named_entity(const char *name) @@ -325,7 +326,7 @@ static size_t putc_utf8(unsigned long cp, char *buffer) return 0; } -static bool parse_entity( +static _Bool parse_entity( const char *current, char **to, const char **from) { const char *end = strchr(current, ';'); @@ -335,13 +336,13 @@ static bool parse_entity( { char *tail = NULL; int errno_save = errno; - bool hex = current[2] == 'x' || current[2] == 'X'; + _Bool hex = current[2] == 'x' || current[2] == 'X'; errno = 0; unsigned long cp = strtoul( current + (hex ? 3 : 2), &tail, hex ? 
16 : 10); - bool fail = errno || tail != end || cp > UNICODE_MAX; + _Bool fail = errno || tail != end || cp > UNICODE_MAX; errno = errno_save; if(fail) return 0; @@ -363,6 +364,7 @@ static bool parse_entity( return 1; } + size_t decode_html_entities_utf8(char *dest, const char *src) { if(!src) src = dest; @@ -391,3 +393,291 @@ size_t decode_html_entities_utf8(char *dest, const char *src) return (size_t)(to - dest); } + +static _Bool parse_entity_wo_unsafe_symbols( + const char *current, char **to, const char **from, + const char* unsafe_symbs) +{ + const char *end = strchr(current, ';'); + if(!end) return 0; + + if(current[1] == '#') + { + char *tail = NULL; + int errno_save = errno; + _Bool hex = current[2] == 'x' || current[2] == 'X'; + + errno = 0; + unsigned long cp = strtoul( + current + (hex ? 3 : 2), &tail, hex ? 16 : 10); + + _Bool fail = errno || tail != end || cp > UNICODE_MAX; + errno = errno_save; + if(fail) return 0; + + + // *to += putc_utf8(cp, *to); + size_t utf8_symb_len = putc_utf8(cp, *to); + + size_t unsafe_symbs_len; + for (const char* unsafe_symb = unsafe_symbs; (unsafe_symbs_len = strlen(unsafe_symb)) != 0; unsafe_symb += (unsafe_symbs_len + 1)) + { + if (utf8_symb_len == unsafe_symbs_len && strncmp(*to, unsafe_symb, utf8_symb_len) == 0) + { + // rollback + size_t html_entities_len = (size_t)(end - current) + 1; + memmove(*to, current, html_entities_len); + utf8_symb_len = html_entities_len; + break; + } + } + + *to += utf8_symb_len; + *from = end + 1; + + return 1; + } + + const char *entity = get_named_entity(¤t[1]); + if(!entity) return 0; + + size_t len = strlen(entity); + memcpy(*to, entity, len); + + *to += len; + *from = end + 1; + + return 1; +} + +size_t decode_html_entities_utf8_wo_unsafe_symbols(char *dest, const char *src, + const char* unsafe_symbs) +{ + if(!src) src = dest; + + char *to = dest; + const char *from = src; + + for(const char *current; (current = strchr(from, '&'));) + { + memmove(to, from, (size_t)(current - 
from)); + to += current - from; + + if(parse_entity_wo_unsafe_symbols(current, &to, &from, unsafe_symbs)) + continue; + + from = current; + *to++ = *from++; + } + + size_t remaining = strlen(from); + + memmove(to, from, remaining); + to += remaining; + *to = 0; + + return (size_t)(to - dest); +} + + + +static int strcmp_n(const char *lhs, size_t lhs_len, const char *rhs) +{ + size_t rhs_len = strlen(rhs); + + if (rhs_len > lhs_len) + return rhs[lhs_len]; + else if (rhs_len < lhs_len) + return lhs[rhs_len]; + + // equal len + int i = 0; + while (lhs[i] == rhs[i] && lhs_len > 0){ + ++i; + --lhs_len; + } + + return lhs_len == 0 ? 0 : lhs[i] - rhs[i]; +} + +static size_t clang_is_good = 0; + +static int cmp_n(const void *key, const void *value) +{ + return strcmp_n((const char *)key, clang_is_good, + (const char *)value); +} + +static const char *get_named_entity_n(const char *name, size_t name_len) +{ + clang_is_good = name_len; + const char *const *entity = (const char *const *)bsearch(name, + NAMED_ENTITIES, sizeof NAMED_ENTITIES / sizeof *NAMED_ENTITIES, + sizeof *NAMED_ENTITIES, cmp_n); + + return entity ? entity[1] : NULL; +} + +static const char* strchr_n(const char* src, size_t src_size, int chr) +{ + size_t i; + for (i = 0; i < src_size && src[i] != '\0'; ++i) + { + if ((int)src[i] == chr) + { + break; + } + } + + return i < src_size ? 
&src[i] : NULL; +} + +/*https://stackoverflow.com/questions/7457163/what-is-the-implementation-of-strtol*/ +static unsigned long +strtoul_n(const char *restrict nptr, size_t nptr_len, char **restrict endptr, int base) { + const char *p = nptr, *endp; + _Bool overflow = 0; + /* Need long long so (LONG_MAX) can fit in these: */ + long long n = 0UL, cutoff; + int cutlim; + if (base < 0 || base == 1 || base > 36) { +#ifdef EINVAL /* errno value defined by POSIX */ + errno = EINVAL; +#endif + return 0L; + } + endp = nptr; + while (nptr_len > 0 && isspace(*p)){ + p++; + --nptr_len; + } + + if (base == 0) + base = 10; + + cutoff = LONG_MAX / base; + cutlim = LONG_MAX % base; + while (nptr_len > 0) { + int c; + if (*p >= 'A') + c = ((*p - 'A') & (~('a' ^ 'A'))) + 10; + else if (*p <= '9') + c = *p - '0'; + else + break; + if (c < 0 || c >= base) break; + endp = ++p; + if (overflow) { + /* endptr should go forward and point to the non-digit character + * (of the given base); required by ANSI standard. */ + if (endptr) continue; + break; + } + if (n > cutoff || (n == cutoff && c > cutlim)) { + overflow = 1; continue; + } + n = n * base + c; + --nptr_len; + } + + if (endptr) *endptr = (char *)endp; + if (overflow) { + errno = ERANGE; + return LONG_MAX; + } + + return (unsigned long)n; +} + +static _Bool parse_entity_wo_unsafe_symbols_n( + const char *current, size_t* curr_size, + char **to, const char **from, + const char* unsafe_symbs) +{ + const char *end = strchr_n(current, *curr_size, ';'); + if(!end) return 0; + + // *curr_size should be more than 3 (start symb, # and ;) + size_t entity_len = (end - current); + if(entity_len > 3 && current[1] == '#') + { + char *tail = NULL; + int errno_save = errno; + _Bool hex = entity_len > 4 && (current[2] == 'x' || current[2] == 'X'); + + errno = 0; + unsigned long cp = strtoul_n( + current + (hex ? 3 : 2), *curr_size - (hex ? 3 : 2), &tail, hex ? 
16 : 10); + + _Bool fail = errno || tail != end || cp > UNICODE_MAX; + errno = errno_save; + + + if(fail) return 0; + + size_t utf8_symb_len = putc_utf8(cp, *to); + + size_t unsafe_symbs_len; + for (const char* unsafe_symb = unsafe_symbs; (unsafe_symbs_len = strlen(unsafe_symb)) != 0; unsafe_symb += (unsafe_symbs_len + 1)) + { + if (utf8_symb_len == unsafe_symbs_len && strncmp(*to, unsafe_symb, utf8_symb_len) == 0) + { + // rollback + size_t html_entities_len = (size_t)(end - current) + 1; + memmove(*to, current, html_entities_len); + utf8_symb_len = html_entities_len; + break; + } + } + + *to += utf8_symb_len; + *from = end + 1; + + *curr_size -= end - current + 1; + return 1; + } + + if (*curr_size < 2) + return 0; + + const char *entity = get_named_entity_n(¤t[1], *curr_size - 1); + if(!entity) return 0; + + size_t len = strlen(entity); + memcpy(*to, entity, len); + + *to += len; + *from = end + 1; + *curr_size -= end - current + 1; + + return 1; +} + +size_t decode_html_entities_utf8_wo_unsafe_symbols_n(char *dest, const char *src, + size_t src_size, const char* unsafe_symbs) +{ + if(!src) src = dest; + + char *to = dest; + const char *from = src; + + for(const char *current; (current = strchr_n(from, src_size, '&'));) + { + memmove(to, from, (size_t)(current - from)); + to += current - from; + src_size -= current - from; + + if(parse_entity_wo_unsafe_symbols_n(current, &src_size, &to, &from, unsafe_symbs)) + continue; + + from = current; + src_size -= current - from; + *to++ = *from++; + src_size -= 1; + } + + memmove(to, from, src_size); + to += src_size; + + return (size_t)(to - dest); +} \ No newline at end of file diff --git a/entities.h b/entities.h index d8e58ef..5342705 100644 --- a/entities.h +++ b/entities.h @@ -17,8 +17,23 @@ extern size_t decode_html_entities_utf8(char *dest, const char *src); If is , input will be taken from , decoding the entities in-place. - The function returns the length of the decoded string. 
+ The function returns the size of the decoded string. */ -#endif +extern size_t decode_html_entities_utf8_wo_unsafe_symbols(char *dest, const char *src, + const char* unsafe_symbs); +/* Takes one more params string delimited '\0' of anscii characters + that prevented to decode. Ends by double '\0' + +*/ + +extern size_t decode_html_entities_utf8_wo_unsafe_symbols_n(char *dest, const char *src, + size_t src_size, const char* unsafe_symbs); +/* Takes one more params that indicated + how many characters must be decode i.e. size of + + may be not null terminated! +*/ + +#endif // DECODE_HTML_ENTITIES_UTF8_ diff --git a/t-entities.c b/t-entities.c index 6dd8c34..2c34871 100644 --- a/t-entities.c +++ b/t-entities.c @@ -12,9 +12,13 @@ #undef NDEBUG #include +#include +#include int main(void) { + setlocale(LC_ALL, ""); + { static const char SAMPLE[] = "Christoph Gärtner"; static char buffer[] = "Christoph Gärtner"; @@ -22,14 +26,84 @@ int main(void) assert(strcmp(buffer, SAMPLE) == 0); } + { - static const char SAMPLE[] = "test@example.org"; - static const char INPUT[] = "test@example.org"; + static const char SAMPLE[] = "<!-- i want to inject xss script -->alert(\"nice\")"; + // do not convert symbols /, >, <, -, and ! to prevent xss + static const char INPUT[] = "<!-- i want to inject xss script -->alert("nice")"; // > static char buffer[sizeof INPUT]; - assert(decode_html_entities_utf8(buffer, INPUT) == sizeof SAMPLE - 1); + assert(decode_html_entities_utf8_wo_unsafe_symbols(buffer, INPUT, "/\0>\0<\0!\0-\0\0") == sizeof SAMPLE - 1); + // decode_html_entities_utf8_wo_unsafe(buffer, INPUT); printf("%s", buffer); assert(strcmp(buffer, SAMPLE) == 0); } + + { + static const char SAMPLE[] = ">П>авел>"; + // do not convert symbols /, >, <, -, and ! 
to prevent xss + static const char INPUT[] = ">П>авел>"; // > + static char buffer[sizeof INPUT]; + assert(decode_html_entities_utf8_wo_unsafe_symbols(buffer, INPUT, "/\0>\0<\0!\0-\0\0") == sizeof SAMPLE - 1); + // decode_html_entities_utf8_wo_unsafe(buffer, INPUT); printf("%s", buffer); + assert(strcmp(buffer, SAMPLE) == 0); + } + + + { + static const char SAMPLE[] = "!>><--<><>!-/></-->!<!-- -->"; + // do not convert symbols /, >, <, -, and ! to prevent xss + static const char INPUT[] = "!>><--<><>!-/></-->!<!-- -->"; // > + static char buffer[sizeof INPUT]; + assert(decode_html_entities_utf8_wo_unsafe_symbols(buffer, INPUT, "/\0>\0<\0!\0-\0\0") == sizeof SAMPLE - 1); + // decode_html_entities_utf8_wo_unsafe(buffer, INPUT); printf("%s", buffer); + assert(strcmp(buffer, SAMPLE) == 0); + } + + { + static const char SAMPLE[] = "&#;П"; + // do not convert symbols /, >, <, -, and ! to prevent xss + static const char INPUT[] = "&#;П>авел>"; // > + const size_t buf_len = 10; // "&#;" -- bad string + char not_null_term_buf[buf_len]; + + size_t temp_buf_true_len = decode_html_entities_utf8_wo_unsafe_symbols_n(not_null_term_buf, INPUT, buf_len, "/\0>\0<\0!\0-\0\0"); + // printf("%d %.*s\n", (int)temp_buf_true_len, temp_buf_true_len, not_null_term_buf); + assert(temp_buf_true_len == sizeof SAMPLE - 1); + assert(strncmp(not_null_term_buf, SAMPLE, temp_buf_true_len) == 0); + } + + { + static const char SAMPLE[] = "&#-2;�ABCDEFGHJCLMNOP123456789;"; + // do not convert symbols /, >, <, -, and ! 
to prevent xss + static const char INPUT[] = "&#-2;�ABCDEFGHJCLMNOP123456789;"; // > + size_t buf_len = sizeof INPUT - 1; // w/o null terminator + char not_null_term_buf[buf_len]; + + size_t temp_buf_true_len = decode_html_entities_utf8_wo_unsafe_symbols_n(not_null_term_buf, INPUT, buf_len, "/\0>\0<\0!\0-\0\0"); + // printf("%.*s\n", temp_buf_true_len, not_null_term_buf); + assert(temp_buf_true_len == sizeof SAMPLE - 1); + assert(strncmp(not_null_term_buf, SAMPLE, temp_buf_true_len) == 0); + } + + { + char INPUT[] = ""; + + assert(decode_html_entities_utf8_wo_unsafe_symbols_n(INPUT, NULL, 0, "/\0>\0<\0!\0-\0\0") == 0); + } + + + { + char INPUT[] = "&#;П>авел>"; + + size_t proccessed = decode_html_entities_utf8_wo_unsafe_symbols_n(INPUT, NULL, sizeof INPUT - 1, "/\0>\0<\0!\0-\0\0"); + // printf("%d %.*s", proccessed, proccessed, INPUT); + // printf("%d", sizeof(">П>авел>") - 1); + assert(proccessed == sizeof("&#;П>авел>") - 1); + assert(strncmp(INPUT, "&#;П>авел>", proccessed) == 0); + + } + + fprintf(stdout, "All tests passed :-)\n"); return EXIT_SUCCESS; }