Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ ADD_LIBRARY(entities STATIC
entities.c
)


# Build unit cases
ADD_EXECUTABLE(test-entities
t-entities.c
Expand Down
302 changes: 296 additions & 6 deletions entities.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
#include "entities.h"

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h> /* LONG_MAX */
#include <ctype.h> /* isspace() */

#define UNICODE_MAX 0x10FFFFul

Expand Down Expand Up @@ -239,10 +240,10 @@ static const char *const NAMED_ENTITIES[][2] = {
{ "sub;", "⊂" },
{ "sube;", "⊆" },
{ "sum;", "∑" },
{ "sup;", "⊃" },
{ "sup1;", "¹" },
{ "sup2;", "²" },
{ "sup3;", "³" },
{ "sup;", "⊃" },
{ "supe;", "⊇" },
{ "szlig;", "ß" },
{ "tau;", "τ" },
Expand Down Expand Up @@ -276,7 +277,7 @@ static const char *const NAMED_ENTITIES[][2] = {
static int cmp(const void *key, const void *value)
{
return strncmp((const char *)key, *(const char *const *)value,
strlen(*(const char *const *)value));
strlen(*(const char *const *)value)); // strlen?
}

static const char *get_named_entity(const char *name)
Expand Down Expand Up @@ -325,7 +326,7 @@ static size_t putc_utf8(unsigned long cp, char *buffer)
return 0;
}

static bool parse_entity(
static _Bool parse_entity(
const char *current, char **to, const char **from)
{
const char *end = strchr(current, ';');
Expand All @@ -335,13 +336,13 @@ static bool parse_entity(
{
char *tail = NULL;
int errno_save = errno;
bool hex = current[2] == 'x' || current[2] == 'X';
_Bool hex = current[2] == 'x' || current[2] == 'X';

errno = 0;
unsigned long cp = strtoul(
current + (hex ? 3 : 2), &tail, hex ? 16 : 10);

bool fail = errno || tail != end || cp > UNICODE_MAX;
_Bool fail = errno || tail != end || cp > UNICODE_MAX;
errno = errno_save;
if(fail) return 0;

Expand All @@ -363,6 +364,7 @@ static bool parse_entity(
return 1;
}


size_t decode_html_entities_utf8(char *dest, const char *src)
{
if(!src) src = dest;
Expand Down Expand Up @@ -391,3 +393,291 @@ size_t decode_html_entities_utf8(char *dest, const char *src)
return (size_t)(to - dest);
}


/*
 * Decodes the single character reference that starts at <current> (which
 * points at the '&').
 *
 * Numeric references ("&#NNN;" / "&#xHHH;") are converted to UTF-8 unless the
 * resulting byte sequence equals one of the entries of <unsafe_symbs>; in
 * that case the reference text is copied through unchanged. Named references
 * are resolved through get_named_entity().
 *
 * <unsafe_symbs> is a list of NUL-terminated strings stored back to back and
 * ended by an empty string; NULL is treated as an empty list.
 *
 * On success the output is written at *to, *to is advanced past it, *from is
 * set just past the terminating ';' and 1 is returned. On failure 0 is
 * returned and *to / *from keep their values.
 */
static _Bool parse_entity_wo_unsafe_symbols(
    const char *current, char **to, const char **from,
    const char* unsafe_symbs)
{
    const char *end = strchr(current, ';');
    if(!end) return 0;

    if(current[1] == '#')
    {
        _Bool hex = current[2] == 'x' || current[2] == 'X';
        const char *digits = current + (hex ? 3 : 2);
        char *tail = NULL;
        int errno_save = errno;

        errno = 0;
        unsigned long cp = strtoul(digits, &tail, hex ? 16 : 10);

        /* tail == digits: no digit was consumed ("&#;" or "&#x;"), which
         * must not be decoded as code point 0. */
        _Bool fail = errno || tail == digits || tail != end || cp > UNICODE_MAX;
        errno = errno_save;
        if(fail) return 0;

        /* Encode into scratch space first. When decoding in place, *to may
         * overlap the reference text, and that text has to stay intact in
         * case it must be copied through unchanged. UTF-8 needs at most
         * 4 bytes for cp <= UNICODE_MAX; the buffer is sized generously. */
        char utf8[8];
        size_t utf8_len = putc_utf8(cp, utf8);

        _Bool unsafe = 0;
        if(unsafe_symbs)
        {
            size_t symb_len;
            for(const char *symb = unsafe_symbs;
                (symb_len = strlen(symb)) != 0;
                symb += symb_len + 1)
            {
                if(symb_len == utf8_len && memcmp(utf8, symb, utf8_len) == 0)
                {
                    unsafe = 1;
                    break;
                }
            }
        }

        if(unsafe)
        {
            /* Keep the reference exactly as written, ';' included. */
            size_t ref_len = (size_t)(end - current) + 1;
            memmove(*to, current, ref_len);
            *to += ref_len;
        }
        else
        {
            memcpy(*to, utf8, utf8_len);
            *to += utf8_len;
        }

        *from = end + 1;
        return 1;
    }

    /* NOTE(review): named entities are not checked against <unsafe_symbs>,
     * so e.g. "&lt;" is decoded even when "<" is listed - confirm that this
     * is intended. */
    const char *entity = get_named_entity(&current[1]);
    if(!entity) return 0;

    size_t len = strlen(entity);
    memcpy(*to, entity, len);

    *to += len;
    *from = end + 1;

    return 1;
}

/*
 * Copies <src> to <dest>, replacing every character reference that
 * parse_entity_wo_unsafe_symbols() accepts by its decoded form. A '&' that
 * does not start an accepted reference is copied as is. When <src> is NULL
 * the text in <dest> is processed in place. The result is NUL-terminated and
 * its length (without the terminator) is returned.
 */
size_t decode_html_entities_utf8_wo_unsafe_symbols(char *dest, const char *src,
    const char* unsafe_symbs)
{
    const char *in = src ? src : dest;
    char *out = dest;
    const char *amp;

    while((amp = strchr(in, '&')) != NULL)
    {
        /* Plain text in front of the '&' is moved over untouched. */
        size_t plain_len = (size_t)(amp - in);
        memmove(out, in, plain_len);
        out += plain_len;

        if(!parse_entity_wo_unsafe_symbols(amp, &out, &in, unsafe_symbs))
        {
            /* Not a reference: emit the '&' itself and resume after it. */
            *out++ = *amp;
            in = amp + 1;
        }
    }

    /* Whatever follows the last '&' is copied verbatim. */
    size_t tail_len = strlen(in);
    memmove(out, in, tail_len);
    out += tail_len;
    *out = 0;

    return (size_t)(out - dest);
}



/*
 * strcmp() for a left operand that is given as pointer + length instead of
 * being NUL-terminated.
 *
 * Compares the first <lhs_len> bytes of <lhs> (or fewer, if a '\0' occurs
 * earlier) with the NUL-terminated string <rhs>, byte by byte as unsigned
 * char. Returns a negative value, zero or a positive value if <lhs> sorts
 * before, equal to or after <rhs>. Never reads lhs[lhs_len] or beyond.
 */
static int strcmp_n(const char *lhs, size_t lhs_len, const char *rhs)
{
    for (size_t i = 0; i < lhs_len; ++i) {
        unsigned char l = (unsigned char)lhs[i];
        unsigned char r = (unsigned char)rhs[i];

        /* Also covers "rhs is shorter": r is then '\0' and l is not. */
        if (l != r)
            return (int)l - (int)r;

        /* Both strings end here. */
        if (l == '\0')
            return 0;
    }

    /* lhs is exhausted: equal only if rhs ends at the same position,
     * otherwise lhs is a proper prefix of rhs and sorts first. */
    return -(int)(unsigned char)rhs[lhs_len];
}

/* Search key for cmp_n(): an entity name given as pointer + length. Passing
 * the length inside the key avoids routing it through a mutable file-scope
 * variable, which bsearch()'s comparator signature would otherwise force. */
struct named_entity_key
{
    const char *name;
    size_t len;
};

/*
 * bsearch() comparator. <key> points to a struct named_entity_key, <value>
 * points to one row of NAMED_ENTITIES, whose first element is the entity
 * name.
 */
static int cmp_n(const void *key, const void *value)
{
    const struct named_entity_key *k = (const struct named_entity_key *)key;
    const char *const *row = (const char *const *)value;

    return strcmp_n(k->name, k->len, row[0]);
}

/*
 * Looks up the named entity that starts at <name> (the byte after '&').
 * <name> does not need to be NUL-terminated; at most <name_len> bytes are
 * inspected. Returns the replacement text, or NULL if there is no match.
 */
static const char *get_named_entity_n(const char *name, size_t name_len)
{
    /* <name_len> is treated as an upper bound: the name is taken to end at
     * the first ';' (the table entries visible in this file all end with
     * ';'), so trailing input after the reference does not spoil the match. */
    const char *semicolon = (const char *)memchr(name, ';', name_len);
    if (!semicolon) return NULL;

    struct named_entity_key key = { name, (size_t)(semicolon - name) + 1 };

    const char *const *entity = (const char *const *)bsearch(&key,
        NAMED_ENTITIES, sizeof NAMED_ENTITIES / sizeof *NAMED_ENTITIES,
        sizeof *NAMED_ENTITIES, cmp_n);

    return entity ? entity[1] : NULL;
}

/*
 * Bounded strchr(): returns a pointer to the first occurrence of <chr> in
 * the first <src_size> bytes of <src>, or NULL if there is none. The scan
 * also ends at a '\0' byte; reaching one without a match yields NULL (unless
 * <chr> itself is 0, in which case the '\0' is the match).
 */
static const char* strchr_n(const char* src, size_t src_size, int chr)
{
    for (size_t i = 0; i < src_size; ++i)
    {
        if ((int)src[i] == chr)
        {
            return &src[i];
        }

        /* End of string before the size limit: nothing more to look at. */
        if (src[i] == '\0')
        {
            break;
        }
    }

    return NULL;
}

/*https://stackoverflow.com/questions/7457163/what-is-the-implementation-of-strtol*/
/*
 * Length-bounded counterpart of strtoul(): converts the digits found in the
 * first <nptr_len> bytes of <nptr> and never reads past that limit.
 *
 * - Leading white space is skipped.
 * - <base> must be 0 or 2..36; 0 is treated as 10 (no "0x" / "0" prefix
 *   detection). Any other value sets errno to EINVAL (where defined) and
 *   returns 0 without touching *endptr.
 * - No sign character is accepted.
 * - If <endptr> is not NULL, *endptr receives a pointer to the first byte
 *   that was not converted; this is <nptr> itself when no digit was found.
 * - On overflow errno is set to ERANGE and ULONG_MAX is returned; the
 *   remaining digits inside the limit are still consumed so that *endptr
 *   points behind the whole number.
 */
static unsigned long
strtoul_n(const char *restrict nptr, size_t nptr_len, char **restrict endptr, int base) {
    if (base < 0 || base == 1 || base > 36) {
#ifdef EINVAL /* errno value defined by POSIX */
        errno = EINVAL;
#endif
        return 0UL;
    }

    if (base == 0)
        base = 10;

    const char *const limit = nptr + nptr_len;
    const char *p = nptr;
    const char *endp = nptr;

    /* <ctype.h> functions require a value representable as unsigned char. */
    while (p < limit && isspace((unsigned char)*p))
        ++p;

    const unsigned long ubase = (unsigned long)base;
    /* Largest value that can still take one more digit, and the largest
     * digit allowed once that value has been reached. */
    const unsigned long cutoff = ULONG_MAX / ubase;
    const unsigned long cutlim = ULONG_MAX % ubase;

    unsigned long n = 0UL;
    _Bool overflow = 0;

    for (; p < limit; ++p) {
        const unsigned char ch = (unsigned char)*p;
        unsigned long digit;

        if (ch >= '0' && ch <= '9')
            digit = (unsigned long)(ch - '0');
        else if (ch >= 'a' && ch <= 'z')
            digit = (unsigned long)(ch - 'a') + 10UL;
        else if (ch >= 'A' && ch <= 'Z')
            digit = (unsigned long)(ch - 'A') + 10UL;
        else
            break;

        if (digit >= ubase)
            break;

        endp = p + 1;

        /* After an overflow keep consuming digits, but stop accumulating. */
        if (overflow)
            continue;

        if (n > cutoff || (n == cutoff && digit > cutlim)) {
            overflow = 1;
            continue;
        }

        n = n * ubase + digit;
    }

    if (endptr) *endptr = (char *)endp;

    if (overflow) {
        errno = ERANGE;
        return ULONG_MAX;
    }

    return n;
}

/*
 * Length-bounded variant of the character reference parser. <current> points
 * at a '&' and *curr_size is the number of input bytes available from
 * <current> on.
 *
 * Numeric references ("&#NNN;" / "&#xHHH;") are converted to UTF-8 unless the
 * resulting byte sequence equals one of the entries of <unsafe_symbs>; in
 * that case the reference text is copied through unchanged. Named references
 * are resolved through get_named_entity_n().
 *
 * <unsafe_symbs> is a list of NUL-terminated strings stored back to back and
 * ended by an empty string; NULL is treated as an empty list.
 *
 * On success the output is written at *to, *to is advanced past it, *from is
 * set just past the terminating ';', *curr_size is reduced by the length of
 * the consumed reference and 1 is returned. On failure 0 is returned and
 * nothing is modified.
 */
static _Bool parse_entity_wo_unsafe_symbols_n(
    const char *current, size_t* curr_size,
    char **to, const char **from,
    const char* unsafe_symbs)
{
    const char *end = strchr_n(current, *curr_size, ';');
    if(!end) return 0;

    /* strchr_n() also ends its scan at a '\0' byte; only a real ';' can
     * terminate a reference. */
    if(*end != ';') return 0;

    /* Length of the whole reference, '&' and ';' included. */
    size_t ref_len = (size_t)(end - current) + 1;

    /* The shortest numeric reference is "&#N;" (4 bytes). */
    if(ref_len >= 4 && current[1] == '#')
    {
        /* The shortest hexadecimal reference is "&#xH;" (5 bytes). */
        _Bool hex = ref_len >= 5 && (current[2] == 'x' || current[2] == 'X');
        const char *digits = current + (hex ? 3 : 2);
        char *tail = NULL;
        int errno_save = errno;

        errno = 0;
        /* Only the bytes between the prefix and the ';' are candidates. */
        unsigned long cp = strtoul_n(
            digits, (size_t)(end - digits), &tail, hex ? 16 : 10);

        _Bool fail = errno || tail != end || cp > UNICODE_MAX;
        errno = errno_save;

        if(fail) return 0;

        /* Encode into scratch space first. When decoding in place, *to may
         * overlap the reference text, and that text has to stay intact in
         * case it must be copied through unchanged. UTF-8 needs at most
         * 4 bytes for cp <= UNICODE_MAX; the buffer is sized generously. */
        char utf8[8];
        size_t utf8_len = putc_utf8(cp, utf8);

        _Bool unsafe = 0;
        if(unsafe_symbs)
        {
            size_t symb_len;
            for(const char *symb = unsafe_symbs;
                (symb_len = strlen(symb)) != 0;
                symb += symb_len + 1)
            {
                if(symb_len == utf8_len && memcmp(utf8, symb, utf8_len) == 0)
                {
                    unsafe = 1;
                    break;
                }
            }
        }

        if(unsafe)
        {
            /* Keep the reference exactly as written, ';' included. */
            memmove(*to, current, ref_len);
            *to += ref_len;
        }
        else
        {
            memcpy(*to, utf8, utf8_len);
            *to += utf8_len;
        }

        *from = end + 1;
        *curr_size -= ref_len;
        return 1;
    }

    /* NOTE(review): named entities are not checked against <unsafe_symbs>,
     * so e.g. "&lt;" is decoded even when "<" is listed - confirm that this
     * is intended. */
    /* The name spans from the byte after '&' up to and including ';'. */
    const char *entity = get_named_entity_n(&current[1], ref_len - 1);
    if(!entity) return 0;

    size_t len = strlen(entity);
    memcpy(*to, entity, len);

    *to += len;
    *from = end + 1;
    *curr_size -= ref_len;

    return 1;
}

/*
 * Length-bounded decoder: processes <src_size> bytes of <src> and writes the
 * result to <dest>, replacing every character reference accepted by
 * parse_entity_wo_unsafe_symbols_n(). A '&' that does not start an accepted
 * reference is copied as is. When <src> is NULL the bytes in <dest> are
 * processed in place. No terminating '\0' is appended; the number of bytes
 * written is returned.
 */
size_t decode_html_entities_utf8_wo_unsafe_symbols_n(char *dest, const char *src,
    size_t src_size, const char* unsafe_symbs)
{
    const char *in = src ? src : dest;
    char *out = dest;
    const char *amp;

    /* src_size always holds the number of input bytes left from <in> on. */
    while((amp = strchr_n(in, src_size, '&')) != NULL)
    {
        /* Plain text in front of the '&' is moved over untouched. */
        size_t plain_len = (size_t)(amp - in);
        memmove(out, in, plain_len);
        out += plain_len;
        src_size -= plain_len;

        /* On success the parser advances out, in and src_size itself. */
        if(parse_entity_wo_unsafe_symbols_n(amp, &src_size, &out, &in, unsafe_symbs))
            continue;

        /* Not a reference: emit the '&' itself and resume after it. */
        *out++ = *amp;
        in = amp + 1;
        src_size -= 1;
    }

    /* Whatever is left after the last '&' is copied verbatim. */
    memmove(out, in, src_size);
    out += src_size;

    return (size_t)(out - dest);
}
19 changes: 17 additions & 2 deletions entities.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,23 @@ extern size_t decode_html_entities_utf8(char *dest, const char *src);
If <src> is <NULL>, input will be taken from <dest>, decoding
the entities in-place.

The function returns the length of the decoded string.
The function returns the size of the decoded string.
*/

#endif
extern size_t decode_html_entities_utf8_wo_unsafe_symbols(char *dest, const char *src,
const char* unsafe_symbs);
/* Takes one more parameter, <unsafe_symbs>: a list of NUL-terminated
strings stored back to back and ended by an empty string (i.e. by two
consecutive '\0'). A numeric character reference whose UTF-8 encoding
equals one of these strings is not decoded; it is copied to <dest>
unchanged. Named entities are not checked against the list.

*/

extern size_t decode_html_entities_utf8_wo_unsafe_symbols_n(char *dest, const char *src,
size_t src_size, const char* unsafe_symbs);
/* Takes one more parameter, <src_size>: the number of bytes of <src>
that are to be processed, i.e. the size of <src>.

<src> does not have to be NUL-terminated, and no terminating '\0' is
written to <dest>. The function returns the number of bytes written.
*/

#endif // DECODE_HTML_ENTITIES_UTF8_

Loading