Skip to content

Commit

Permalink
- json: utf-8 validation optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
oknenavin committed May 18, 2024
1 parent 38e6238 commit 382365a
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 52 deletions.
148 changes: 112 additions & 36 deletions src/cxon/lang/common/cio/char.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ namespace cxon { namespace cio { namespace chr { // character conversion: read
-> enable_if_t<is_char_8<T>::value, int>;

// Forward declarations of utf8_check(i, e).
// The two trailing-return declarations are complementary overloads selected
// with enable_if_t on is_random_access_iterator<II>::value: one participates
// only for random-access iterators, the other only for all remaining
// iterator kinds. Both yield int.
// NOTE(review): this text is a diff rendering without +/- markers. The plain
// `int` declaration looks like the pre-change form, and the first
// trailing-return declaration shares its `template <typename II>` line with
// it — confirm against the repository before treating this as compilable.
template <typename II>
CXON_INLAY int utf8_check(II i, II e);
CXON_INLAY auto utf8_check(II i, II e) -> enable_if_t< is_random_access_iterator<II>::value, int>;
template <typename II>
CXON_INLAY auto utf8_check(II i, II e) -> enable_if_t<!is_random_access_iterator<II>::value, int>;

}}}

Expand Down Expand Up @@ -287,7 +289,9 @@ namespace cxon { namespace cio { namespace chr {
}

// utf8_check — overload enabled for random-access iterators.
//
// Tests whether the bytes starting at i form a well-formed multi-byte UTF-8
// sequence (Unicode Table 3-7) and returns its length in bytes: 2, 3 or 4.
// Returns 0 when the lead byte or a continuation byte is out of range, or
// when fewer bytes than the sequence needs remain before e.
//
// Each length class first checks the remaining distance once with `e - i`,
// then reads the continuation bytes directly with `*++i`.
//
// The single-byte case (< 0x80) is commented out, so such a lead byte also
// yields 0; presumably the caller handles ASCII before calling — confirm.
//
// NOTE(review): this is a diff rendering without +/- markers. Lines that use
// imp::head_/imp::tail_ inside this overload, and the `int`-returning
// signature line, look like pre-change remnants; they leave braces unbalanced.
// The declarations of c0, c1, c2, c3 fall in the elided hunk gap below —
// confirm against the repository.
template <typename II>
// NOTE(review): presumably the old signature (pre-change) — confirm.
CXON_INLAY int utf8_check(II i, II e) {
CXON_INLAY auto utf8_check(II i, II e)
-> enable_if_t< is_random_access_iterator<II>::value, int>
{
// http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf
// p41, Table 3-7. Well-Formed UTF-8 Byte Sequences
CXON_ASSERT(i != e, "unexpected");
Expand All @@ -297,47 +301,119 @@ namespace cxon { namespace cio { namespace chr {
//if (c0 < 0x80)
// return 1;
// 2
// NOTE(review): next four lines look like the pre-change 2-byte branch — confirm.
if (c0 >= 0xC2 && c0 <= 0xDF) {
c1 = imp::head_(i, e);
if (c1 >= 0x80 && c1 <= 0xBF)
return 2;
// Lead bytes below 0xE0: only 0xC2..0xDF start a valid 2-byte sequence.
if (c0 < 0xE0) {
// one bounds check up front; the read below is unchecked
if (e - i < 2) return 0;
if (c0 >= 0xC2 && c0 <= 0xDF) {
c1 = (unsigned char)*++i;
if (c1 >= 0x80 && c1 <= 0xBF)
return 2;
}
}
// 3
// NOTE(review): next four lines look like the pre-change 0xE0 branch — confirm.
else if (c0 == 0xE0) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e);
if (c1 >= 0xA0 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
// Lead bytes 0xE0..0xEF: 3-byte sequences.
else if (c0 < 0xF0) {
if (e - i < 3) return 0;
if (c0 == 0xE0) {
c1 = (unsigned char)*++i, c2 = (unsigned char)*++i;
// second byte restricted to 0xA0..0xBF: excludes overlong encodings
if (c1 >= 0xA0 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
}
else if (c0 >= 0xE1 && c0 <= 0xEC) {
c1 = (unsigned char)*++i, c2 = (unsigned char)*++i;
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
}
else if (c0 == 0xED) {
c1 = (unsigned char)*++i, c2 = (unsigned char)*++i;
// second byte restricted to 0x80..0x9F: excludes surrogates U+D800..U+DFFF
if (c1 >= 0x80 && c1 <= 0x9F && c2 >= 0x80 && c2 <= 0xBF)
return 3;
}
else if (c0 >= 0xEE && c0 <= 0xEF) {
c1 = (unsigned char)*++i, c2 = (unsigned char)*++i;
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
}
}
// NOTE(review): next four lines look like the pre-change 0xE1..0xEC branch — confirm.
else if (c0 >= 0xE1 && c0 <= 0xEC) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e);
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
// 4
// Lead bytes 0xF0..0xF7: only 0xF0..0xF4 start a valid 4-byte sequence.
else if (c0 < 0xF8) {
if (e - i < 4) return 0;
if (c0 == 0xF0) {
c1 = (unsigned char)*++i, c2 = (unsigned char)*++i, c3 = (unsigned char)*++i;
// second byte restricted to 0x90..0xBF: excludes overlong encodings
if (c1 >= 0x90 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
}
else if (c0 >= 0xF1 && c0 <= 0xF3) {
c1 = (unsigned char)*++i, c2 = (unsigned char)*++i, c3 = (unsigned char)*++i;
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
}
else if (c0 == 0xF4) {
c1 = (unsigned char)*++i, c2 = (unsigned char)*++i, c3 = (unsigned char)*++i;
// second byte restricted to 0x80..0x8F: nothing above U+10FFFF
if (c1 >= 0x80 && c1 <= 0x8F && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
}
}
// NOTE(review): next four lines look like the pre-change 0xED branch — confirm.
else if (c0 == 0xED) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e);
if (c1 >= 0x80 && c1 <= 0x9F && c2 >= 0x80 && c2 <= 0xBF)
return 3;
// no branch matched: not a well-formed multi-byte sequence
return 0;
}
// utf8_check — overload enabled for iterators that are not random-access.
//
// Tests whether the bytes starting at i form a well-formed multi-byte UTF-8
// sequence (Unicode Table 3-7) and returns its length in bytes: 2, 3 or 4.
// Returns 0 when no branch accepts the lead byte and the bytes after it.
//
// Unlike a distance-based check, no `e - i` test is made here; every
// following byte is obtained through imp::head_(i, e) / imp::tail_(i, e).
// NOTE(review): those helpers are defined elsewhere; presumably they advance
// i and guard against reaching e — confirm.
//
// The single-byte case (< 0x80) is commented out, so such a lead byte also
// yields 0; presumably the caller handles ASCII before calling — confirm.
//
// NOTE(review): this is a diff rendering without +/- markers. The flat
// `else if` branches marked below duplicate branches nested inside the
// `c0 < 0xF0` / `c0 < 0xF8` groups and leave braces unbalanced; they look
// like pre-change remnants — confirm against the repository.
template <typename II>
CXON_INLAY auto utf8_check(II i, II e)
-> enable_if_t<!is_random_access_iterator<II>::value, int>
{
// http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf
// p41, Table 3-7. Well-Formed UTF-8 Byte Sequences
CXON_ASSERT(i != e, "unexpected");
// lead byte, widened from unsigned char so the range tests are sign-safe
unsigned const c0 = (unsigned char)*i;
unsigned c1, c2, c3;
// 1
//if (c0 < 0x80)
// return 1;
// 2
// Lead bytes below 0xE0: only 0xC2..0xDF start a valid 2-byte sequence.
if (c0 < 0xE0) {
if (c0 >= 0xC2 && c0 <= 0xDF) {
c1 = imp::head_(i, e);
if (c1 >= 0x80 && c1 <= 0xBF)
return 2;
}
}
// NOTE(review): next four lines look like the pre-change 0xEE..0xEF branch — confirm.
else if (c0 >= 0xEE && c0 <= 0xEF) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e);
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
// 3
// Lead bytes 0xE0..0xEF: 3-byte sequences.
else if (c0 < 0xF0) {
if (c0 == 0xE0) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e);
// second byte restricted to 0xA0..0xBF: excludes overlong encodings
if (c1 >= 0xA0 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
}
else if (c0 >= 0xE1 && c0 <= 0xEC) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e);
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
}
else if (c0 == 0xED) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e);
// second byte restricted to 0x80..0x9F: excludes surrogates U+D800..U+DFFF
if (c1 >= 0x80 && c1 <= 0x9F && c2 >= 0x80 && c2 <= 0xBF)
return 3;
}
else if (c0 >= 0xEE && c0 <= 0xEF) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e);
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF)
return 3;
}
}
// 4
// NOTE(review): the next three flat branches (0xF0, 0xF1..0xF3, 0xF4) look
// like the pre-change 4-byte handling — confirm.
else if (c0 == 0xF0) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e), c3 = imp::tail_(i, e);
if (c1 >= 0x90 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
}
else if (c0 >= 0xF1 && c0 <= 0xF3) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e), c3 = imp::tail_(i, e);
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
}
else if (c0 == 0xF4) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e), c3 = imp::tail_(i, e);
if (c1 >= 0x80 && c1 <= 0x8F && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
// Lead bytes 0xF0..0xF7: only 0xF0..0xF4 start a valid 4-byte sequence.
else if (c0 < 0xF8) {
if (c0 == 0xF0) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e), c3 = imp::tail_(i, e);
// second byte restricted to 0x90..0xBF: excludes overlong encodings
if (c1 >= 0x90 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
}
else if (c0 >= 0xF1 && c0 <= 0xF3) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e), c3 = imp::tail_(i, e);
if (c1 >= 0x80 && c1 <= 0xBF && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
}
else if (c0 == 0xF4) {
c1 = imp::head_(i, e), c2 = imp::tail_(i, e), c3 = imp::tail_(i, e);
// second byte restricted to 0x80..0x8F: nothing above U+10FFFF
if (c1 >= 0x80 && c1 <= 0x8F && c2 >= 0x80 && c2 <= 0xBF && c3 >= 0x80 && c3 <= 0xBF)
return 4;
}
}
// no branch matched: not a well-formed multi-byte sequence
return 0;
}
Expand Down
32 changes: 16 additions & 16 deletions src/cxon/lang/common/cio/chcls.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -40,22 +40,22 @@ namespace cxon { namespace cio { namespace chr { // character classes
C8 = ALPH|REAL
};
// Table of character-class values (C0..C8) stored as unsigned char.
// NOTE(review): presumably indexed by byte value, 16 entries per row — under
// that reading 0x09/0x0A/0x0D map to C3, 0x0B/0x0C to C2, 0x20 to C4,
// '0'..'7' to C5, '8'..'9' to C6, 'A'..'F' and 'a'..'f' to C7, and the
// remaining letters to C8; confirm against the enum and the lookup code.
// NOTE(review): this is a diff rendering without +/- markers, so the 16 rows
// appear twice: first in the compact layout, then regrouped as 8 + 8 per row.
// The values in both copies are the same; a real table would hold only one
// copy (256 entries) — confirm against the repository.
static constexpr unsigned char is_x_[] = {
// compact layout (looks like the pre-change formatting)
C0,C0,C0,C0,C0,C0,C0,C0,C0,C3,C3,C2,C2,C3,C0,C0,
C0,C0,C0,C0,C0,C0,C0,C0,C0,C0,C0,C0,C0,C0,C0,C0,
C4,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,
C5,C5,C5,C5,C5,C5,C5,C5,C6,C6,C1,C1,C1,C1,C1,C1,
C1,C7,C7,C7,C7,C7,C7,C8,C8,C8,C8,C8,C8,C8,C8,C8,
C8,C8,C8,C8,C8,C8,C8,C8,C8,C8,C8,C1,C1,C1,C1,C1,
C1,C7,C7,C7,C7,C7,C7,C8,C8,C8,C8,C8,C8,C8,C8,C8,
C8,C8,C8,C8,C8,C8,C8,C8,C8,C8,C8,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,C1,00
// 8 + 8 layout (looks like the post-change formatting); the last entry is literal 00
C0,C0,C0,C0,C0,C0,C0,C0, C0,C3,C3,C2,C2,C3,C0,C0,
C0,C0,C0,C0,C0,C0,C0,C0, C0,C0,C0,C0,C0,C0,C0,C0,
C4,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,C1,
C5,C5,C5,C5,C5,C5,C5,C5, C6,C6,C1,C1,C1,C1,C1,C1,
C1,C7,C7,C7,C7,C7,C7,C8, C8,C8,C8,C8,C8,C8,C8,C8,
C8,C8,C8,C8,C8,C8,C8,C8, C8,C8,C8,C1,C1,C1,C1,C1,
C1,C7,C7,C7,C7,C7,C7,C8, C8,C8,C8,C8,C8,C8,C8,C8,
C8,C8,C8,C8,C8,C8,C8,C8, C8,C8,C8,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,C1,
C1,C1,C1,C1,C1,C1,C1,C1, C1,C1,C1,C1,C1,C1,C1,00
};

template <typename X>
Expand Down

0 comments on commit 382365a

Please sign in to comment.