Skip to content

Commit

Permalink
begin tests for tokenizer code
Browse files Browse the repository at this point in the history
committing before fleshing out the API
  • Loading branch information
dormando committed Jul 26, 2024
1 parent d6c3bad commit 2c6dde6
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 69 deletions.
100 changes: 32 additions & 68 deletions mcmc.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
// TODO: if there's a parse error or unknown status code, we likely have a
// protocol desync and need to disconnect.

// NOTE: this _will_ change a bit for adding TLS support.

// A "reasonable" minimum buffer size to work with.
// Callers are allowed to create a buffer of any size larger than this.
// TODO: Put the math/documentation in here.
Expand Down Expand Up @@ -51,74 +49,28 @@ typedef struct mcmc_ctx {
// Build an index of where each space-delimited token starts; token lengths
// are not recorded here.
// The index is meant to stay small (<= cacheline) so later code can scan the
// request or feed pieces of it directly into APIs.
//
// t:    tokenizer to fill in (tokens[] offsets and ntokens).
// line: start of the text to tokenize.
// len:  number of bytes of 'line' to consider; clamped to TOKENIZER_MAXLEN.
// max:  stop recording new tokens once this many have been found.
//
// Always returns 0.
__attribute__((unused)) static int _mcmc_tokenize(mcmc_tokenizer_t *t, const char *line, size_t len, const int max) {
    // TODO: do we need to find the \r\n and remove that from len?

    // since multigets can be huge, we can't purely judge reqlen against this
    // limit, but we also can't index past it since the tokens are shorts.
    const size_t scanlen = (len > TOKENIZER_MAXLEN) ? TOKENIZER_MAXLEN : len;
    const char *const stop = line + scanlen;
    const char *cur = line;
    int found = 0;

    while (cur != stop) {
        // skip over delimiters until the first byte of a token shows up.
        if (*cur == ' ') {
            cur++;
            continue;
        }

        // remember where this token begins.
        t->tokens[found] = cur - line;
        found++;
        cur++;

        // advance over the rest of the token: stop at the next space or at
        // the end of the input, whichever comes first.
        while (cur != stop && *cur != ' ') {
            cur++;
        }

        // hit max tokens: 'cur' now sits right after the final token, which
        // is where the endcap goes.
        if (found == max) {
            break;
        }
    }

    // endcap token so we can quickly find the length of any token by looking
    // at the next one.
    t->tokens[found] = cur - line;
    t->ntokens = found;

    return 0;
}
// Plain (non-meta) tokenizing is routed through _mcmc_tokenize_meta with
// INT_MAX passed in the mstart position.
// NOTE(review): the macro's own 'mstart' argument does not appear in the
// expansion, so whatever the caller passes there is dropped -- confirm that
// is intended.
#define _mcmc_tokenize(t, line, len, mstart, max) _mcmc_tokenize_meta(t, line, len, INT_MAX, max)

// specialized tokenizer for meta protocol commands or responses, fills in the
// bitflags while scanning.
// FIXME: keep one function and set mstart to INTMAX if not meta?
__attribute__((unused)) static int _mcmc_tokenize_meta(mcmc_tokenizer_t *t, const char *line, size_t len, const int mstart, const int max) {
// TODO: do we need to find the \r\n and remove that from len?
// Function _assumes_ const char *line ends with \n or \r\n, but will not
// break so long as the passed in 'len' is reasonable.
__attribute__((unused)) MCMC_STATIC int _mcmc_tokenize_meta(mcmc_tokenizer_t *t, const char *line, size_t len, const int mstart, const int max) {
// FIXME: detect \r\n and reduce len
const char *s = line;

// since multigets can be huge, we can't purely judge reqlen against this
// limit, but we also can't index past it since the tokens are shorts.
if (len > TOKENIZER_MAXLEN) {
len = TOKENIZER_MAXLEN;
}

if (line[len-2] == '\r') {
len -= 2;
} else {
len -= 1;
}

const char *end = s + len;
int curtoken = 0;

Expand Down Expand Up @@ -170,15 +122,27 @@ __attribute__((unused)) static int _mcmc_tokenize_meta(mcmc_tokenizer_t *t, cons
return 0;
}

/*static int _process_token_len(mcp_parser_t *pr, size_t token) {
const char *s = pr->request + pr->tokens[token];
const char *e = pr->request + pr->tokens[token+1];
__attribute__((unused)) MCMC_STATIC int _mcmc_token_len(const char *line, mcmc_tokenizer_t *t, size_t token) {
const char *s = line + t->tokens[token];
const char *e = line + t->tokens[token+1];
// start of next token is after any space delimiters, so back those out.
while (*(e-1) == ' ') {
e--;
}
return e - s;
}*/
}

// Return a pointer to the first byte of token number 'token' in 'line'.
// The token is NOT NUL terminated; it runs until the next space delimiter
// or the end of the tokenized region.
//
// line:  the same buffer that was handed to the tokenizer.
// t:     tokenizer index previously filled in for 'line'.
// token: index of the token to fetch.
// len:   optional; when not NULL it receives the token's length in bytes,
//        in which case tokens[token+1] must also be valid.
__attribute__((unused)) MCMC_STATIC const char *_mcmc_token(const char *line, mcmc_tokenizer_t *t, size_t token, int *len) {
    const char *s = line + t->tokens[token];
    if (len != NULL) {
        const char *e = line + t->tokens[token+1];
        // the next entry starts after any space delimiters, so back those
        // out. Bounded by 's' so equal offsets never cause a read in front
        // of the token.
        while (e > s && *(e-1) == ' ') {
            e--;
        }
        *len = (int)(e - s);
    }
    return s;
}

static int _mcmc_parse_value_line(const char *buf, size_t read, mcmc_resp_t *r) {
// we know that "VALUE " has matched, so skip that.
Expand Down Expand Up @@ -505,7 +469,7 @@ static int _mcmc_parse_response(const char *buf, size_t read, mcmc_resp_t *r) {
// hardware boost.
// just need to split the mul and the add? if (__builtin_mul_overflow(etc))
// - need a method to force compile both functions for the test suite.
int mcmc_toktou32(const char *t, size_t len, uint32_t *out) {
MCMC_STATIC int mcmc_toktou32(const char *t, size_t len, uint32_t *out) {
uint32_t sum = 0;
const char *pos = t;
// We clamp the possible length to make input length errors less likely to
Expand All @@ -530,7 +494,7 @@ int mcmc_toktou32(const char *t, size_t len, uint32_t *out) {
return 0;
}

int mcmc_toktou64(const char *t, size_t len, uint64_t *out) {
MCMC_STATIC int mcmc_toktou64(const char *t, size_t len, uint64_t *out) {
uint64_t sum = 0;
const char *pos = t;
if (len > MCMC_TOKTO64_MAX) {
Expand All @@ -553,7 +517,7 @@ int mcmc_toktou64(const char *t, size_t len, uint64_t *out) {
return 0;
}

int mcmc_tokto32(const char *t, size_t len, int32_t *out) {
MCMC_STATIC int mcmc_tokto32(const char *t, size_t len, int32_t *out) {
int32_t sum = 0;
const char *pos = t;
int is_sig = 0;
Expand Down Expand Up @@ -591,7 +555,7 @@ int mcmc_tokto32(const char *t, size_t len, int32_t *out) {
return 0;
}

int mcmc_tokto64(const char *t, size_t len, int64_t *out) {
MCMC_STATIC int mcmc_tokto64(const char *t, size_t len, int64_t *out) {
int64_t sum = 0;
const char *pos = t;
int is_sig = 0;
Expand Down
14 changes: 14 additions & 0 deletions mcmc.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@
#include <sys/uio.h>
#include <stdint.h>

// Allow exposing normally static functions to a test suite running in a
// different module.
#ifdef MCMC_TEST
#define MCMC_STATIC
#else
#define MCMC_STATIC static
#endif

#define MCMC_OK 0
#define MCMC_ERR -1
#define MCMC_NOT_CONNECTED 1
Expand Down Expand Up @@ -124,8 +132,14 @@ int mcmc_request_writev(void *c, const struct iovec *iov, int iovcnt, ssize_t *s
int mcmc_disconnect(void *c);
void mcmc_get_error(void *c, char *code, size_t clen, char *msg, size_t mlen);

// Prototypes for mcmc.c functions that are declared MCMC_STATIC there.
// They are only visible when MCMC_TEST is defined, which is also when
// MCMC_STATIC expands to nothing, so a test suite in another module can
// call them.
#ifdef MCMC_TEST
int mcmc_toktou32(const char *t, size_t len, uint32_t *out);
int mcmc_toktou64(const char *t, size_t len, uint64_t *out);
int mcmc_tokto32(const char *t, size_t len, int32_t *out);
int mcmc_tokto64(const char *t, size_t len, int64_t *out);
int _mcmc_tokenize_meta(mcmc_tokenizer_t *t, const char *line, size_t len, const int mstart, const int max);
int _mcmc_token_len(const char *line, mcmc_tokenizer_t *t, size_t token);
const char *_mcmc_token(const char *line, mcmc_tokenizer_t *t, size_t token, int *len);
#endif

#endif // MCMC_HEADER
2 changes: 1 addition & 1 deletion tests/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PREFIX=/usr/local

all:
gcc -g -Wall -Werror -pedantic -o main main.c ../mcmc.c
gcc -g -Wall -Werror -pedantic -DMCMC_TEST=1 -o main main.c ../mcmc.c

clean:
rm -f main main.o
Expand Down
86 changes: 86 additions & 0 deletions tests/main.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,92 @@
#include "utest.h"
#define MCMC_TEST 1 // FIXME: re-run bear and remove this.
#include "../mcmc.h"

// token check: the expected length and text of a single token.
struct mc_tc {
int len; // expected length, compared against _mcmc_token_len()
char *tok; // expected text; only the first 'len' bytes are compared
};

#define MAX_TC 100
// Fixture for the tokenizer tests: the tokenizer under test plus the
// expectations that the fixture teardown verifies.
struct mc_tokenize {
mcmc_tokenizer_t t; // filled in by the tokenizer call in each test
int res; // result from tokenizer run
int ntokens; // expected value of t.ntokens
uint64_t metaflags; // expected value of t.metaflags
const char *line; // the line that was tokenized
int llen; // length that was passed to the tokenizer for 'line'
int ntc; // number of entries in tc[] to verify
struct mc_tc tc[MAX_TC]; // per-token expectations, indexed by token number
};

// Zero the whole fixture so that any field a test does not set explicitly
// (for example metaflags) reads as 0 when the teardown checks it.
UTEST_F_SETUP(mc_tokenize) {
// TODO: check if utest is doing this for us.
memset(utest_fixture, 0, sizeof(*utest_fixture));
}

UTEST_F_TEARDOWN(mc_tokenize) {
if (utest_fixture->res == 0) {
ASSERT_EQ(utest_fixture->t.ntokens, utest_fixture->ntokens);
ASSERT_EQ(utest_fixture->t.metaflags, utest_fixture->metaflags);
for (int x = 0; x < utest_fixture->ntc; x++) {
struct mc_tc tc = utest_fixture->tc[x];
ASSERT_EQ(_mcmc_token_len(utest_fixture->line, &utest_fixture->t, x), tc.len);
ASSERT_STRNEQ(tc.tok, _mcmc_token(utest_fixture->line, &utest_fixture->t, x, NULL), tc.len);
}
}
// else assume the main utest is doing some validation.
}

// Store a tokenizer run in the fixture so the shared teardown can verify it.
//   n: number of entries of 'c' to check.
//   k: expected token count.
// Relies on locals named line, llen, c and res in the calling test body.
// Expands to a single statement; callers supply the trailing semicolon.
#define M(n, k) \
    do { \
        utest_fixture->line = line; \
        utest_fixture->llen = llen; \
        utest_fixture->ntc = (n); \
        utest_fixture->ntokens = (k); \
        memcpy(utest_fixture->tc, c, sizeof(c)); \
        utest_fixture->res = res; \
    } while (0)

// a basic ascii set command line with five tokens.
UTEST_F(mc_tokenize, asciiset) {
    const char *line = "set foo 5 10 2\r\n";
    int llen = strlen(line);
    struct mc_tc c[5] = {
        {3, "set"}, {3, "foo"}, {1, "5"}, {2, "10"}, {1, "2"},
    };

    int res = _mcmc_tokenize_meta(&utest_fixture->t, line, llen, 999, MCMC_PARSER_MAX_TOKENS-1);
    M(5, 5);
}

// a basic ascii get command line with two tokens.
UTEST_F(mc_tokenize, asciiget) {
    const char *line = "get foobar\r\n";
    int llen = strlen(line);
    struct mc_tc c[2] = {
        {3, "get"}, {6, "foobar"},
    };
    int res = _mcmc_tokenize_meta(&utest_fixture->t, line, llen, 999, MCMC_PARSER_MAX_TOKENS-1);
    M(2, 2);
}

// give a shorter len than the string and ensure proper parsing.
// llen covers only "one two three". The tokenizer still drops what it
// assumes is the line terminator (a single byte here, since the byte before
// it is not '\r'), so the final token comes back as "thre".
UTEST_F(mc_tokenize, asciishort) {
    const char *line = "one two three four\r\n";
    int llen = strlen(line) - 7;
    struct mc_tc c[3] = {
        {3, "one"}, {3, "two"}, {4, "thre"},
    };
    int res = _mcmc_tokenize_meta(&utest_fixture->t, line, llen, 999, MCMC_PARSER_MAX_TOKENS-1);
    M(3, 3);
}

// TODO:
// - check meta lines
// - check what happens when garbage is given
// - add checks for mcmc_token_toetc functions

#undef M

#define MAX 1024
struct mc_valid {
mcmc_resp_t r;
Expand Down

0 comments on commit 2c6dde6

Please sign in to comment.