Skip to content

Commit

Permalink
Merge pull request #137 from openvenues/fix_address_parser_train
Browse files Browse the repository at this point in the history
Fix address_parser_train
  • Loading branch information
albarrentine authored Dec 12, 2016
2 parents d575cab + 8f1e699 commit bcf6b3c
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 19 deletions.
60 changes: 47 additions & 13 deletions src/address_parser_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
uint32_t i = 0;
char *str = NULL;

cstring_array *pairs = cstring_array_split(input, " ", 1, &count);
cstring_array *pairs = cstring_array_split_ignore_consecutive(input, " ", 1, &count);
size_t num_pairs = cstring_array_num_strings(pairs);

char *label = NULL;
Expand Down Expand Up @@ -62,23 +62,57 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
}

token.offset = pairs->indices->a[i];
token.len = last_separator_index;
size_t expected_len = last_separator_index;

scanner_t scanner = scanner_from_string(input + token.offset, token.len);
scanner_t scanner = scanner_from_string(input + token.offset, expected_len);
token.type = scan_token(&scanner);
if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
continue;
} else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
// shouldn't happen but just in case
continue;
token.len = scanner.cursor - scanner.start;

if (token.len == expected_len) {
if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
continue;
} else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
// shouldn't happen but just in case
continue;
} else {
uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
}

cstring_array_add_string(labels, label);

token_array_push(tokens, token);
} else {
uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
}
/* If normalizing the string turned one token into several (e.g. ½ => 1/2),
add every sub-token, with offset = (token.offset + sub_token.offset)
and the same label as the parent.
*/
token_array *sub_tokens = token_array_new();
if (sub_tokens == NULL) {
log_error("Error allocating sub-token array\n");
return false;
}
tokenize_add_tokens(sub_tokens, input + token.offset, expected_len, false);
for (size_t j = 0; j < sub_tokens->n; j++) {
token_t sub_token = sub_tokens->a[j];
// Add the offset of the parent "token"
sub_token.offset = token.offset + sub_token.offset;

if (ADDRESS_PARSER_IS_SEPARATOR(sub_token.type)) {
uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
continue;
} else if (ADDRESS_PARSER_IS_IGNORABLE(sub_token.type)) {
continue;
} else {
uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
}

cstring_array_add_string(labels, label);
token_array_push(tokens, sub_token);
}

cstring_array_add_string(labels, label);
}

token_array_push(tokens, token);
})

cstring_array_destroy(pairs);
Expand Down
8 changes: 8 additions & 0 deletions src/address_parser_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,14 @@ int main(int argc, char **argv) {

log_info("address dictionary module loaded\n");

// Needs to load for normalization
if (!transliteration_module_setup(NULL)) {
log_error("Could not load transliteration module\n");
exit(EXIT_FAILURE);
}

log_info("transliteration module loaded\n");

if (!geodb_module_setup(NULL)) {
log_error("Could not load geodb dictionaries\n");
exit(EXIT_FAILURE);
Expand Down
9 changes: 9 additions & 0 deletions src/address_parser_train.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "file_utils.h"
#include "geodb.h"
#include "shuffle.h"
#include "transliterate.h"

#include "log/log.h"

Expand Down Expand Up @@ -450,6 +451,14 @@ int main(int argc, char **argv) {

log_info("address dictionary module loaded\n");

// Needs to load for normalization
if (!transliteration_module_setup(NULL)) {
log_error("Could not load transliteration module\n");
exit(EXIT_FAILURE);
}

log_info("transliteration module loaded\n");

if (!geodb_module_setup(NULL)) {
log_error("Could not load geodb dictionaries\n");
exit(EXIT_FAILURE);
Expand Down
21 changes: 19 additions & 2 deletions src/string_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -829,17 +829,23 @@ inline int64_t cstring_array_token_length(cstring_array *self, uint32_t i) {
}
}

cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) {
static cstring_array *cstring_array_split_options(char *str, const char *separator, size_t separator_len, bool ignore_consecutive, size_t *count) {
*count = 0;
char_array *array = char_array_new_size(strlen(str));

bool last_was_separator = false;

while (*str) {
if ((separator_len == 1 && *str == separator[0]) || (memcmp(str, separator, separator_len) == 0)) {
char_array_push(array, '\0');
if (!ignore_consecutive || !last_was_separator) {
char_array_push(array, '\0');
}
str += separator_len;
last_was_separator = true;
} else {
char_array_push(array, *str);
str++;
last_was_separator = false;
}
}
char_array_push(array, '\0');
Expand All @@ -850,6 +856,17 @@ cstring_array *cstring_array_split(char *str, const char *separator, size_t sepa
return string_array;
}


/* Split str on separator (separator_len bytes long).
 *
 * Forwards to cstring_array_split_options with ignore_consecutive = false,
 * so a terminator is emitted for every separator occurrence, including
 * each one in a run of adjacent separators. count is handed through
 * unchanged for the helper to fill in.
 */
cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) {
    const bool collapse_runs = false;
    cstring_array *parts = cstring_array_split_options(str, separator, separator_len, collapse_runs, count);
    return parts;
}


/* Split str on separator (separator_len bytes long), collapsing runs.
 *
 * Forwards to cstring_array_split_options with ignore_consecutive = true,
 * so only the first separator in a run of adjacent separators emits a
 * terminator; the rest of the run is skipped. count is handed through
 * unchanged for the helper to fill in.
 */
cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count) {
    const bool collapse_runs = true;
    cstring_array *parts = cstring_array_split_options(str, separator, separator_len, collapse_runs, count);
    return parts;
}


cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count) {
*count = 0;
char *ptr = str;
Expand Down
2 changes: 2 additions & 0 deletions src/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ char **cstring_array_to_strings(cstring_array *self);

// Split on delimiter
cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count);
// Split on delimiter, treating a run of consecutive delimiters as a single delimiter
cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count);

// Split on delimiter by replacing (single character) separator with the NUL byte in the original string
cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count);
Expand Down
15 changes: 11 additions & 4 deletions src/tokens.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ tokenized_string_t *tokenized_string_new(void) {
self->str = NULL;
self->strings = cstring_array_new();
self->tokens = token_array_new();

return self;
}

Expand All @@ -21,7 +20,11 @@ tokenized_string_t *tokenized_string_new_size(size_t len, size_t num_tokens) {

/* Create a tokenized string holding a heap copy of (at most) the first
 * len bytes of src.
 *
 * src:        text to copy; duplicated with strndup, so the caller's buffer
 *             is not retained by the returned object
 * len:        maximum number of bytes copied from src; also forwarded to
 *             tokenized_string_new_size
 * num_tokens: forwarded to tokenized_string_new_size
 *
 * Returns the new object, or NULL when src is NULL or an allocation fails.
 * If the copy fails, the partially built object is released through
 * tokenized_string_destroy before returning.
 * NOTE(review): the copy in self->str is presumably freed by
 * tokenized_string_destroy -- confirm against its definition.
 */
inline tokenized_string_t *tokenized_string_new_from_str_size(char *src, size_t len, size_t num_tokens) {
    // strndup on a NULL pointer is undefined behaviour; reject it up front
    if (src == NULL) {
        return NULL;
    }

    tokenized_string_t *self = tokenized_string_new_size(len, num_tokens);
    if (self == NULL) {
        return NULL;
    }

    self->str = strndup(src, len);
    if (self->str == NULL) {
        tokenized_string_destroy(self);
        return NULL;
    }
    return self;
}

Expand All @@ -38,7 +41,11 @@ void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_

tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens, bool copy_tokens) {
tokenized_string_t *self = malloc(sizeof(tokenized_string_t));
self->str = src;
self->str = strdup(src);
if (self->str == NULL) {
tokenized_string_destroy(self);
return NULL;
}
self->strings = cstring_array_new_size(strlen(src) + tokens->n);
if (copy_tokens) {
self->tokens = token_array_new_copy(tokens, tokens->n);
Expand All @@ -48,7 +55,7 @@ tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens,

token_t token;

for (int i = 0; i < tokens->n; i++) {
for (size_t i = 0; i < tokens->n; i++) {
token = tokens->a[i];
cstring_array_add_string_len(self->strings, src + token.offset, token.len);
}
Expand Down

0 comments on commit bcf6b3c

Please sign in to comment.