diff --git a/common/scanner.h b/common/scanner.h index 2416d906..e7384099 100644 --- a/common/scanner.h +++ b/common/scanner.h @@ -84,10 +84,12 @@ enum TokenType { typedef struct { uint32_t len; uint32_t cap; - char *data; + uint16_t *data; } String; -static String string_new() { return (String){.cap = 16, .len = 0, .data = calloc(1, sizeof(char) * 17)}; } +static String string_new() { + return (String){.cap = 16, .len = 0, .data = calloc(17, sizeof(uint16_t))}; +} typedef struct { String word; @@ -114,6 +116,20 @@ typedef struct { typedef enum { Error, End } ScanContentResult; +static inline signed string_cmp(String a, String b) { + if (a.len != b.len) { + return a.len < b.len ? -1 : 1; + } + + for (uint32_t i = 0; i < a.len; i++) { + if (a.data[i] != b.data[i]) { + return a.data[i] < b.data[i] ? -1 : 1; + } + } + + return 0; +} + static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } @@ -124,13 +140,14 @@ static unsigned serialize(Scanner *scanner, char *buffer) { buffer[size++] = (char)scanner->open_heredocs.len; for (unsigned j = 0; j < scanner->open_heredocs.len; j++) { Heredoc *heredoc = &scanner->open_heredocs.data[j]; - if (size + 2 + heredoc->word.len >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + unsigned word_bytes = heredoc->word.len * sizeof(heredoc->word.data[0]); + if (size + 2 + word_bytes >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { return 0; } buffer[size++] = (char)heredoc->end_word_indentation_allowed; buffer[size++] = (char)heredoc->word.len; - memcpy(&buffer[size], heredoc->word.data, heredoc->word.len); - size += heredoc->word.len; + memcpy(&buffer[size], heredoc->word.data, word_bytes); + size += word_bytes; } return size; @@ -151,10 +168,11 @@ static void deserialize(Scanner *scanner, const char *buffer, unsigned length) { heredoc.end_word_indentation_allowed = buffer[size++]; heredoc.word = string_new(); uint8_t word_length = buffer[size++]; + unsigned word_bytes = word_length * sizeof(heredoc.word.data[0]); STRING_GROW(heredoc.word, word_length); - memcpy(heredoc.word.data, buffer + size, word_length); + memcpy(heredoc.word.data, buffer + size, word_bytes); heredoc.word.len = word_length; - size += word_length; + size += word_bytes; VEC_PUSH(scanner->open_heredocs, heredoc); } } @@ -182,7 +200,9 @@ static inline bool scan_whitespace(TSLexer *lexer) { } } -static inline bool is_valid_name_char(TSLexer *lexer) { return iswalnum(lexer->lookahead) || lexer->lookahead == '_'; } +static inline bool is_valid_name_char(TSLexer *lexer) { + return iswalnum(lexer->lookahead) || lexer->lookahead == '_' || lexer->lookahead >= 0x80; +} static inline bool is_escapable_sequence(TSLexer *lexer) { // Note: remember to also update the escape_sequence rule in the @@ -346,7 +366,7 @@ static bool scan_encapsed_part_string(Scanner *scanner, TSLexer *lexer, bool is_ case '\\': advance(lexer); - // \{ should not be interprented as an escape sequence, but both + // \{ should not be interpreted as an escape sequence, but both // should be consumed as normal characters if (lexer->lookahead == '{') { advance(lexer); @@ -499,7 +519,7 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { } String word = scan_heredoc_word(lexer); - if (strcmp(word.data, heredoc.word.data) != 0) { + if (string_cmp(word, heredoc.word) != 0) { STRING_FREE(word); return false; }