diff --git a/common/scanner.h b/common/scanner.h
index 2416d906..e7384099 100644
--- a/common/scanner.h
+++ b/common/scanner.h
@@ -84,10 +84,12 @@ enum TokenType {
 typedef struct {
     uint32_t len;
     uint32_t cap;
-    char *data;
+    uint16_t *data;  // element buffer; len and cap count 16-bit elements, not bytes
 } String;
 
-static String string_new() { return (String){.cap = 16, .len = 0, .data = calloc(1, sizeof(char) * 17)}; }
+static String string_new() {  // empty String: len 0, cap 16, backed by 17 zero-filled elements
+    return (String){.cap = 16, .len = 0, .data = calloc(17, sizeof(uint16_t))};  // calloc result is not checked
+}
 
 typedef struct {
     String word;
@@ -114,6 +116,20 @@ typedef struct {
 
 typedef enum { Error, End } ScanContentResult;
 
+static inline signed string_cmp(String a, String b) {  // returns -1, 0 or 1; a shorter String orders first
+    if (a.len != b.len) {
+        return a.len < b.len ? -1 : 1;
+    }
+    // Lengths are equal here: the first differing element decides the order.
+    for (uint32_t i = 0; i < a.len; i++) {
+        if (a.data[i] != b.data[i]) {
+            return a.data[i] < b.data[i] ? -1 : 1;
+        }
+    }
+    // All len elements matched, so the two Strings are equal.
+    return 0;
+}
+
 static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
 
 static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
@@ -124,13 +140,14 @@ static unsigned serialize(Scanner *scanner, char *buffer) {
     buffer[size++] = (char)scanner->open_heredocs.len;
     for (unsigned j = 0; j < scanner->open_heredocs.len; j++) {
         Heredoc *heredoc = &scanner->open_heredocs.data[j];
-        if (size + 2 + heredoc->word.len >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
+        unsigned word_bytes = heredoc->word.len * sizeof(heredoc->word.data[0]);
+        if (size + 2 + word_bytes >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
             return 0;
         }
         buffer[size++] = (char)heredoc->end_word_indentation_allowed;
         buffer[size++] = (char)heredoc->word.len;
-        memcpy(&buffer[size], heredoc->word.data, heredoc->word.len);
-        size += heredoc->word.len;
+        memcpy(&buffer[size], heredoc->word.data, word_bytes);
+        size += word_bytes;
     }
 
     return size;
@@ -151,10 +168,11 @@ static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
         heredoc.end_word_indentation_allowed = buffer[size++];
         heredoc.word = string_new();
         uint8_t word_length = buffer[size++];
+        unsigned word_bytes = word_length * sizeof(heredoc.word.data[0]);
         STRING_GROW(heredoc.word, word_length);
-        memcpy(heredoc.word.data, buffer + size, word_length);
+        memcpy(heredoc.word.data, buffer + size, word_bytes);
         heredoc.word.len = word_length;
-        size += word_length;
+        size += word_bytes;
         VEC_PUSH(scanner->open_heredocs, heredoc);
     }
 }
@@ -182,7 +200,9 @@ static inline bool scan_whitespace(TSLexer *lexer) {
     }
 }
 
-static inline bool is_valid_name_char(TSLexer *lexer) { return iswalnum(lexer->lookahead) || lexer->lookahead == '_'; }
+static inline bool is_valid_name_char(TSLexer *lexer) {  // true if lookahead is iswalnum, '_', or >= 0x80
+    return iswalnum(lexer->lookahead) || lexer->lookahead == '_' || lexer->lookahead >= 0x80;
+}
 
 static inline bool is_escapable_sequence(TSLexer *lexer) {
     // Note: remember to also update the escape_sequence rule in the
@@ -346,7 +366,7 @@ static bool scan_encapsed_part_string(Scanner *scanner, TSLexer *lexer, bool is_
             case '\\':
                 advance(lexer);
 
-                // \{ should not be interprented as an escape sequence, but both
+                // \{ should not be interpreted as an escape sequence, but both
                 // should be consumed as normal characters
                 if (lexer->lookahead == '{') {
                     advance(lexer);
@@ -499,7 +519,7 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
         }
 
         String word = scan_heredoc_word(lexer);
-        if (strcmp(word.data, heredoc.word.data) != 0) {
+        if (string_cmp(word, heredoc.word) != 0) {
             STRING_FREE(word);
             return false;
         }