Fix #29: Fix 0x03 causing early EOF with tokeniser
TeamSpen210 committed Jun 24, 2024
1 parent 417eda0 commit 245a0a0
Showing 3 changed files with 50 additions and 46 deletions.
1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -21,6 +21,7 @@ Version (dev)
* Add :py:attr:`~srctools.tokenizer.Tokenizer.plus_operator`, allowing `+` to be
  parsed as an operator for FGDs but still be valid inside bare strings elsewhere.
  These are common in ``gameinfo.txt``.
+* Fix #29: Fix 0x03 causing early EOF with tokeniser.

--------------
Version 2.3.17
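The root cause: `CHR_EOF` was the in-band sentinel 0x03 (ETX), so a literal 0x03 byte in the input compared equal to the sentinel and tokenising stopped early. The change below reports EOF out of band instead, as a flag returned alongside each character. A minimal pure-Python sketch of the before/after pattern (names are illustrative, not the library's API):

```python
SENTINEL = 0x03  # ETX control character - but it may legally appear in input!

def next_char_buggy(buf: bytes, i: int) -> int:
    # A real 0x03 byte in buf is indistinguishable from end-of-file here.
    return buf[i] if i < len(buf) else SENTINEL

def next_char_fixed(buf: bytes, i: int) -> tuple[int, bool]:
    # EOF is a separate flag, so every byte value 0x00-0xFF remains valid input.
    if i < len(buf):
        return buf[i], False
    return 0x00, True  # the character is ignored once the flag is set

data = b'a\x03b'
out, i = [], 0
while True:
    ch, eof = next_char_fixed(data, i)
    if eof:
        break
    out.append(ch)
    i += 1
assert bytes(out) == data  # the buggy reader would have stopped after b'a'
```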
92 changes: 46 additions & 46 deletions src/srctools/_tokenizer.pyx
@@ -100,7 +100,6 @@ struct TokFlags {
# The number of characters to read from a file each time.
cdef enum:
    FILE_BUFFER = 1024
-    CHR_EOF = 0x03 # Indicate the end of the file.

# noinspection PyMissingTypeHints
cdef class BaseTokenizer:
@@ -488,11 +487,11 @@ cdef class Tokenizer(BaseTokenizer):
        # So pull off the first three characters, and if they don't match,
        # rebuild the cur_chunk to allow them.
        # The BOM is b'\xef\xbb\xbf'.
-        if self._next_char() != 0xef:
+        if self._next_char()[0] != 0xef:
            self.char_index -= 1
-        elif self._next_char() != 0xbb:
+        elif self._next_char()[0] != 0xbb:
            self.char_index -= 2
-        elif self._next_char() != 0xbf:
+        elif self._next_char()[0] != 0xbf:
            self.char_index -= 3

    @property
@@ -588,17 +587,17 @@ cdef class Tokenizer(BaseTokenizer):
    # We check all the getitem[] accesses, so don't have Cython recheck.
    @cython.boundscheck(False)
    @cython.wraparound(False)
-    cdef uchar _next_char(self) except? CHR_EOF:
-        """Return the next character, or 0 if no more characters are there."""
+    cdef (uchar, bint) _next_char(self):
+        """Return the next character, and a flag to indicate if more characters are present."""
        cdef str chunk
        cdef object chunk_obj

        self.char_index += 1
        if self.char_index < self.chunk_size:
-            return self.chunk_buf[self.char_index]
+            return self.chunk_buf[self.char_index], False

        if self.chunk_iter is None:
-            return CHR_EOF
+            return b'\x00', True

        if self.flags.file_input:
            try:
@@ -613,10 +612,10 @@
                raise ValueError('Expected string, got ' + type(self.cur_chunk).__name__)

            if self.chunk_size > 0:
-                return self.chunk_buf[0]
+                return self.chunk_buf[0], False
            else:
                self.chunk_iter = None
-                return CHR_EOF
+                return b'\x00', True

        # Retrieve a chunk from the iterable.
        # Skip empty chunks (shouldn't be there.)
@@ -630,7 +629,7 @@
            if chunk_obj is None:
                # Out of characters after empty chunks
                self.chunk_iter = None
-                return CHR_EOF
+                return b'\x00', True

            if isinstance(chunk_obj, bytes):
                raise ValueError('Cannot parse binary data!')
@@ -641,7 +640,7 @@
            self.cur_chunk = chunk_obj
            self.char_index = 0
            self.chunk_buf = <const uchar *>PyUnicode_AsUTF8AndSize(self.cur_chunk, &self.chunk_size)
-            return self.chunk_buf[0]
+            return self.chunk_buf[0], False

    def _get_token(self):
        """Compute the next token."""
@@ -653,6 +652,7 @@
            uchar next_char
            uchar escape_char
            uchar peek_char
+            bint is_eof
            int start_line
            bint ascii_only
            uchar decode[5]
@@ -666,12 +666,12 @@
            return output

        while True:
-            next_char = self._next_char()
+            next_char, is_eof = self._next_char()
            # First try simple operators & EOF.
-            if next_char == CHR_EOF:
+            if is_eof:
                return EOF_TUP

-            elif next_char == b'{':
+            if next_char == b'{':
                return BRACE_OPEN_TUP
            elif next_char == b'}':
                return BRACE_CLOSE_TUP
@@ -696,32 +696,32 @@
                self.flags.last_was_cr = False

            if next_char in b' \t':
-                # Ignore whitespace..
+                # Ignore whitespace...
                continue

            # Comments
            elif next_char == b'/':
                # The next must be another slash! (//)
-                next_char = self._next_char()
+                next_char, is_eof = self._next_char()
                if next_char == b'*': # /* comment.
                    if self.flags.allow_star_comments:
                        start_line = self.line_num
                        save_comments = self.flags.preserve_comments
                        while True:
-                            next_char = self._next_char()
-                            if next_char == CHR_EOF:
+                            next_char, is_eof = self._next_char()
+                            if is_eof:
                                raise self._error(
                                    f'Unclosed /* comment '
                                    f'(starting on line {start_line})!',
                                )
-                            elif next_char == b'\n':
+                            if next_char == b'\n':
                                self.line_num += 1
                                if save_comments:
                                    self.buf_add_char(next_char)
                            elif next_char == b'*':
                                # Check next, next character!
-                                peek_char = self._next_char()
-                                if peek_char == CHR_EOF:
+                                peek_char, is_eof = self._next_char()
+                                if is_eof:
                                    raise self._error(
                                        f'Unclosed /* comment '
                                        f'(starting on line {start_line})!',
@@ -745,8 +745,8 @@
                    # Skip to end of line.
                    save_comments = self.flags.preserve_comments
                    while True:
-                        next_char = self._next_char()
-                        if next_char == CHR_EOF or next_char == b'\n':
+                        next_char, is_eof = self._next_char()
+                        if is_eof or next_char == b'\n':
                            break
                        if save_comments:
                            self.buf_add_char(next_char)
@@ -770,10 +770,10 @@
                self.buf_reset()
                last_was_cr = False
                while True:
-                    next_char = self._next_char()
-                    if next_char == CHR_EOF:
+                    next_char, is_eof = self._next_char()
+                    if is_eof:
                        raise self._error('Unterminated string!')
-                    elif next_char == b'"':
+                    if next_char == b'"':
                        return STRING, self.buf_get_text()
                    elif next_char == b'\r':
                        self.line_num += 1
@@ -790,8 +790,8 @@

                    if next_char == b'\\' and self.flags.allow_escapes:
                        # Escape text
-                        escape_char = self._next_char()
-                        if escape_char == CHR_EOF:
+                        escape_char, is_eof = self._next_char()
+                        if is_eof:
                            raise self._error('Unterminated string!')

                        if escape_char == b'n':
@@ -819,18 +819,18 @@

                self.buf_reset()
                while True:
-                    next_char = self._next_char()
-                    if next_char == b'[':
-                        # Don't allow nesting, that's bad.
-                        raise self._error('Cannot nest [] brackets!')
-                    elif next_char == b']':
-                        return PROP_FLAG, self.buf_get_text()
+                    next_char, is_eof = self._next_char()
                    # Must be one line!
-                    elif next_char == CHR_EOF or next_char == b'\n':
+                    if is_eof or next_char == b'\n':
                        raise self._error(
                            'Reached end of line '
                            'without closing "]"!'
                        )
+                    if next_char == b'[':
+                        # Don't allow nesting, that's bad.
+                        raise self._error('Cannot nest [] brackets!')
+                    elif next_char == b']':
+                        return PROP_FLAG, self.buf_get_text()
                    self.buf_add_char(next_char)

            elif next_char == b']':
@@ -844,10 +844,10 @@
                # Parentheses around text...
                self.buf_reset()
                while True:
-                    next_char = self._next_char()
-                    if next_char == CHR_EOF:
+                    next_char, is_eof = self._next_char()
+                    if is_eof:
                        raise self._error('Unterminated parentheses!')
-                    elif next_char == b'(':
+                    if next_char == b'(':
                        raise self._error('Cannot nest () brackets!')
                    elif next_char == b')':
                        return PAREN_ARGS, self.buf_get_text()
@@ -863,8 +863,8 @@
                self.buf_reset()
                ascii_only = True
                while True:
-                    next_char = self._next_char()
-                    if next_char == CHR_EOF:
+                    next_char, is_eof = self._next_char()
+                    if is_eof:
                        # A directive could be the last value in the file.
                        if ascii_only:
                            return DIRECTIVE, self.buf_get_text()
@@ -907,8 +907,8 @@
                self.buf_reset()
                self.buf_add_char(next_char)
                while True:
-                    next_char = self._next_char()
-                    if next_char == CHR_EOF:
+                    next_char, is_eof = self._next_char()
+                    if is_eof:
                        # Bare names at the end are actually fine.
                        # It could be a value for the last prop.
                        return STRING, self.buf_get_text()
@@ -928,9 +928,9 @@
                # Add in a few more bytes, so we can decode the UTF8 fully.
                decode = [
                    next_char,
-                    self._next_char(),
-                    self._next_char(),
-                    self._next_char(),
+                    self._next_char()[0],
+                    self._next_char()[0],
+                    self._next_char()[0],
                    0x00,
                ]
                raise self._error(f'Unexpected characters "{decode[:4].decode("utf8", "backslashreplace")}"' '!')
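Every call site in `_get_token` now follows the same shape: unpack the pair once per loop, test the flag first, then match the character. Since `_next_char` returns a Cython C tuple `(uchar, bint)`, the pair is passed around as a plain struct rather than a Python object. A condensed, hypothetical pure-Python sketch of that caller pattern:

```python
def get_token(next_char):
    """Sketch only: next_char() -> (int, bool), mirroring the new _next_char."""
    while True:
        ch, is_eof = next_char()
        if is_eof:  # replaces the old `if ch == CHR_EOF:` comparison
            return ('EOF', None)
        if ch == ord('{'):
            return ('BRACE_OPEN', None)
        elif ch == ord('}'):
            return ('BRACE_CLOSE', None)
        # ... whitespace, comments, strings and brackets elided ...
```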
3 changes: 3 additions & 0 deletions tests/test_tokenizer.py
@@ -105,6 +105,7 @@
# Additional text not valid as a property.
noprop_parse_test = """
#letter_abcdefghijklmnopqrstuvwxyz_ABCDEFGHIJKLMNOPQRSTUVWXYZ
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
#fimport test
#EXclßÀde value\r
#caseA\u0345\u03a3test
@@ -114,6 +115,8 @@
noprop_parse_tokens = [
    T.NEWLINE,
    (T.DIRECTIVE, "letter_abcdefghijklmnopqrstuvwxyz_abcdefghijklmnopqrstuvwxyz"), T.NEWLINE,
+    # Test all control characters are valid.
+    (T.STRING, "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), T.NEWLINE,
    (T.DIRECTIVE, "fimport"), (T.STRING, "test"), T.NEWLINE,
    (T.DIRECTIVE, "exclssàde"), (T.STRING, "value"), T.NEWLINE,
    (T.DIRECTIVE, "casea\u03b9\u03c3test"), T.NEWLINE,
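A quick way to exercise the fix from Python, assuming the iteration API the tests above rely on (the tokenizer yields `(Token, value)` pairs):

```python
from srctools.tokenizer import Token, Tokenizer

# Before this commit the 0x03 byte was mistaken for CHR_EOF and tokenising
# stopped early; now the whole quoted string comes through intact.
tok = Tokenizer('"before\x03after"')
assert next(iter(tok)) == (Token.STRING, 'before\x03after')
```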
