diff --git a/src/compiler/ast/File.v b/src/compiler/ast/File.v new file mode 100644 index 000000000..a45925924 --- /dev/null +++ b/src/compiler/ast/File.v @@ -0,0 +1,63 @@ +// Copyright (C) 2024-present Jose Mendoza - All rights reserved. Use of this +// source code is governed by an MIT license that can be found in the LICENSE +// file. + +module ast + +import os + +@[heap] +pub struct File { +pub: + file string + content string +pub mut: + lines ?[]string +} + +pub fn File.new(file string) &File { + content := read_file(file) + return &File{ + file: file + content: content + } +} + +pub fn File.from_memory(content string) &File { + return &File{ + file: '' + content: content + } +} + +pub fn (mut file File) get_lines() []string { + if file.lines != none { + return file.lines + } + lines := file.content.split_into_lines() + file.lines = lines + return lines +} + +pub fn read_file(path string) string { + return skip_bom(os.read_file(path) or { + // we use `panic` because this should not happen + panic(err.msg()) + }) +} + +pub fn skip_bom(file_content string) string { + mut raw_text := file_content + // BOM check + if raw_text.len >= 3 { + unsafe { + c_text := raw_text.str + if c_text[0] == 0xEF && c_text[1] == 0xBB && c_text[2] == 0xBF { + // skip three BOM bytes + offset_from_begin := 3 + raw_text = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin) + } + } + } + return raw_text +} diff --git a/src/compiler/ast/SourceFile.v b/src/compiler/ast/SourceFile.v deleted file mode 100644 index ff47f4f50..000000000 --- a/src/compiler/ast/SourceFile.v +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2024-present Jose Mendoza - All rights reserved. Use of this -// source code is governed by an MIT license that can be found in the LICENSE -// file. - -module ast - -import compiler.util - -@[heap] -pub struct File { -pub: - file string - content string -pub mut: - lines ?[]string -} - -pub fn File.new(file string) &File { - content := util.read_file(file) - return &File{ - file: file - content: content - } -} - -pub fn File.from_memory(content string) &File { - return &File{ - file: '' - content: content - } -} - -pub fn (mut sf File) get_lines() []string { - if sf.lines != none { - return sf.lines - } - sf.lines = sf.content.split_into_lines() - return sf.lines? -} diff --git a/src/compiler/tokenizer/mod.v b/src/compiler/tokenizer/mod.v index 933c3e484..5dc167130 100644 --- a/src/compiler/tokenizer/mod.v +++ b/src/compiler/tokenizer/mod.v @@ -22,7 +22,8 @@ fn is_new_line(ch u8) bool { pub struct Tokenizer { ctx &context.CContext = unsafe { nil } mut: - file &ast.File = unsafe { nil } + file &ast.File = unsafe { nil } + text string line int last_nl_pos int pos int = -1 @@ -36,19 +37,21 @@ mut: pub fn from_file(ctx &context.CContext, file &ast.File) &Tokenizer { mut t := &Tokenizer{ - ctx: ctx - file: file - all_tokens: []token.Token{cap: file.content.len / 3} + ctx: ctx + file: file + all_tokens: []token.Token{cap: file.content.len / 3} } + t.text = file.content t.tokenize_remaining_text() return t } pub fn from_memory(ctx &context.CContext, text string) &Tokenizer { mut t := &Tokenizer{ - ctx: ctx - file: ast.File.from_memory(text) - all_tokens: []token.Token{cap: text.len / 3} + ctx: ctx + file: ast.File.from_memory(text) + text: text + all_tokens: []token.Token{cap: text.len / 3} } t.tokenize_remaining_text() return t @@ -66,7 +69,7 @@ fn (mut t Tokenizer) tokenize_remaining_text() { @[inline] fn (t &Tokenizer) current_char() u8 { - return t.file.content[t.pos] + return t.text[t.pos] } @[inline] @@ -86,13 +89,13 @@ fn (mut t Tokenizer) ignore_line() { @[inline] fn (mut t Tokenizer) eat_to_end_of_line() { - for t.pos < t.file.content.len && t.file.content[t.pos] != lf { + for t.pos < t.text.len && t.text[t.pos] != lf { t.pos++ } } fn (mut t Tokenizer) inc_line_number() { - t.last_nl_pos = int_min(t.file.content.len - 1, t.pos) + t.last_nl_pos = int_min(t.text.len - 1, t.pos) if t.is_cr_lf { t.last_nl_pos++ } @@ -100,7 +103,7 @@ fn (mut t Tokenizer) inc_line_number() { } fn (mut t Tokenizer) skip_whitespace() { - for t.pos < t.file.content.len { + for t.pos < t.text.len { c := t.current_char() if c == 8 { t.pos++ @@ -109,11 +112,10 @@ fn (mut t Tokenizer) skip_whitespace() { if !(c == 32 || (c > 8 && c < 14) || c == 0x85 || c == 0xA0) { return } - if t.pos + 1 < t.file.content.len && c == cr - && t.file.content[t.pos + 1] == lf { + if t.pos + 1 < t.text.len && c == cr && t.text[t.pos + 1] == lf { t.is_cr_lf = true } - if is_new_line(c) && !(t.pos > 0 && t.file.content[t.pos - 1] == cr && c == lf) { + if is_new_line(c) && !(t.pos > 0 && t.text[t.pos - 1] == cr && c == lf) { t.inc_line_number() } t.pos++ @@ -122,12 +124,11 @@ fn (mut t Tokenizer) skip_whitespace() { fn (t &Tokenizer) matches(want string, start_pos int) bool { end_pos := start_pos + want.len - if start_pos < 0 || end_pos < 0 || start_pos >= t.file.content.len - || end_pos > t.file.content.len { + if start_pos < 0 || end_pos < 0 || start_pos >= t.text.len || end_pos > t.text.len { return false } for pos in start_pos .. end_pos { - if t.file.content[pos] != want[pos - start_pos] { + if t.text[pos] != want[pos - start_pos] { return false } } @@ -143,8 +144,8 @@ fn (t &Tokenizer) peek_token(n int) token.Token { } fn (t &Tokenizer) look_ahead(pos int) u8 { - return if t.pos + pos < t.file.content.len { - t.file.content[t.pos + pos] + return if t.pos + pos < t.text.len { + t.text[t.pos + pos] } else { 0 } @@ -152,15 +153,15 @@ fn (t &Tokenizer) look_ahead(pos int) u8 { fn (mut t Tokenizer) read_ident() string { start := t.pos - for t.pos < t.file.content.len { - c := t.file.content[t.pos] + for t.pos < t.text.len { + c := t.text[t.pos] if util.is_valid_name(c) || c.is_digit() { t.pos++ continue } break } - lit := t.file.content[start..t.pos] + lit := t.text[start..t.pos] t.pos-- return lit } @@ -197,13 +198,13 @@ fn (mut t Tokenizer) read_number_mode(mode NumberMode) string { if mode != .dec { t.pos += 2 // skip '0x', '0b', '0o' } - if t.pos < t.file.content.len && t.current_char() == num_sep { + if t.pos < t.text.len && t.current_char() == num_sep { context.error('separator `_` is only valid between digits in a numeric literal', t.current_pos()) } - for t.pos < t.file.content.len { + for t.pos < t.text.len { ch := t.current_char() - if ch == num_sep && t.file.content[t.pos - 1] == num_sep { + if ch == num_sep && t.text[t.pos - 1] == num_sep { context.error('cannot use `_` consecutively in a numeric literal', t.current_pos()) } if !mode.is_valid(ch) && ch != num_sep { @@ -217,7 +218,7 @@ fn (mut t Tokenizer) read_number_mode(mode NumberMode) string { } t.pos++ } - if t.file.content[t.pos - 1] == num_sep { + if t.text[t.pos - 1] == num_sep { t.pos-- context.error('cannot use `_` at the end of a numeric literal', t.current_pos()) } @@ -230,18 +231,18 @@ fn (mut t Tokenizer) read_number_mode(mode NumberMode) string { mut call_method := false // `true` for, e.g., 5.method(), 5.5.method(), 5e5.method() mut is_range := false // `true` for, e.g., 5..10 // fractional part - if t.pos < t.file.content.len && t.file.content[t.pos] == `.` { + if t.pos < t.text.len && t.text[t.pos] == `.` { t.pos++ - if t.pos < t.file.content.len { + if t.pos < t.text.len { // 16.6, 16.6.str() - if t.file.content[t.pos].is_digit() { - for t.pos < t.file.content.len { - c := t.file.content[t.pos] + if t.text[t.pos].is_digit() { + for t.pos < t.text.len { + c := t.text[t.pos] if !c.is_digit() { if !c.is_letter() || c in [`e`, `E`] { // 16.6.str() - if c == `.` && t.pos + 1 < t.file.content.len - && t.file.content[t.pos + 1].is_letter() { + if c == `.` && t.pos + 1 < t.text.len + && t.text[t.pos + 1].is_letter() { call_method = true } break @@ -251,13 +252,13 @@ fn (mut t Tokenizer) read_number_mode(mode NumberMode) string { } t.pos++ } - } else if t.file.content[t.pos] == `.` { + } else if t.text[t.pos] == `.` { // 4.. a range is_range = true t.pos-- - } else if t.file.content[t.pos] in [`e`, `E`] { + } else if t.text[t.pos] in [`e`, `E`] { // 6.e6 - } else if t.file.content[t.pos].is_letter() { + } else if t.text[t.pos].is_letter() { // 16.str() call_method = true t.pos-- @@ -266,7 +267,7 @@ fn (mut t Tokenizer) read_number_mode(mode NumberMode) string { t.pos-- context.error('float literals should have a digit after the decimal point', t.current_pos()) - fl := t.file.content[start..t.pos] + fl := t.text[start..t.pos] context.help('use `${fl}.0` instead of `${fl}`') t.pos++ } @@ -274,19 +275,18 @@ fn (mut t Tokenizer) read_number_mode(mode NumberMode) string { } // exponential part mut has_exp := false - if t.pos < t.file.content.len && t.file.content[t.pos] in [`e`, `E`] { + if t.pos < t.text.len && t.text[t.pos] in [`e`, `E`] { has_exp = true t.pos++ - if t.pos < t.file.content.len && t.file.content[t.pos] in [`-`, `+`] { + if t.pos < t.text.len && t.text[t.pos] in [`-`, `+`] { t.pos++ } - for t.pos < t.file.content.len { - c := t.file.content[t.pos] + for t.pos < t.text.len { + c := t.text[t.pos] if !c.is_digit() { if !c.is_letter() { // 6e6.str() - if c == `.` && t.pos + 1 < t.file.content.len - && t.file.content[t.pos + 1].is_letter() { + if c == `.` && t.pos + 1 < t.text.len && t.text[t.pos + 1].is_letter() { call_method = true } break @@ -297,12 +297,11 @@ fn (mut t Tokenizer) read_number_mode(mode NumberMode) string { t.pos++ } } - if t.file.content[t.pos - 1] in [`e`, `E`] { + if t.text[t.pos - 1] in [`e`, `E`] { t.pos-- context.error('exponent has no digits', t.current_pos()) t.pos++ - } else if t.pos < t.file.content.len && t.file.content[t.pos] == `.` - && !is_range && !call_method { + } else if t.pos < t.text.len && t.text[t.pos] == `.` && !is_range && !call_method { t.pos-- if has_exp { context.error('exponential part should be integer', t.current_pos()) @@ -312,7 +311,7 @@ fn (mut t Tokenizer) read_number_mode(mode NumberMode) string { t.pos++ } } - lit := t.file.content[start..t.pos] + lit := t.text[start..t.pos] t.pos-- // fix pos return lit } @@ -328,20 +327,19 @@ fn (mut t Tokenizer) read_number() string { fn (mut t Tokenizer) read_char() string { start := t.pos - // is_bytelit := t.pos > 0 && t.file.content[t.pos - 1] == `b` + // is_bytelit := t.pos > 0 && t.text[t.pos - 1] == `b` mut len := 0 for { t.pos++ - if t.pos >= t.file.content.len { + if t.pos >= t.text.len { break } if t.current_char() != backslash { len++ } double_slash := t.matches('\\\\', t.pos - 2) - if t.current_char() == `'` - && (t.file.content[t.pos - 1] != backslash || double_slash) { + if t.current_char() == `'` && (t.text[t.pos - 1] != backslash || double_slash) { if double_slash { len++ } @@ -350,7 +348,7 @@ fn (mut t Tokenizer) read_char() string { } len-- - ch := t.file.content[start + 1..t.pos] + ch := t.text[start + 1..t.pos] if len == 0 { context.error('empty character literal', t.current_pos()) } else if len != 1 { @@ -364,13 +362,13 @@ fn (mut t Tokenizer) read_string() string { start_pos := t.current_pos() start := t.pos start_char := t.current_char() - is_raw := t.pos > 0 && t.file.content[t.pos - 1] == `r` - // is_cstr := t.pos > 0 && t.file.content[t.pos - 1] == `c` + is_raw := t.pos > 0 && t.text[t.pos - 1] == `r` + // is_cstr := t.pos > 0 && t.text[t.pos - 1] == `c` mut backslash_count := if start_char == backslash { 1 } else { 0 } mut n_cr_chars := 0 for { t.pos++ - if t.pos >= t.file.content.len { + if t.pos >= t.text.len { t.pos = start context.error('unfinished string literal', start_pos) return '' @@ -395,7 +393,7 @@ fn (mut t Tokenizer) read_string() string { } mut lit := '' if start <= t.pos { - lit = t.file.content[start + 1..t.pos] + lit = t.text[start + 1..t.pos] if n_cr_chars > 0 { lit = lit.replace('\r', '') } @@ -431,7 +429,7 @@ fn (mut t Tokenizer) internal_next() token.Token { for { t.pos++ t.skip_whitespace() - if t.pos >= t.file.content.len { + if t.pos >= t.text.len { return t.token_eof() } pos := t.current_pos() @@ -466,19 +464,19 @@ fn (mut t Tokenizer) internal_next() token.Token { start_pos := t.pos mut nest_count := 1 t.pos++ - for nest_count > 0 && t.pos < t.file.content.len - 1 { + for nest_count > 0 && t.pos < t.text.len - 1 { t.pos++ - if t.pos >= t.file.content.len - 1 { + if t.pos >= t.text.len - 1 { old_pos := t.pos t.pos = start_pos context.error('unterminated multiline comment', t.current_pos()) t.pos = old_pos } - if t.file.content[t.pos] == lf { + if t.text[t.pos] == lf { t.inc_line_number() continue } - if t.matches('/*', t.pos) && t.file.content[t.pos + 2] != `/` { + if t.matches('/*', t.pos) && t.text[t.pos + 2] != `/` { nest_count++ continue } diff --git a/src/compiler/util/mod.v b/src/compiler/util/mod.v index 8eeed8e3c..f9e9225f2 100644 --- a/src/compiler/util/mod.v +++ b/src/compiler/util/mod.v @@ -4,32 +4,7 @@ module util -import os - @[inline] pub fn is_valid_name(c u8) bool { return c == `_` || c.is_letter() } - -pub fn read_file(path string) string { - return skip_bom(os.read_file(path) or { - // we use `panic` because this should not happen - panic(err.msg()) - }) -} - -pub fn skip_bom(file_content string) string { - mut raw_text := file_content - // BOM check - if raw_text.len >= 3 { - unsafe { - c_text := raw_text.str - if c_text[0] == 0xEF && c_text[1] == 0xBB && c_text[2] == 0xBF { - // skip three BOM bytes - offset_from_begin := 3 - raw_text = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin) - } - } - } - return raw_text -} diff --git a/test.ri b/test.ri index 8a30420d3..7f880c6a8 100644 --- a/test.ri +++ b/test.ri @@ -3,7 +3,6 @@ fn main 123 123.0 0x123abc 0b10101 0o1234 // Test file fn main -() /* multi-line comment usb - pc - laptop */