From f4a4aa03d0485def1424e45ba3def6460d2313a5 Mon Sep 17 00:00:00 2001
From: StunxFS
Date: Tue, 19 Nov 2024 16:04:12 -0400
Subject: [PATCH] tokenizer: read_number

---
 .gitignore                   |   1 +
 src/compiler/mod.v           |   3 +-
 src/compiler/report/mod.v    |  21 +++-
 src/compiler/tokenizer/mod.v | 196 ++++++++++++++++++++++++++++++++++-
 src/compiler/util/mod.v      |  10 +-
 5 files changed, 219 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 45e500965..78aa53e57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ fut-out/*.h
 fut-out/*.c
 
 rivetc
+tokenizer.py
\ No newline at end of file
diff --git a/src/compiler/mod.v b/src/compiler/mod.v
index 2d2ce32c8..d8828bf63 100644
--- a/src/compiler/mod.v
+++ b/src/compiler/mod.v
@@ -6,11 +6,10 @@ module compiler
 
 import compiler.context
 import compiler.report
-
 import compiler.tokenizer as _
 
 pub fn run(args []string) {
 	mut c_ctx := &context.CContext{
-		options: context.parse_args(args) or { report.error(err.msg()) }
+		options: context.parse_args(args) or { report.ic_error(err.msg()) }
 	}
 }
diff --git a/src/compiler/report/mod.v b/src/compiler/report/mod.v
index 1d7d7f4f5..c1de4ce3a 100644
--- a/src/compiler/report/mod.v
+++ b/src/compiler/report/mod.v
@@ -5,6 +5,7 @@
 module report
 
 import term
+import compiler.token
 
 enum MsgLevel {
 	note
@@ -32,22 +33,34 @@ fn format_msg(msg string, level MsgLevel) string {
 }
 
 @[inline]
-pub fn note(msg string) {
+pub fn ic_note(msg string) {
 	eprintln(format_msg(msg, .note))
 }
 
 @[inline]
-pub fn warn(msg string) {
+pub fn ic_warn(msg string) {
 	eprintln(format_msg(msg, .warn))
 }
 
 @[noreturn]
-pub fn error(msg string) {
+pub fn ic_error(msg string) {
 	eprintln(format_msg(msg, .error))
 	exit(101)
 }
 
 @[noreturn]
-pub fn ic_error(msg string) {
+pub fn ic_fatal(msg string) {
 	panic(format_msg(msg, .ice))
 }
+
+// error takes a message and a source position. TODO: not implemented yet (no-op).
+pub fn error(msg string, pos token.Pos) {}
+
+// warn takes a message and a source position. TODO: not implemented yet (no-op).
+pub fn warn(msg string, pos token.Pos) {}
+
+// note takes a message without a position. TODO: not implemented yet (no-op).
+pub fn note(msg string) {}
+
+// help takes a message without a position. TODO: not implemented yet (no-op).
+pub fn help(msg string) {}
diff --git a/src/compiler/tokenizer/mod.v b/src/compiler/tokenizer/mod.v
index df0f233e7..535925cb5 100644
--- a/src/compiler/tokenizer/mod.v
+++ b/src/compiler/tokenizer/mod.v
@@ -7,11 +7,12 @@ module tokenizer
 import compiler.context
 import compiler.token
 import compiler.util
+import compiler.report
 
 const lf = 10
 const cr = 13
 const backslash = `\\`
-const num_sep = '_'
+const num_sep = `_`
 
 fn is_new_line(ch u8) bool {
 	return ch in [cr, lf]
@@ -34,10 +35,10 @@ mut:
 	tidx int = -1
 }
 
-pub fn from_file(ctx &context.CContext, path string) &Tokenizer {
+pub fn from_file(ctx &context.CContext, path string) &Tokenizer {
 	mut t := &Tokenizer{
-		ctx: ctx
-		text: util.read_file(path)
+		ctx:  ctx
+		text: util.read_file(path)
 	}
 	t.file = path
 	t.tokenize_remaining_text()
@@ -139,6 +140,193 @@ fn (t &Tokenizer) look_ahead(pos int) u8 {
 	}
 }
 
+// read_ident consumes characters while `util.is_valid_name` accepts them and
+// returns the consumed text; `t.pos` is left on the last consumed character.
+fn (mut t Tokenizer) read_ident() string {
+	start := t.pos
+	for t.pos < t.text.len {
+		c := t.text[t.pos]
+		if util.is_valid_name(c) {
+			t.pos++
+			continue
+		}
+		break
+	}
+	lit := t.text[start..t.pos]
+	t.pos--
+	return lit
+}
+
+// NumberMode is the base of the numeric literal being read.
+enum NumberMode {
+	bin
+	oct
+	hex
+	dec
+}
+
+// is_valid reports whether `c` is a digit in the base `nm`.
+@[inline]
+fn (nm NumberMode) is_valid(c u8) bool {
+	return match nm {
+		.bin { c.is_bin_digit() }
+		.oct { c.is_oct_digit() }
+		.hex { c.is_hex_digit() }
+		.dec { c.is_digit() }
+	}
+}
+
+// str returns the base name used in diagnostics ('binary', 'octal', ...).
+@[inline]
+fn (nm NumberMode) str() string {
+	return match nm {
+		.bin { 'binary' }
+		.oct { 'octal' }
+		.hex { 'hexadecimal' }
+		.dec { 'decimal' }
+	}
+}
+
+// read_number_ scans a numeric literal starting at `t.pos` in the given base and
+// returns its text. For non-decimal bases the two-character prefix is skipped
+// first; for decimal literals an optional fractional part and exponent are also
+// scanned. Malformed literals are reported through `report.error`, and `t.pos`
+// is left on the last character of the literal.
+fn (mut t Tokenizer) read_number_(mode NumberMode) string {
+	start := t.pos
+	if mode != .dec {
+		t.pos += 2 // skip '0x', '0b', '0o'
+	}
+	if t.pos < t.text.len && t.current_char() == num_sep {
+		report.error('separator `_` is only valid between digits in a numeric literal',
+			t.current_pos())
+	}
+	for t.pos < t.text.len {
+		ch := t.current_char()
+		if ch == num_sep && t.text[t.pos - 1] == num_sep {
+			report.error('cannot use `_` consecutively in a numeric literal', t.current_pos())
+		}
+		if !mode.is_valid(ch) && ch != num_sep {
+			if mode == .dec && (!ch.is_letter() || ch in [`e`, `E`]) {
+				break
+			} else if !ch.is_digit() && !ch.is_letter() {
+				break
+			}
+			report.error('${mode} number has unsuitable digit `${ch.ascii_str()}`',
+				t.current_pos())
+		}
+		t.pos++
+	}
+	if t.text[t.pos - 1] == num_sep {
+		t.pos--
+		report.error('cannot use `_` at the end of a numeric literal', t.current_pos())
+	}
+	if mode != .dec && start + 2 == t.pos {
+		t.pos--
+		report.error('number part of this ${mode} is not provided', t.current_pos())
+		t.pos++
+	}
+	if mode == .dec {
+		mut call_method := false // `true` for, e.g., 5.method(), 5.5.method(), 5e5.method()
+		mut is_range := false // `true` for, e.g., 5..10
+		// fractional part
+		if t.pos < t.text.len && t.text[t.pos] == `.` {
+			t.pos++
+			if t.pos < t.text.len {
+				// 16.6, 16.6.str()
+				if t.text[t.pos].is_digit() {
+					for t.pos < t.text.len {
+						c := t.text[t.pos]
+						if !c.is_digit() {
+							if !c.is_letter() || c in [`e`, `E`] {
+								// 16.6.str()
+								if c == `.` && t.pos + 1 < t.text.len
+									&& t.text[t.pos + 1].is_letter() {
+									call_method = true
+								}
+								break
+							} else {
+								report.error('number has unsuitable digit `${c.ascii_str()}`', t.current_pos())
+							}
+						}
+						// advance; without this the loop would never terminate
+						t.pos++
+					}
+				} else if t.text[t.pos] == `.` {
+					// 4.. a range
+					is_range = true
+					t.pos--
+				} else if t.text[t.pos] in [`e`, `E`] {
+					// 6.e6
+				} else if t.text[t.pos].is_letter() {
+					// 16.str()
+					call_method = true
+					t.pos--
+				} else {
+					// 5.
+					t.pos--
+					report.error('float literals should have a digit after the decimal point',
+						t.current_pos())
+					fl := t.text[start..t.pos]
+					report.help('use `${fl}.0` instead of `${fl}`')
+					t.pos++
+				}
+			}
+		}
+		// exponential part
+		mut has_exp := false
+		if t.pos < t.text.len && t.text[t.pos] in [`e`, `E`] {
+			has_exp = true
+			t.pos++
+			if t.pos < t.text.len && t.text[t.pos] in [`-`, `+`] {
+				t.pos++
+			}
+			for t.pos < t.text.len {
+				c := t.text[t.pos]
+				if !c.is_digit() {
+					if !c.is_letter() {
+						// 6e6.str()
+						if c == `.` && t.pos + 1 < t.text.len && t.text[t.pos + 1].is_letter() {
+							call_method = true
+						}
+						break
+					} else {
+						report.error('this number has unsuitable digit `${c.ascii_str()}`', t.current_pos())
+					}
+				}
+				t.pos++
+			}
+		}
+		if t.text[t.pos - 1] in [`e`, `E`] {
+			t.pos--
+			report.error('exponent has no digits', t.current_pos())
+			t.pos++
+		} else if t.pos < t.text.len && t.text[t.pos] == `.` && !is_range && !call_method {
+			t.pos--
+			if has_exp {
+				report.error('exponential part should be integer', t.current_pos())
+			} else {
+				report.error('too many decimal points in number', t.current_pos())
+			}
+			t.pos++
+		}
+	}
+	lit := t.text[start..t.pos]
+	t.pos-- // fix pos
+	return lit
+}
+
+// read_number detects the literal's base from its `0b`/`0o`/`0x` prefix
+// (decimal otherwise) and delegates to `read_number_`.
+fn (mut t Tokenizer) read_number() string {
+	return t.read_number_(match true {
+		t.matches('0b', t.pos) { .bin }
+		t.matches('0o', t.pos) { .oct }
+		t.matches('0x', t.pos) { .hex }
+		else { .dec }
+	})
+}
+
 fn (mut t Tokenizer) next() token.Token {
 	for {
 		cidx := t.tidx
diff --git a/src/compiler/util/mod.v b/src/compiler/util/mod.v
index e3e5faa41..3349c2fa9 100644
--- a/src/compiler/util/mod.v
+++ b/src/compiler/util/mod.v
@@ -7,10 +7,16 @@ module util
 import os
 import compiler.report
 
+// is_valid_name reports whether `c` is `_` or satisfies `c.is_alnum()`.
+@[inline]
+pub fn is_valid_name(c u8) bool {
+	return c == `_` || c.is_alnum()
+}
+
 pub fn read_file(path string) string {
 	return skip_bom(os.read_file(path) or {
-		// we use `ic_error` because this should not happen
-		report.ic_error(err.msg())
+		// we use `ic_fatal` because this should not happen
+		report.ic_fatal(err.msg())
 	})
 }
 