From f4a4aa03d0485def1424e45ba3def6460d2313a5 Mon Sep 17 00:00:00 2001
From: StunxFS <mendozadiazjose77@gmail.com>
Date: Tue, 19 Nov 2024 16:04:12 -0400
Subject: [PATCH] tokenizer: read_number

---
 .gitignore                   |   1 +
 src/compiler/mod.v           |   3 +-
 src/compiler/report/mod.v    |  17 +++-
 src/compiler/tokenizer/mod.v | 182 ++++++++++++++++++++++++++++++++++-
 src/compiler/util/mod.v      |   7 +-
 5 files changed, 199 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 45e500965..78aa53e57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 fut-out/*.h
 fut-out/*.c
 rivetc
+tokenizer.py
\ No newline at end of file
diff --git a/src/compiler/mod.v b/src/compiler/mod.v
index 2d2ce32c8..d8828bf63 100644
--- a/src/compiler/mod.v
+++ b/src/compiler/mod.v
@@ -6,11 +6,10 @@ module compiler
 
 import compiler.context
 import compiler.report
-
 import compiler.tokenizer as _
 
 pub fn run(args []string) {
 	mut c_ctx := &context.CContext{
-		options: context.parse_args(args) or { report.error(err.msg()) }
+		options: context.parse_args(args) or { report.ic_error(err.msg()) } // exit 101, no ICE panic
 	}
 }
diff --git a/src/compiler/report/mod.v b/src/compiler/report/mod.v
index 1d7d7f4f5..c1de4ce3a 100644
--- a/src/compiler/report/mod.v
+++ b/src/compiler/report/mod.v
@@ -5,6 +5,7 @@
 module report
 
 import term
+import compiler.token
 
 enum MsgLevel {
 	note
@@ -32,22 +33,30 @@ fn format_msg(msg string, level MsgLevel) string {
 }
 
 @[inline]
-pub fn note(msg string) {
+pub fn ic_note(msg string) { // writes `msg`, formatted at `.note` level, to stderr
 	eprintln(format_msg(msg, .note))
 }
 
 @[inline]
-pub fn warn(msg string) {
+pub fn ic_warn(msg string) { // writes `msg`, formatted at `.warn` level, to stderr
 	eprintln(format_msg(msg, .warn))
 }
 
 @[noreturn]
-pub fn error(msg string) {
+pub fn ic_error(msg string) { // prints `msg` at `.error` level, then exits with status 101
 	eprintln(format_msg(msg, .error))
 	exit(101)
 }
 
 @[noreturn]
-pub fn ic_error(msg string) {
+pub fn ic_fatal(msg string) { // panics with `msg` formatted at `.ice` level
 	panic(format_msg(msg, .ice))
 }
+
+pub fn error(msg string, pos token.Pos) {} // TODO: stub; `msg` and `pos` are discarded for now
+
+pub fn warn(msg string, pos token.Pos) {} // TODO: stub; `msg` and `pos` are discarded for now
+
+pub fn note(msg string) {} // TODO: stub; `msg` is discarded for now
+
+pub fn help(msg string) {} // TODO: stub; `msg` is discarded for now
diff --git a/src/compiler/tokenizer/mod.v b/src/compiler/tokenizer/mod.v
index df0f233e7..535925cb5 100644
--- a/src/compiler/tokenizer/mod.v
+++ b/src/compiler/tokenizer/mod.v
@@ -7,11 +7,12 @@ module tokenizer
 import compiler.context
 import compiler.token
 import compiler.util
+import compiler.report
 
 const lf = 10
 const cr = 13
 const backslash = `\\`
-const num_sep = '_'
+const num_sep = `_` // digit separator checked for while scanning numeric literals
 
 fn is_new_line(ch u8) bool {
 	return ch in [cr, lf]
@@ -34,10 +35,10 @@ mut:
 	tidx       int = -1
 }
 
-pub fn from_file(ctx  &context.CContext, path string) &Tokenizer {
+pub fn from_file(ctx &context.CContext, path string) &Tokenizer {
 	mut t := &Tokenizer{
-		ctx: ctx
-		text:    util.read_file(path)
+		ctx:  ctx
+		text: util.read_file(path)
 	}
 	t.file = path
 	t.tokenize_remaining_text()
@@ -139,6 +140,179 @@ fn (t &Tokenizer) look_ahead(pos int) u8 {
 	}
 }
 
+// read_ident returns the run of name characters (per `util.is_valid_name`) that
+// begins at `t.pos`. On return `t.pos` sits one position before the first
+// character that is not part of the name.
+fn (mut t Tokenizer) read_ident() string {
+	first := t.pos
+	mut stop := first
+	// move `stop` forward while it still points at a valid name character
+	for stop < t.text.len && util.is_valid_name(t.text[stop]) {
+		stop++
+	}
+	// step back by one, mirroring the position fix-up used by the number scanner
+	t.pos = stop - 1
+	return t.text[first..stop]
+}
+
+enum NumberMode {
+	bin // chosen by `read_number` for a `0b` prefix
+	oct // chosen by `read_number` for a `0o` prefix
+	hex // chosen by `read_number` for a `0x` prefix
+	dec // chosen by `read_number` when no prefix matches
+}
+
+@[inline]
+fn (nm NumberMode) is_valid(c u8) bool { // reports whether `c` is a digit in the radix `nm`
+	return match nm {
+		.bin { c.is_bin_digit() }
+		.oct { c.is_oct_digit() }
+		.hex { c.is_hex_digit() }
+		.dec { c.is_digit() }
+	}
+}
+
+@[inline]
+fn (nm NumberMode) str() string { // radix name; `read_number_` interpolates it into diagnostics
+	return match nm {
+		.bin { 'binary' }
+		.oct { 'octal' }
+		.hex { 'hexadecimal' }
+		.dec { 'decimal' }
+	}
+}
+
+// read_number_ scans a `mode` literal at `t.pos`; returns its text with `t.pos` on the last byte.
+fn (mut t Tokenizer) read_number_(mode NumberMode) string {
+	start := t.pos
+	if mode != .dec {
+		t.pos += 2 // skip '0x', '0b', '0o'
+	}
+	if t.pos < t.text.len && t.current_char() == num_sep {
+		report.error('separator `_` is only valid between digits in a numeric literal',
+			t.current_pos())
+	}
+	for t.pos < t.text.len {
+		ch := t.current_char()
+		if ch == num_sep && t.text[t.pos - 1] == num_sep {
+			report.error('cannot use `_` consecutively in a numeric literal', t.current_pos())
+		}
+		if !mode.is_valid(ch) && ch != num_sep {
+			if mode == .dec && (!ch.is_letter() || ch in [`e`, `E`]) {
+				break
+			} else if !ch.is_digit() && !ch.is_letter() {
+				break
+			}
+			report.error('${mode} number has unsuitable digit `${ch.ascii_str()}`',
+				t.current_pos())
+		}
+		t.pos++
+	}
+	if t.text[t.pos - 1] == num_sep {
+		t.pos--
+		report.error('cannot use `_` at the end of a numeric literal', t.current_pos())
+	}
+	if mode != .dec && start + 2 == t.pos {
+		t.pos--
+		report.error('number part of this ${mode} is not provided', t.current_pos())
+		t.pos++
+	}
+	if mode == .dec {
+		mut call_method := false // `true` for, e.g., 5.method(), 5.5.method(), 5e5.method()
+		mut is_range := false // `true` for, e.g., 5..10
+		// fractional part
+		if t.pos < t.text.len && t.text[t.pos] == `.` {
+			t.pos++
+			if t.pos < t.text.len {
+				// 16.6, 16.6.str()
+				if t.text[t.pos].is_digit() {
+					for t.pos < t.text.len {
+						c := t.text[t.pos]
+						if !c.is_digit() {
+							if !c.is_letter() || c in [`e`, `E`] {
+								// 16.6.str()
+								if c == `.` && t.pos + 1 < t.text.len
+									&& t.text[t.pos + 1].is_letter() {
+									call_method = true
+								}
+								break
+							}
+							report.error('number has unsuitable digit `${c.ascii_str()}`', t.current_pos())
+						}
+						t.pos++ // advance past the digit (or the bad character just reported)
+					}
+				} else if t.text[t.pos] == `.` {
+					// 4.. a range
+					is_range = true
+					t.pos--
+				} else if t.text[t.pos] in [`e`, `E`] {
+					// 6.e6
+				} else if t.text[t.pos].is_letter() {
+					// 16.str()
+					call_method = true
+					t.pos--
+				} else {
+					// 5.
+					t.pos--
+					report.error('float literals should have a digit after the decimal point',
+						t.current_pos())
+					fl := t.text[start..t.pos] // digits before the dot; the dot itself is excluded
+					report.help('use `${fl}.0` instead of `${fl}.`')
+					t.pos++
+				}
+			}
+		}
+		// exponential part
+		mut has_exp := false
+		if t.pos < t.text.len && t.text[t.pos] in [`e`, `E`] {
+			has_exp = true
+			t.pos++
+			if t.pos < t.text.len && t.text[t.pos] in [`-`, `+`] {
+				t.pos++
+			}
+			for t.pos < t.text.len {
+				c := t.text[t.pos]
+				if !c.is_digit() {
+					if !c.is_letter() {
+						// 6e6.str()
+						if c == `.` && t.pos + 1 < t.text.len && t.text[t.pos + 1].is_letter() {
+							call_method = true
+						}
+						break
+					}
+					report.error('this number has unsuitable digit `${c.ascii_str()}`', t.current_pos())
+				}
+				t.pos++
+			}
+		}
+		if t.text[t.pos - 1] in [`e`, `E`] {
+			t.pos--
+			report.error('exponent has no digits', t.current_pos())
+			t.pos++
+		} else if t.pos < t.text.len && t.text[t.pos] == `.` && !is_range && !call_method {
+			t.pos--
+			if has_exp {
+				report.error('exponential part should be integer', t.current_pos())
+			} else {
+				report.error('too many decimal points in number', t.current_pos())
+			}
+			t.pos++
+		}
+	}
+	lit := t.text[start..t.pos]
+	t.pos-- // fix pos
+	return lit
+}
+
+fn (mut t Tokenizer) read_number() string { // picks the radix from the prefix at `t.pos`
+	return t.read_number_(match true {
+		t.matches('0b', t.pos) { .bin }
+		t.matches('0o', t.pos) { .oct }
+		t.matches('0x', t.pos) { .hex }
+		else { .dec } // no prefix matched: scan as decimal
+	})
+}
+
 fn (mut t Tokenizer) next() token.Token {
 	for {
 		cidx := t.tidx
diff --git a/src/compiler/util/mod.v b/src/compiler/util/mod.v
index e3e5faa41..3349c2fa9 100644
--- a/src/compiler/util/mod.v
+++ b/src/compiler/util/mod.v
@@ -7,10 +7,15 @@ module util
 import os
 import compiler.report
 
+@[inline]
+pub fn is_valid_name(c u8) bool { // true for `_` and for any byte `c.is_alnum()` accepts
+	return c == `_` || c.is_alnum()
+}
+
 pub fn read_file(path string) string {
 	return skip_bom(os.read_file(path) or {
-		// we use `ic_error` because this should not happen
-		report.ic_error(err.msg())
+		// we use `ic_fatal` because this should not happen
+		report.ic_fatal(err.msg())
 	})
 }