From d97ccc5a05c916b419a590b994ecf718213af2a6 Mon Sep 17 00:00:00 2001 From: Mathieu Fehr Date: Fri, 9 Jan 2026 17:09:48 +0000 Subject: [PATCH 1/4] Add lexing of numbers This is the last piece needed to fully lex MLIR files. --- Test/Lexer.lean | 60 +++++++++++++++++++++++++++++++++ Veir/Parser/Lexer.lean | 76 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) diff --git a/Test/Lexer.lean b/Test/Lexer.lean index 7865406..bb91c94 100644 --- a/Test/Lexer.lean +++ b/Test/Lexer.lean @@ -339,3 +339,63 @@ def lexResult (s : String) : String := -/ #guard_msgs in #eval lexResult "^ foo" + +/-- + info: "Value: `0`, Kind: intLit" +-/ +#guard_msgs in +#eval lexResult "0x" + +/-- + info: "Value: `0`, Kind: intLit" +-/ +#guard_msgs in +#eval lexResult "0xi" + +/-- + info: "Value: `0`, Kind: intLit" +-/ +#guard_msgs in +#eval lexResult "0x0123456789abcdefABCDEF" + +/-- + info: "Value: `42`, Kind: intLit" +-/ +#guard_msgs in +#eval lexResult "42" + +/-- + info: "Value: `42.`, Kind: floatLit" +-/ +#guard_msgs in +#eval lexResult "42." + +/-- + info: "Value: `42.0`, Kind: floatLit" +-/ +#guard_msgs in +#eval lexResult "42.0" + +/-- + info: "Value: `3.1415926`, Kind: floatLit" +-/ +#guard_msgs in +#eval lexResult "3.1415926" + +/-- + info: "Value: `34.e0`, Kind: floatLit" +-/ +#guard_msgs in +#eval lexResult "34.e0" + +/-- + info: "Value: `34.e-23`, Kind: floatLit" +-/ +#guard_msgs in +#eval lexResult "34.e-23" + +/-- + info: "Value: `34.e12`, Kind: floatLit" +-/ +#guard_msgs in +#eval lexResult "34.e12" diff --git a/Veir/Parser/Lexer.lean b/Veir/Parser/Lexer.lean index 4f2e99b..c6f9a3c 100644 --- a/Veir/Parser/Lexer.lean +++ b/Veir/Parser/Lexer.lean @@ -315,6 +315,80 @@ def lexPrefixedIdentifier (state : LexerState) (tokStart : Nat) else .error errorString +/-- + Lex a number literal. + + integer-literal ::= digit+ | `0x` hex_digit+ + float-literal ::= [0-9]+[.][0-9]*([eE][-+]?[0-9]+)? +-/ +def lexNumber (state : LexerState) (firstChar: UInt8) : Token × LexerState := Id.run do + let mut pos := state.pos + let input := state.input + + if h: state.isEof then + return (state.mkToken .intLit state.pos, state) + else + let c1 := input[pos]'(by grind [LexerState.isEof]) + -- Hexadecimal literal + if firstChar == '0'.toUInt8 && c1 == 'x'.toUInt8 then + -- In MLIR, `0xi` is parsed as `0` followed by `xi`. + if !(input.data.getD (pos + 1) 0).isHexDigit then + return (state.mkToken .intLit state.pos, state) + pos := pos + 2 + while h: pos < input.size do + let c := input[pos] + if c.isHexDigit then + pos := pos + 1 + else + break + let newState := { state with pos := pos } + return (newState.mkToken .intLit state.pos, newState) + + -- Handle the normal decimal case, with the starting digits + while h: pos < input.size do + let c := input[pos] + if UInt8.isDigit c then + pos := pos + 1 + else + break + + -- Check for fractional part + if input.data.getD pos 0 != '.'.toUInt8 then + let newState := { state with pos := pos } + return (newState.mkToken .intLit state.pos, newState) + pos := pos + 1 + + -- Parse the fractional digits + while h: pos < input.size do + let c := input[pos] + if UInt8.isDigit c then + pos := pos + 1 + else + break + + -- Check for exponent part + if input.data.getD pos 0 != 'e'.toUInt8 && input.data.getD pos 0 != 'E'.toUInt8 then + let newState := { state with pos := pos } + return (newState.mkToken .floatLit state.pos, newState) + pos := pos + 1 + + if (input.data.getD pos 0).isDigit || + ((input.data.getD pos 0 == '+'.toUInt8 || input.data.getD pos 0 == '-'.toUInt8) && + (input.data.getD (pos + 1) 0).isDigit) then + pos := pos + 1 + while h: pos < input.size do + let c := input[pos] + if UInt8.isDigit c then + pos := pos + 1 + else + break + let newState := { state with pos := pos } + return (newState.mkToken .floatLit state.pos, newState) + else + let newState := { state with pos := pos } + return (newState.mkToken .floatLit state.pos, newState) + + /-- Lex the next token from the input. -/ @@ -447,6 +521,8 @@ partial def lex (state : LexerState) : Except String (Token × LexerState) := else if c == '!'.toUInt8 then let newState := { state with pos := state.pos + 1 } lexPrefixedIdentifier newState tokStart .exclamationIdent + else if UInt8.isDigit c then + return lexNumber state c else .error s!"Unexpected character '{Char.ofUInt8 c}' at position {state.pos}" From 65c72bec93167b502e6ea159f73ebc74ca7aa3f1 Mon Sep 17 00:00:00 2001 From: Mathieu Fehr Date: Fri, 9 Jan 2026 23:29:33 +0000 Subject: [PATCH 2/4] Fix a test, and make sure number lexing is linear --- Test/Lexer.lean | 2 +- Veir/Parser/Lexer.lean | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/Test/Lexer.lean b/Test/Lexer.lean index bb91c94..f67457e 100644 --- a/Test/Lexer.lean +++ b/Test/Lexer.lean @@ -353,7 +353,7 @@ def lexResult (s : String) : String := #eval lexResult "0xi" /-- - info: "Value: `0`, Kind: intLit" + info: "Value: `0x0123456789abcdefABCDEF`, Kind: intLit" -/ #guard_msgs in #eval lexResult "0x0123456789abcdefABCDEF" diff --git a/Veir/Parser/Lexer.lean b/Veir/Parser/Lexer.lean index c6f9a3c..61039c5 100644 --- a/Veir/Parser/Lexer.lean +++ b/Veir/Parser/Lexer.lean @@ -321,19 +321,19 @@ def lexPrefixedIdentifier (state : LexerState) (tokStart : Nat) integer-literal ::= digit+ | `0x` hex_digit+ float-literal ::= [0-9]+[.][0-9]*([eE][-+]?[0-9]+)? -/ -def lexNumber (state : LexerState) (firstChar: UInt8) : Token × LexerState := Id.run do +def lexNumber (state : LexerState) (firstChar : UInt8) (tokStart : Nat) : Token × LexerState := Id.run do let mut pos := state.pos let input := state.input if h: state.isEof then - return (state.mkToken .intLit state.pos, state) + return (state.mkToken .intLit tokStart, state) else let c1 := input[pos]'(by grind [LexerState.isEof]) -- Hexadecimal literal if firstChar == '0'.toUInt8 && c1 == 'x'.toUInt8 then -- In MLIR, `0xi` is parsed as `0` followed by `xi`. - if !(input.data.getD (pos + 1) 0).isHexDigit then - return (state.mkToken .intLit state.pos, state) + if !(input.getD (pos + 1) 0).isHexDigit then + return (state.mkToken .intLit tokStart, state) pos := pos + 2 while h: pos < input.size do let c := input[pos] @@ -342,7 +342,7 @@ def lexNumber (state : LexerState) (firstChar: UInt8) : Token × LexerState := I else break let newState := { state with pos := pos } - return (newState.mkToken .intLit state.pos, newState) + return (newState.mkToken .intLit tokStart, newState) -- Handle the normal decimal case, with the starting digits while h: pos < input.size do @@ -353,9 +353,9 @@ def lexNumber (state : LexerState) (firstChar: UInt8) : Token × LexerState := I break -- Check for fractional part - if input.data.getD pos 0 != '.'.toUInt8 then + if input.getD pos 0 != '.'.toUInt8 then let newState := { state with pos := pos } - return (newState.mkToken .intLit state.pos, newState) + return (newState.mkToken .intLit tokStart, newState) pos := pos + 1 -- Parse the fractional digits @@ -367,14 +367,14 @@ def lexNumber (state : LexerState) (firstChar: UInt8) : Token × LexerState := I break -- Check for exponent part - if input.data.getD pos 0 != 'e'.toUInt8 && input.data.getD pos 0 != 'E'.toUInt8 then + if input.getD pos 0 != 'e'.toUInt8 && input.getD pos 0 != 'E'.toUInt8 then let newState := { state with pos := pos } - return (newState.mkToken .floatLit state.pos, newState) + return (newState.mkToken .floatLit tokStart, newState) pos := pos + 1 - if (input.data.getD pos 0).isDigit || - ((input.data.getD pos 0 == '+'.toUInt8 || input.data.getD pos 0 == '-'.toUInt8) && - (input.data.getD (pos + 1) 0).isDigit) then + if (input.getD pos 0).isDigit || + ((input.getD pos 0 == '+'.toUInt8 || input.getD pos 0 == '-'.toUInt8) && + (input.getD (pos + 1) 0).isDigit) then pos := pos + 1 while h: pos < input.size do let c := input[pos] @@ -383,10 +383,10 @@ def lexNumber (state : LexerState) (firstChar: UInt8) : Token × LexerState := I else break let newState := { state with pos := pos } - return (newState.mkToken .floatLit state.pos, newState) + return (newState.mkToken .floatLit tokStart, newState) else let newState := { state with pos := pos } - return (newState.mkToken .floatLit state.pos, newState) + return (newState.mkToken .floatLit tokStart, newState) /-- @@ -522,7 +522,8 @@ partial def lex (state : LexerState) : Except String (Token × LexerState) := let newState := { state with pos := state.pos + 1 } lexPrefixedIdentifier newState tokStart .exclamationIdent else if UInt8.isDigit c then - return lexNumber state c + let newState := { state with pos := state.pos + 1 } + return lexNumber newState c tokStart else .error s!"Unexpected character '{Char.ofUInt8 c}' at position {state.pos}" From de05055d9d25987b2e2f30b2878bf172ba75917e Mon Sep 17 00:00:00 2001 From: Mathieu Fehr Date: Fri, 9 Jan 2026 23:30:12 +0000 Subject: [PATCH 3/4] Add missing util --- Veir/ForLean.lean | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Veir/ForLean.lean b/Veir/ForLean.lean index 5a2a8b7..d45f410 100644 --- a/Veir/ForLean.lean +++ b/Veir/ForLean.lean @@ -16,6 +16,12 @@ def UInt8.isDigit (c : UInt8) : Bool := def UInt8.isHexDigit (c : UInt8) : Bool := c.isDigit || (c >= 'a'.toUInt8 && c <= 'f'.toUInt8) || (c >= 'A'.toUInt8 && c <= 'F'.toUInt8) +@[inline] +def ByteArray.getD (ba : ByteArray) (i : Nat) (default : UInt8) : UInt8 := + if h : i < ba.size then ba[i] else default + +end + set_option warn.sorry false in @[grind .] theorem Array.back_popWhile {as : Array α} {p : α → Bool} (h : 0 < (as.popWhile p).size) : From f3138a19a3e65ac5cf7e8c3d4fb8112e917874e8 Mon Sep 17 00:00:00 2001 From: Mathieu Fehr Date: Fri, 9 Jan 2026 23:34:32 +0000 Subject: [PATCH 4/4] Remove autocompleted line --- Veir/ForLean.lean | 2 -- 1 file changed, 2 deletions(-) diff --git a/Veir/ForLean.lean b/Veir/ForLean.lean index d45f410..5e98b3b 100644 --- a/Veir/ForLean.lean +++ b/Veir/ForLean.lean @@ -20,8 +20,6 @@ def UInt8.isHexDigit (c : UInt8) : Bool := def ByteArray.getD (ba : ByteArray) (i : Nat) (default : UInt8) : UInt8 := if h : i < ba.size then ba[i] else default -end - set_option warn.sorry false in @[grind .] theorem Array.back_popWhile {as : Array α} {p : α → Bool} (h : 0 < (as.popWhile p).size) :