Improve error position reported by the lexer (#186)

makenowjust · web-flow · commit e50eded97e2f · 2024-10-29T22:14:20.000+09:00
This introduces two helper methods `errorfAtPosition` and
`panicfAtPosition`, and uses them for reporting errors caused by
tokenizing string-like literals and skipping comments.
diff --git a/lexer.go b/lexer.go
@@ -418,7 +418,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
 	for l.peekOk(i) {
 		if l.slice(i, i+len(q)) == q {
 			if len(content) == 0 && name == "identifier" {
-				l.panicf("invalid empty identifier")
+				l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+len(q)), "invalid empty identifier")
 			}
 			l.skipN(i + len(q))
 			return string(content)
@@ -428,7 +428,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
 		if c == '\\' {
 			i++
 			if !l.peekOk(i) {
-				l.panicf("invalid escape sequence: \\<eof>")
+				l.panicfAtPosition(token.Pos(l.pos+i-1), token.Pos(l.pos+i), "invalid escape sequence: \\<eof>")
 			}
 
 			c := l.peek(i)
@@ -457,65 +457,69 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
 			case '\\', '?', '"', '\'', '`':
 				content = append(content, c)
 			case 'x', 'X':
-				if !(l.peekOk(i+1) && char.IsHexDigit(l.peek(i)) && char.IsHexDigit(l.peek(i+1))) {
-					l.panicf("invalid escape sequence: hex escape sequence must be follwed by 2 hex digits")
+				for j := 0; j < 2; j++ {
+					if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) {
+						l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits")
+					}
 				}
 				u, err := strconv.ParseUint(l.slice(i, i+2), 16, 8)
 				if err != nil {
-					l.panicf("invalid escape sequence: %v", err)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err)
 				}
 				content = append(content, byte(u))
 				i += 2
 			case 'u', 'U':
 				if !unicode {
-					l.panicf("invalid escape sequence: \\%c is not allowed in %s", c, name)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c is not allowed in %s", c, name)
 				}
 				size := 4
 				if c == 'U' {
 					size = 8
 				}
 				for j := 0; j < size; j++ {
 					if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) {
-						l.panicf("invalid escape sequence: \\%c must be followed by %d hex digits", c, size)
+						l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: \\%c must be followed by %d hex digits", c, size)
 					}
 				}
 				u, err := strconv.ParseUint(l.slice(i, i+size), 16, 32)
 				if err != nil {
-					l.panicf("invalid escape sequence: %v", err)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: %v", err)
 				}
 				if 0xD800 <= u && u <= 0xDFFF || 0x10FFFF < u {
-					l.panicf("invalid escape sequence: invalid code point: U+%04X", u)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: invalid code point: U+%04X", u)
 				}
 				var buf [utf8.MaxRune]byte
 				n := utf8.EncodeRune(buf[:], rune(u))
 				content = append(content, buf[:n]...)
 				i += size
 			case '0', '1', '2', '3':
-				if !(l.peekOk(i+1) && char.IsOctalDigit(l.peek(i)) && char.IsOctalDigit(l.peek(i+1))) {
-					l.panicf("invalid escape sequence: octal escape sequence must be follwed by 3 octal digits")
+				for j := 0; j < 2; j++ {
+					if !(l.peekOk(i+j) && char.IsOctalDigit(l.peek(i+j))) {
+						l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits")
+					}
 				}
 				u, err := strconv.ParseUint(l.slice(i-1, i+2), 8, 8)
 				if err != nil {
-					l.panicf("invalid escape sequence: %v", err)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err)
 				}
 				content = append(content, byte(u))
 				i += 2
 			default:
-				l.panicf("invalid escape sequence: \\%c", c)
+				l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c", c)
 			}
 
 			continue
 		}
 
 		if c == '\n' && len(q) != 3 {
-			l.panicf("unclosed %s: newline appears in non triple-quoted", name)
+			l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+1), "unclosed %s: newline appears in non triple-quoted", name)
 		}
 
 		content = append(content, c)
 		i++
 	}
 
-	panic(l.errorf("unclosed %s", name))
+	panic(l.errorfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i), "unclosed %s", name))
 }
 
 func (l *Lexer) skipSpaces() {
@@ -543,6 +547,7 @@ func (l *Lexer) skipComment() {
 }
 
 func (l *Lexer) skipCommentUntil(end string, mustEnd bool) {
+	pos := token.Pos(l.pos)
 	for !l.eof() {
 		if l.slice(0, len(end)) == end {
 			l.skipN(len(end))
@@ -551,8 +556,7 @@ func (l *Lexer) skipCommentUntil(end string, mustEnd bool) {
 		l.skip()
 	}
 	if mustEnd {
-		// TODO: improve error position
-		l.panicf("unclosed comment")
+		l.panicfAtPosition(pos, token.Pos(l.pos), "unclosed comment")
 	}
 }
 
@@ -596,6 +600,17 @@ func (l *Lexer) errorf(msg string, param ...interface{}) *Error {
 	}
 }
 
+func (l *Lexer) errorfAtPosition(pos, end token.Pos, msg string, param ...interface{}) *Error {
+	return &Error{
+		Message:  fmt.Sprintf(msg, param...),
+		Position: l.Position(pos, end),
+	}
+}
+
 func (l *Lexer) panicf(msg string, param ...interface{}) {
 	panic(l.errorf(msg, param...))
 }
+
+func (l *Lexer) panicfAtPosition(pos, end token.Pos, msg string, param ...interface{}) {
+	panic(l.errorfAtPosition(pos, end, msg, param...))
+}
diff --git a/lexer_test.go b/lexer_test.go
@@ -143,26 +143,27 @@ var lexerTestCases = []struct {
 var lexerWrongTestCase = []struct {
 	source  string
 	pos     Pos
+	end     Pos
 	message string
 }{
-	{"\b", 0, "illegal input character: '\\b'"},
-	{`"foo`, 0, "unclosed string literal"},
-	{`R"foo`, 1, "unclosed raw string literal"},
-	{"'foo\n", 0, "unclosed string literal: newline appears in non triple-quoted"},
-	{"R'foo\n", 1, "unclosed raw string literal: newline appears in non triple-quoted"},
-	{"R'foo\\", 1, "invalid escape sequence: \\<eof>"},
-	{`"\400"`, 0, "invalid escape sequence: \\4"},
-	{`"\3xx"`, 0, "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits"},
-	{`"\xZZ"`, 0, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
-	{`"\XZZ"`, 0, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
-	{`B"\u0031"`, 1, "invalid escape sequence: \\u is not allowed in bytes literal"},
-	{`B"\U00000031"`, 1, "invalid escape sequence: \\U is not allowed in bytes literal"},
-	{`B"\U00000031"`, 1, "invalid escape sequence: \\U is not allowed in bytes literal"},
-	{`"\UFFFFFFFF"`, 0, "invalid escape sequence: invalid code point: U+FFFFFFFF"},
-	{"``", 0, "invalid empty identifier"},
-	{"1from", 1, "number literal cannot follow identifier without any spaces"},
-	{`'''0`, 0, "unclosed triple-quoted string literal"},
-	{`/*`, 2, "unclosed comment"},
+	{"\b", 0, 0, "illegal input character: '\\b'"},
+	{`"foo`, 0, 4, "unclosed string literal"},
+	{`R"foo`, 1, 5, "unclosed raw string literal"},
+	{"'foo\n", 0, 5, "unclosed string literal: newline appears in non triple-quoted"},
+	{"R'foo\n", 1, 6, "unclosed raw string literal: newline appears in non triple-quoted"},
+	{"R'foo\\", 5, 6, "invalid escape sequence: \\<eof>"},
+	{`"\400"`, 1, 3, "invalid escape sequence: \\4"},
+	{`"\3xx"`, 1, 4, "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits"},
+	{`"\xZZ"`, 1, 4, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
+	{`"\XZZ"`, 1, 4, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
+	{`B"\u0031"`, 2, 4, "invalid escape sequence: \\u is not allowed in bytes literal"},
+	{`B"\U00000031"`, 2, 4, "invalid escape sequence: \\U is not allowed in bytes literal"},
+	{`B"\U00000031"`, 2, 4, "invalid escape sequence: \\U is not allowed in bytes literal"},
+	{`"\UFFFFFFFF"`, 1, 11, "invalid escape sequence: invalid code point: U+FFFFFFFF"},
+	{"``", 0, 2, "invalid empty identifier"},
+	{"1from", 1, 1, "number literal cannot follow identifier without any spaces"},
+	{`'''0`, 0, 4, "unclosed triple-quoted string literal"},
+	{`/*`, 0, 2, "unclosed comment"},
 }
 
 func testLexer(t *testing.T, source string, tokens []*Token) {
@@ -240,7 +241,10 @@ func TestLexerWrong(t *testing.T) {
 					t.Errorf("expected error message: %q, but: %q", tc.message, e.Message)
 				}
 				if e.Position.Pos != tc.pos {
-					t.Errorf("expected error position: %v, but: %v", tc.pos, e.Position.Pos)
+					t.Errorf("expected error position (pos): %v, but: %v", tc.pos, e.Position.Pos)
+				}
+				if e.Position.End != tc.end {
+					t.Errorf("expected error position (end): %v, but: %v", tc.end, e.Position.End)
 				}
 			} else {
 				t.Errorf("unexpected error: %v", err)

Original file line number	Diff line number	Diff line change
`@@ -418,7 +418,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s`
`418`	`418`	`for l.peekOk(i) {`
`419`	`419`	`if l.slice(i, i+len(q)) == q {`
`420`	`420`	`if len(content) == 0 && name == "identifier" {`
`421`		`- l.panicf("invalid empty identifier")`
	`421`	`+ l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+len(q)), "invalid empty identifier")`
`422`	`422`	`}`
`423`	`423`	`l.skipN(i + len(q))`
`424`	`424`	`return string(content)`
`@@ -428,7 +428,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s`
`428`	`428`	`if c == '\\' {`
`429`	`429`	`i++`
`430`	`430`	`if !l.peekOk(i) {`
`431`		`- l.panicf("invalid escape sequence: \\<eof>")`
	`431`	`+ l.panicfAtPosition(token.Pos(l.pos+i-1), token.Pos(l.pos+i), "invalid escape sequence: \\<eof>")`
`432`	`432`	`}`
`433`	`433`
`434`	`434`	`c := l.peek(i)`
`@@ -457,65 +457,69 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s`
`457`	`457`	`` case '\\', '?', '"', '\'', '`': ``
`458`	`458`	`content = append(content, c)`
`459`	`459`	`case 'x', 'X':`
`460`		`- if !(l.peekOk(i+1) && char.IsHexDigit(l.peek(i)) && char.IsHexDigit(l.peek(i+1))) {`
`461`		`- l.panicf("invalid escape sequence: hex escape sequence must be follwed by 2 hex digits")`
	`460`	`+ for j := 0; j < 2; j++ {`
	`461`	`+ if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) {`
	`462`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits")`
	`463`	`+ }`
`462`	`464`	`}`
`463`	`465`	`u, err := strconv.ParseUint(l.slice(i, i+2), 16, 8)`
`464`	`466`	`if err != nil {`
`465`		`- l.panicf("invalid escape sequence: %v", err)`
	`467`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err)`
`466`	`468`	`}`
`467`	`469`	`content = append(content, byte(u))`
`468`	`470`	`i += 2`
`469`	`471`	`case 'u', 'U':`
`470`	`472`	`if !unicode {`
`471`		`- l.panicf("invalid escape sequence: \\%c is not allowed in %s", c, name)`
	`473`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c is not allowed in %s", c, name)`
`472`	`474`	`}`
`473`	`475`	`size := 4`
`474`	`476`	`if c == 'U' {`
`475`	`477`	`size = 8`
`476`	`478`	`}`
`477`	`479`	`for j := 0; j < size; j++ {`
`478`	`480`	`if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) {`
`479`		`- l.panicf("invalid escape sequence: \\%c must be followed by %d hex digits", c, size)`
	`481`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: \\%c must be followed by %d hex digits", c, size)`
`480`	`482`	`}`
`481`	`483`	`}`
`482`	`484`	`u, err := strconv.ParseUint(l.slice(i, i+size), 16, 32)`
`483`	`485`	`if err != nil {`
`484`		`- l.panicf("invalid escape sequence: %v", err)`
	`486`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: %v", err)`
`485`	`487`	`}`
`486`	`488`	`if 0xD800 <= u && u <= 0xDFFF \|\| 0x10FFFF < u {`
`487`		`- l.panicf("invalid escape sequence: invalid code point: U+%04X", u)`
	`489`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: invalid code point: U+%04X", u)`
`488`	`490`	`}`
`489`	`491`	`var buf [utf8.MaxRune]byte`
`490`	`492`	`n := utf8.EncodeRune(buf[:], rune(u))`
`491`	`493`	`content = append(content, buf[:n]...)`
`492`	`494`	`i += size`
`493`	`495`	`case '0', '1', '2', '3':`
`494`		`- if !(l.peekOk(i+1) && char.IsOctalDigit(l.peek(i)) && char.IsOctalDigit(l.peek(i+1))) {`
`495`		`- l.panicf("invalid escape sequence: octal escape sequence must be follwed by 3 octal digits")`
	`496`	`+ for j := 0; j < 2; j++ {`
	`497`	`+ if !(l.peekOk(i+j) && char.IsOctalDigit(l.peek(i+j))) {`
	`498`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits")`
	`499`	`+ }`
`496`	`500`	`}`
`497`	`501`	`u, err := strconv.ParseUint(l.slice(i-1, i+2), 8, 8)`
`498`	`502`	`if err != nil {`
`499`		`- l.panicf("invalid escape sequence: %v", err)`
	`503`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err)`
`500`	`504`	`}`
`501`	`505`	`content = append(content, byte(u))`
`502`	`506`	`i += 2`
`503`	`507`	`default:`
`504`		`- l.panicf("invalid escape sequence: \\%c", c)`
	`508`	`+ l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c", c)`
`505`	`509`	`}`
`506`	`510`
`507`	`511`	`continue`
`508`	`512`	`}`
`509`	`513`
`510`	`514`	`if c == '\n' && len(q) != 3 {`
`511`		`- l.panicf("unclosed %s: newline appears in non triple-quoted", name)`
	`515`	`+ l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+1), "unclosed %s: newline appears in non triple-quoted", name)`
`512`	`516`	`}`
`513`	`517`
`514`	`518`	`content = append(content, c)`
`515`	`519`	`i++`
`516`	`520`	`}`
`517`	`521`
`518`		`- panic(l.errorf("unclosed %s", name))`
	`522`	`+ panic(l.errorfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i), "unclosed %s", name))`
`519`	`523`	`}`
`520`	`524`
`521`	`525`	`func (l *Lexer) skipSpaces() {`
`@@ -543,6 +547,7 @@ func (l *Lexer) skipComment() {`
`543`	`547`	`}`
`544`	`548`
`545`	`549`	`func (l *Lexer) skipCommentUntil(end string, mustEnd bool) {`
	`550`	`+ pos := token.Pos(l.pos)`
`546`	`551`	`for !l.eof() {`
`547`	`552`	`if l.slice(0, len(end)) == end {`
`548`	`553`	`l.skipN(len(end))`
`@@ -551,8 +556,7 @@ func (l *Lexer) skipCommentUntil(end string, mustEnd bool) {`
`551`	`556`	`l.skip()`
`552`	`557`	`}`
`553`	`558`	`if mustEnd {`
`554`		`- // TODO: improve error position`
`555`		`- l.panicf("unclosed comment")`
	`559`	`+ l.panicfAtPosition(pos, token.Pos(l.pos), "unclosed comment")`
`556`	`560`	`}`
`557`	`561`	`}`
`558`	`562`
`@@ -596,6 +600,17 @@ func (l *Lexer) errorf(msg string, param ...interface{}) *Error {`
`596`	`600`	`}`
`597`	`601`	`}`
`598`	`602`
	`603`	`+func (l *Lexer) errorfAtPosition(pos, end token.Pos, msg string, param ...interface{}) *Error {`
	`604`	`+ return &Error{`
	`605`	`+ Message: fmt.Sprintf(msg, param...),`
	`606`	`+ Position: l.Position(pos, end),`
	`607`	`+ }`
	`608`	`+}`
	`609`	`+`
`599`	`610`	`func (l *Lexer) panicf(msg string, param ...interface{}) {`
`600`	`611`	`panic(l.errorf(msg, param...))`
`601`	`612`	`}`
	`613`	`+`
	`614`	`+func (l *Lexer) panicfAtPosition(pos, end token.Pos, msg string, param ...interface{}) {`
	`615`	`+ panic(l.errorfAtPosition(pos, end, msg, param...))`
	`616`	`+}`