Commit

WIP

elliotchance committed Dec 24, 2024
1 parent 336a4e0 commit ece6330
Showing 2 changed files with 137 additions and 14 deletions.
22 changes: 8 additions & 14 deletions grammar.y
@@ -73,10 +73,10 @@ module vsql
%token OPERATOR_ASTERISK OPERATOR_PLUS OPERATOR_COMMA OPERATOR_MINUS;
%token OPERATOR_PERIOD OPERATOR_SOLIDUS OPERATOR_COLON OPERATOR_LESS_THAN;
%token OPERATOR_GREATER_THAN OPERATOR_DOUBLE_PIPE OPERATOR_NOT_EQUALS;
%token OPERATOR_GREATER_EQUALS OPERATOR_LESS_EQUALS;
%token OPERATOR_GREATER_EQUALS OPERATOR_LESS_EQUALS OPERATOR_SEMICOLON;

// literals
%token LITERAL_IDENTIFIER LITERAL_STRING LITERAL_INTEGER;
%token LITERAL_IDENTIFIER LITERAL_STRING LITERAL_NUMBER;

%start preparable_statement;

@@ -345,7 +345,7 @@ signed_integer /* Value */ :
| sign unsigned_integer { log("signed_integer_2()") }

unsigned_integer /* string */ :
LITERAL_INTEGER
LITERAL_NUMBER

datetime_literal /* Value */ :
date_literal
@@ -1641,7 +1641,7 @@ type YYSym = Value | ValueSpecification | ValueExpression | RowValueConstructor
| CommonValueExpression | BooleanTerm | ValueExpressionPrimary
| NumericPrimary | Term | BooleanTest | BooleanPrimary | BooleanPredicand
| NonparenthesizedValueExpressionPrimary | SimpleTable | QueryExpression
| Stmt;
| Stmt | string;

pub struct YYSymType {
pub mut:
@@ -1653,12 +1653,6 @@ fn log(s string) {
println(s)
}

pub struct Tok {
pub:
token int
sym YYSymType
}

pub struct Lexer {
pub mut:
tokens []Tok
@@ -1680,11 +1674,11 @@
}

pub fn main_() {
// println(tokenize2("SELECT 'foo' FROM bar WHERE \"baz\" = 12.3"))
tokens := tokenize2("VALUES FALSE")

mut lexer := Lexer{
tokens: [
Tok{token_values, YYSymType{} }
Tok{token_true, YYSymType{} }
]
tokens: tokens
}
mut parser := yy_new_parser()
parser.parse(mut lexer)
129 changes: 129 additions & 0 deletions vsql/lexer.v
@@ -160,3 +160,132 @@ fn is_identifier_char(c rune, is_not_first bool) bool {

return yes
}

pub struct Tok {
pub:
token int
sym YYSymType
}

fn tokenize2(sql_stmt string) []Tok {
mut tokens := []Tok{}
cs := sql_stmt.trim(';').runes()
mut i := 0

next: for i < cs.len {
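// The `next` label on this loop lets the operator branches below use
// `continue next` to move on to the following character once a token
// has been matched and consumed.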
// Numbers
if cs[i] >= `0` && cs[i] <= `9` {
mut word := ''
for i < cs.len && cs[i] >= `0` && cs[i] <= `9` {
word += '${cs[i]}'
i++
}
tokens << Tok{token_literal_number, YYSymType{value: word}}

// There is a special case for approximate numbers where 'E' is considered
// a separate token in the SQL BNF. However, a bare "e2" must not be split
// into two tokens, so the 'E' is only recognized here, immediately after a
// number token.
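// For example, assuming the generated token constants, "1E2" lexes as
// token_literal_number "1", token_e, token_literal_number "2", while a
// bare "e2" falls through to the identifier handling below as one token.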
if i < cs.len && (cs[i] == `e` || cs[i] == `E`) {
tokens << Tok{token_e, YYSymType{}}
i++
}

continue
}

// Strings
if cs[i] == `'` {
mut word := ''
i++
for i < cs.len && cs[i] != `'` {
word += '${cs[i]}'
i++
}
i++
tokens << Tok{token_literal_string, YYSymType{value: word}}
continue
}

// Delimited identifiers
if cs[i] == `"` {
mut word := ''
i++
for i < cs.len && cs[i] != `"` {
word += '${cs[i]}'
i++
}
i++
tokens << Tok{token_literal_identifier, YYSymType{value: '"${word}"'}}
continue
}

// Operators
multi := {
'<>': token_operator_not_equals
'>=': token_operator_greater_equals
'<=': token_operator_less_equals
'||': token_operator_double_pipe
}
for op, tk in multi {
if i + 1 < cs.len && cs[i] == op[0] && cs[i + 1] == op[1] {
tokens << Tok{tk, YYSymType{value: op}}
i += 2
continue next
}
}

single := {
`(`: token_operator_left_paren
`)`: token_operator_right_paren
`*`: token_operator_asterisk
`+`: token_operator_plus
`,`: token_operator_comma
`-`: token_operator_minus
`/`: token_operator_solidus
`;`: token_operator_semicolon
`<`: token_operator_less_than
`=`: token_operator_equals
`>`: token_operator_greater_than
`.`: token_operator_period
`:`: token_operator_colon
}
for op, tk in single {
if cs[i] == op {
tokens << Tok{tk, YYSymType{value: op.str()}}
i++
continue next
}
}

// Keyword or regular identifier
mut word := ''
mut is_not_first := false
for i < cs.len && is_identifier_char(cs[i], is_not_first) {
word += '${cs[i]}'
i++
is_not_first = true
}

if word == '' {
i++
continue
}

upper_word := word.to_upper()
mut found := false
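// 57343 is assumed to be the base that maps an entry's index in yy_toknames
// to the numeric token constant produced by the parser generator.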
for tok_pos, tok_name in yy_toknames {
if tok_name == upper_word {
tokens << Tok{tok_pos + 57343, YYSymType{value: upper_word}}
found = true
break
}
}

if !found {
tokens << Tok{token_literal_identifier, YYSymType{value: word}}
}
}

return tokens
}
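
A minimal usage sketch, mirroring the main_ example in grammar.y above (the wrapper function name is illustrative; Lexer and yy_new_parser are assumed to come from the generated parser code):

fn run_tokenizer_example() {
	// Tokenize a statement and feed the token stream to the generated parser.
	mut lexer := Lexer{
		tokens: tokenize2("SELECT 'foo' FROM bar WHERE \"baz\" = 123")
	}
	mut parser := yy_new_parser()
	parser.parse(mut lexer)
}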
