From 1ead2d45e1154f197022fb28216a4b2d6480ca4d Mon Sep 17 00:00:00 2001 From: Maksim Shagov <43129418+MaksimShagov@users.noreply.github.com> Date: Fri, 10 May 2024 19:47:15 +0300 Subject: [PATCH] Support for-loop syntax in lexer and parser (#144) Supported constructs: * for i in range(stop) * for i in range(start, stop) * for i in range(start, stop, step) * for i, value in enumerate(list) * for value in list --- compiler/include/compiler/ast/node_type.hpp | 9 +- .../compiler/frontend/lexer/token_types.hpp | 1 - compiler/lib/ast/node.cpp | 9 ++ compiler/lib/frontend/lexer/lexer.cpp | 7 +- compiler/lib/frontend/lexer/token.cpp | 2 - compiler/lib/frontend/parser/parser.cpp | 37 ++++- compiler/tests/frontend/lexer.cpp | 27 +++- compiler/tests/frontend/parser.cpp | 130 ++++++++++++++++++ 8 files changed, 206 insertions(+), 16 deletions(-) diff --git a/compiler/include/compiler/ast/node_type.hpp b/compiler/include/compiler/ast/node_type.hpp index ea480220..bde3cf41 100644 --- a/compiler/include/compiler/ast/node_type.hpp +++ b/compiler/include/compiler/ast/node_type.hpp @@ -45,15 +45,20 @@ enum class NodeType { ElifStatement, ElseStatement, Expression, + FloatingPointLiteralValue, + ForIterable, + ForStatement, + ForTargets, FunctionArgument, FunctionArguments, FunctionCall, FunctionDefinition, FunctionName, FunctionReturnType, - FloatingPointLiteralValue, IfStatement, IntegerLiteralValue, + ListAccessor, + ListStatement, ProgramRoot, ReturnStatement, StringLiteralValue, @@ -63,8 +68,6 @@ enum class NodeType { VariableDeclaration, VariableName, WhileStatement, - ListStatement, - ListAccessor, }; } // namespace ast diff --git a/compiler/include/compiler/frontend/lexer/token_types.hpp b/compiler/include/compiler/frontend/lexer/token_types.hpp index 4a90ac24..9b9b7390 100644 --- a/compiler/include/compiler/frontend/lexer/token_types.hpp +++ b/compiler/include/compiler/frontend/lexer/token_types.hpp @@ -10,7 +10,6 @@ enum class Keyword { If, Else, Elif, - Range, While, For, 
Break, diff --git a/compiler/lib/ast/node.cpp b/compiler/lib/ast/node.cpp index f0641778..984d54bc 100644 --- a/compiler/lib/ast/node.cpp +++ b/compiler/lib/ast/node.cpp @@ -200,6 +200,15 @@ void Node::dump(std::ostream &stream, int depth) const { case NodeType::ListAccessor: stream << "ListAccessor\n"; break; + case NodeType::ForStatement: + stream << "ForStatement\n"; + break; + case NodeType::ForIterable: + stream << "ForIterable\n"; + break; + case NodeType::ForTargets: + stream << "ForTargets\n"; + break; default: stream << "Unknown\n"; } diff --git a/compiler/lib/frontend/lexer/lexer.cpp b/compiler/lib/frontend/lexer/lexer.cpp index b3168794..e367f4f8 100644 --- a/compiler/lib/frontend/lexer/lexer.cpp +++ b/compiler/lib/frontend/lexer/lexer.cpp @@ -11,22 +11,21 @@ using namespace lexer; using namespace utils; namespace { - +// clang-format off std::unordered_map keywords = { {"bool", Keyword::Bool}, {"False", Keyword::False}, {"int", Keyword::Int}, {"float", Keyword::Float}, {"str", Keyword::Str}, {"if", Keyword::If}, {"else", Keyword::Else}, {"elif", Keyword::Elif}, - {"range", Keyword::Range}, {"while", Keyword::While}, {"for", Keyword::For}, {"break", Keyword::Break}, {"import", Keyword::Import}, {"continue", Keyword::Continue}, {"def", Keyword::Definition}, {"return", Keyword::Return}, {"or", Keyword::Or}, {"and", Keyword::And}, {"not", Keyword::Not}, {"in", Keyword::In}, {"True", Keyword::True}, {"None", Keyword::None}, - {"list", Keyword::List}, + {"list", Keyword::List}, {"while", Keyword::While}, }; - +// clang-format on std::unordered_map operators = { {"%", Operator::Mod}, {".", Operator::Dot}, {"]", Operator::RectRightBrace}, {",", Operator::Comma}, {"=", Operator::Assign}, {"+", Operator::Add}, diff --git a/compiler/lib/frontend/lexer/token.cpp b/compiler/lib/frontend/lexer/token.cpp index 9da8fbf8..927b277c 100644 --- a/compiler/lib/frontend/lexer/token.cpp +++ b/compiler/lib/frontend/lexer/token.cpp @@ -46,8 +46,6 @@ const char *const 
keywordToString(Keyword kw) { return "not"; case Keyword::Or: return "or"; - case Keyword::Range: - return "range"; case Keyword::Return: return "return"; case Keyword::Str: diff --git a/compiler/lib/frontend/parser/parser.cpp b/compiler/lib/frontend/parser/parser.cpp index 2811e8be..a5530116 100644 --- a/compiler/lib/frontend/parser/parser.cpp +++ b/compiler/lib/frontend/parser/parser.cpp @@ -467,6 +467,8 @@ void parseBranchRoot(ParserContext &ctx) { ctx.node = ctx.pushChildNode(NodeType::IfStatement); } else if (currToken.is(Keyword::While)) { ctx.node = ctx.pushChildNode(NodeType::WhileStatement); + } else if (currToken.is(Keyword::For)) { + ctx.node = ctx.pushChildNode(NodeType::ForStatement); } else if (isVariableDeclaration(ctx.tokenIter, ctx.tokenEnd)) { ctx.node = ctx.pushChildNode(NodeType::VariableDeclaration); } else if (currToken.is(Keyword::Elif) || currToken.is(Keyword::Else)) { @@ -640,7 +642,6 @@ void parseVariableDeclaration(ParserContext &ctx) { auto node = ctx.pushChildNode(NodeType::TypeName); node->value = TypeRegistry::typeId(varType); bool isListType = varType.is(Keyword::List); - if (isListType) { const Token &leftBrace = (std::advance(ctx.tokenIter, 1), ctx.token()); const Token &varTypeList = (std::advance(ctx.tokenIter, 1), ctx.token()); @@ -687,12 +688,59 @@ void parseWhileStatement(ParserContext &ctx) { ctx.propagate(); }
+// Parses the header of a `for <targets> in <iterable>:` statement.
+//
+// On entry ctx.tokenIter points at the `for` keyword (asserted below). Under
+// the current node the function creates, in this order:
+//   * ForTargets  - one VariableName child per identifier found before `in`;
+//   * ForIterable - wrapping the Expression node that holds the iterable;
+//   * BranchRoot  - the loop body, entered with nestingLevel increased by one.
+// Malformed headers are reported through ctx.pushError().
+void parseForStatement(ParserContext &ctx) {
+    assert(ctx.tokenIter->is(Keyword::For));
+    ctx.goNextToken();
+    auto forNode = ctx.node;
+    auto it = ctx.tokenIter;
+    auto forTargets = ParserContext::pushChildNode(forNode, NodeType::ForTargets, ctx.tokenIter->ref);
+    while (!it->is(Keyword::In) && !it->is(Special::EndOfExpression)) {
+        if (it->type == TokenType::Identifier) {
+            // Reference the identifier's own token, not the first target's.
+            auto targetNode = ParserContext::pushChildNode(forTargets, NodeType::VariableName, it->ref);
+            targetNode->value = it->id();
+        } else if (!it->is(Operator::Comma)) {
+            // NOTE(review): presumably pushError() reports at ctx.tokenIter, so it is
+            // moved to the offending token first - confirm against ParserContext.
+            ctx.tokenIter = it;
+            ctx.pushError("Unexpected token in a for statement");
+        }
+        // Advance unconditionally so the scan always makes progress, even
+        // after an unexpected token has been reported.
+        ++it;
+    }
+    ctx.tokenIter = it;
+    if (!it->is(Keyword::In)) {
+        // The scan stopped at the end of the expression, so `in` is missing.
+        ctx.pushError("Keyword 'in' expected in a for statement");
+    }
+    ctx.goNextToken();
+    ctx.node = ctx.pushChildNode(NodeType::ForIterable);
+    ctx.node = ctx.pushChildNode(NodeType::Expression);
+    ctx.propagate();
+    ctx.goParentNode();
+    if (!ctx.token().is(Special::Colon)) {
+        ctx.pushError("Colon expected here");
+        ctx.goNextExpression();
+    }
+    ctx.node = ctx.pushChildNode(NodeType::BranchRoot);
+    ctx.nestingLevel++;
+    ctx.propagate();
+}
+
 void parseListStatement(ParserContext &ctx) { assert(ctx.tokenIter->is(Operator::RectLeftBrace)); while (!ctx.token().is(Operator::RectRightBrace)) { ctx.goNextToken(); auto it = ctx.tokenIter; - while (!it->is(Operator::Comma) && !it->is(Operator::RectRightBrace)) it++; const auto &tokenIterBegin = ctx.tokenIter; @@ -727,6 +775,7 @@ static std::unordered_map> subpar SUBPARSER(VariableDeclaration), SUBPARSER(WhileStatement), SUBPARSER(ListStatement), + SUBPARSER(ForStatement), }; // clang-format on diff --git a/compiler/tests/frontend/lexer.cpp b/compiler/tests/frontend/lexer.cpp index de70e126..d3bf8c4d 100644 --- a/compiler/tests/frontend/lexer.cpp +++ b/compiler/tests/frontend/lexer.cpp @@ -31,10 +31,6 @@ TEST(Lexer, can_detect_else) { SINGLE_TOKEN_TEST_IMPL("else", Keyword::Else); } -TEST(Lexer, can_detect_range) { - SINGLE_TOKEN_TEST_IMPL("range", Keyword::Range); -} - TEST(Lexer, can_detect_for) { SINGLE_TOKEN_TEST_IMPL("for", Keyword::For); } @@ -502,3 +498,26 @@ TEST(Lexer, list_expression) { expected.emplace_back(Special::EndOfExpression); ASSERT_EQ(expected, transformed); } + +TEST(Lexer, for_range_and_enumerate_expression) { + StringVec source = {"for i in range(10)", "for i in enumerate(mylist)"}; + TokenList transformed = Lexer::process(source); + TokenList expected; + expected.emplace_back(Keyword::For); + expected.emplace_back(TokenType::Identifier, "i"); + expected.emplace_back(Keyword::In); + expected.emplace_back(TokenType::Identifier, "range"); + expected.emplace_back(Operator::LeftBrace); +
expected.emplace_back(TokenType::IntegerLiteral, "10"); + expected.emplace_back(Operator::RightBrace); + expected.emplace_back(Special::EndOfExpression); + expected.emplace_back(Keyword::For); + expected.emplace_back(TokenType::Identifier, "i"); + expected.emplace_back(Keyword::In); + expected.emplace_back(TokenType::Identifier, "enumerate"); + expected.emplace_back(Operator::LeftBrace); + expected.emplace_back(TokenType::Identifier, "mylist"); + expected.emplace_back(Operator::RightBrace); + expected.emplace_back(Special::EndOfExpression); + ASSERT_EQ(expected, transformed); +} diff --git a/compiler/tests/frontend/parser.cpp b/compiler/tests/frontend/parser.cpp index 8c66301c..93b51198 100644 --- a/compiler/tests/frontend/parser.cpp +++ b/compiler/tests/frontend/parser.cpp @@ -1072,3 +1072,133 @@ TEST(Parser, can_parse_unary_in_if) { " IntegerLiteralValue: 1\n"; ASSERT_EQ(expected, tree.dump()); } + +TEST(Parser, can_parse_for_range) { + StringVec source = { + "def main() -> None:", " for i in range(0, 10, iter_step()):", + " x = 1", " for i in range(0, mylist[10]):", + " x = 1", " for i in range(1+2*3):", + " x = 1", + }; + TokenList tokens = Lexer::process(source); + SyntaxTree tree = Parser::process(tokens); + std::string expected = "ProgramRoot\n" + " FunctionDefinition\n" + " FunctionName: main\n" + " FunctionArguments\n" + " FunctionReturnType: NoneType\n" + " BranchRoot\n" + " ForStatement\n" + " ForTargets\n" + " VariableName: i\n" + " ForIterable\n" + " Expression\n" + " FunctionCall\n" + " FunctionName: range\n" + " FunctionArguments\n" + " Expression\n" + " IntegerLiteralValue: 0\n" + " Expression\n" + " IntegerLiteralValue: 10\n" + " Expression\n" + " FunctionCall\n" + " FunctionName: iter_step\n" + " BranchRoot\n" + " Expression\n" + " BinaryOperation: Assign\n" + " VariableName: x\n" + " IntegerLiteralValue: 1\n" + " ForStatement\n" + " ForTargets\n" + " VariableName: i\n" + " ForIterable\n" + " Expression\n" + " FunctionCall\n" + " FunctionName: 
range\n" + " FunctionArguments\n" + " Expression\n" + " IntegerLiteralValue: 0\n" + " Expression\n" + " ListAccessor\n" + " VariableName: mylist\n" + " Expression\n" + " IntegerLiteralValue: 10\n" + " BranchRoot\n" + " Expression\n" + " BinaryOperation: Assign\n" + " VariableName: x\n" + " IntegerLiteralValue: 1\n" + " ForStatement\n" + " ForTargets\n" + " VariableName: i\n" + " ForIterable\n" + " Expression\n" + " FunctionCall\n" + " FunctionName: range\n" + " FunctionArguments\n" + " Expression\n" + " BinaryOperation: Add\n" + " IntegerLiteralValue: 1\n" + " BinaryOperation: Mult\n" + " IntegerLiteralValue: 2\n" + " IntegerLiteralValue: 3\n" + " BranchRoot\n" + " Expression\n" + " BinaryOperation: Assign\n" + " VariableName: x\n" + " IntegerLiteralValue: 1\n"; + ASSERT_EQ(expected, tree.dump()); +} + +TEST(Parser, can_parse_for_enumerate) { + StringVec source = { + "def main() -> None:", " for i, value in enumerate(mylist):", " x = 1", " for elem in mylist:", + " x = 1", + }; + TokenList tokens = Lexer::process(source); + SyntaxTree tree = Parser::process(tokens); + std::string expected = "ProgramRoot\n" + " FunctionDefinition\n" + " FunctionName: main\n" + " FunctionArguments\n" + " FunctionReturnType: NoneType\n" + " BranchRoot\n" + " ForStatement\n" + " ForTargets\n" + " VariableName: i\n" + " VariableName: value\n" + " ForIterable\n" + " Expression\n" + " FunctionCall\n" + " FunctionName: enumerate\n" + " FunctionArguments\n" + " Expression\n" + " VariableName: mylist\n" + " BranchRoot\n" + " Expression\n" + " BinaryOperation: Assign\n" + " VariableName: x\n" + " IntegerLiteralValue: 1\n" + " ForStatement\n" + " ForTargets\n" + " VariableName: elem\n" + " ForIterable\n" + " Expression\n" + " VariableName: mylist\n" + " BranchRoot\n" + " Expression\n" + " BinaryOperation: Assign\n" + " VariableName: x\n" + " IntegerLiteralValue: 1\n"; + ASSERT_EQ(expected, tree.dump()); +} + +TEST(Parser, can_throw_error_when_for_loop_does_contain_colon) { + StringVec source = 
{ + "def main() -> None:", + " for i in range(0, 10, 1)", + " x = 1", + }; + TokenList tokens = Lexer::process(source); + ASSERT_ANY_THROW(Parser::process(tokens)); +}