Skip to content

Commit

Permalink
Merge pull request #33 from EdwardPalmer99/EdwardPalmer99/feature/cleanup-tokenizer
Browse files Browse the repository at this point in the history

Rewrites Token
  • Loading branch information
EdwardPalmer99 authored Jan 14, 2025
2 parents bd5e8a4 + 744a9bc commit 08ea124
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 104 deletions.
47 changes: 3 additions & 44 deletions src/lexer/EucleiaTokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,34 +21,6 @@ Tokenizer Tokenizer::loadFromFile(const std::string &fpath)
}


std::string Token::description() const
{
switch (type)
{
case None:
return "None";
case Punctuation:
return "Punctuation";
case Keyword:
return "Keyword";
case Variable:
return "Variable";
case String:
return "String";
case Operator:
return "Operator";
case Int:
return "Int";
case Float:
return "Float";
case Bool:
return "Bool";
default:
return "Unknown";
}
}


Tokenizer::Tokenizer(const std::string fileString)
: InputStream(std::move(fileString))
{
Expand All @@ -62,7 +34,7 @@ void Tokenizer::generateTokens()
{
auto token = buildNextToken();

if (token.type != Token::None)
if (token.type != Token::EndOfFile)
{
// std::cout << token << std::endl;
_tokens.push(std::move(token));
Expand Down Expand Up @@ -92,11 +64,6 @@ Token Tokenizer::next()
return next;
}

bool Tokenizer::isDataTypeToken()
{
return Grammar::isDataType(peek().value);
}


Token Tokenizer::buildNextToken()
{
Expand Down Expand Up @@ -134,7 +101,7 @@ Token Tokenizer::buildNextToken()
}
else if (isEof())
{
return Token::blank();
return Token(Token::EndOfFile, "");
}
else
{
Expand Down Expand Up @@ -258,7 +225,7 @@ Token Tokenizer::readID()

std::string stringID(buffer.data());

return Token(isKeyword(stringID) ? Token::Keyword : Token::Variable, stringID);
return Token(Grammar::isKeyword(stringID) ? Token::Keyword : Token::Variable, stringID);
}


Expand All @@ -285,11 +252,3 @@ Token Tokenizer::readOperator()

return Token(Token::Operator, std::string(buffer.data()));
}


#pragma mark -

bool Tokenizer::isKeyword(const std::string &possibleKeyword) const
{
return Grammar::isKeyword(possibleKeyword);
}
63 changes: 6 additions & 57 deletions src/lexer/EucleiaTokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,61 +5,16 @@
// Created by Edward on 18/01/2024.
//

#ifndef EucleiaTokenizer_hpp
#define EucleiaTokenizer_hpp

#pragma once
#include "EucleiaInputStream.hpp"
#include "Token.hpp"
#include <queue>
#include <set>
#include <string>

// TODO: - bang in a namespace.

struct Token
{
enum TokenType
{
None,
Punctuation,
Keyword,
Variable,
String,
Operator,
Int,
Float,
Bool
};

Token(TokenType _type, std::string &&_value)
: type{_type}, value{_value}
{
}
Token(TokenType _type, std::string &_value)
: type{_type}, value{_value}
{
}

static Token blank()
{
return Token(None, "");
}

std::string description() const;

TokenType type;
std::string value;
};


inline std::ostream &operator<<(std::ostream &out, const Token &token)
{
return (out << std::string("(" + token.description() + ", " + token.value + ")"));
}


class Tokenizer : public InputStream
{
public:
public:
Tokenizer() = delete;
Tokenizer(const std::string fileString);
~Tokenizer() = default;
Expand All @@ -75,9 +30,7 @@ class Tokenizer : public InputStream
return _tokens.empty();
}

bool isDataTypeToken();

protected:
protected:
void skipComment();
void skipWhitespace();

Expand All @@ -88,14 +41,10 @@ class Tokenizer : public InputStream
Token readOperator();
Token readPunctuation();

bool isKeyword(const std::string &possibleKeyword) const;

void generateTokens();

Token buildNextToken();

private:
std::set<std::string> _allowedKeywords;
private:
std::queue<Token> _tokens;
};

#endif /* EucleiaTokenzier_hpp */
48 changes: 48 additions & 0 deletions src/lexer/Token.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/**
* @file Token.cpp
* @author Edward Palmer
* @date 2025-01-14
*
* @copyright Copyright (c) 2025
*
*/

#include "Token.hpp"
#include "Exceptions.hpp"

/**
 * Constructs a token of the given category, taking ownership of its text.
 * The value is moved into the (const) member, so callers may pass temporaries
 * cheaply.
 */
Token::Token(TokenType tokenType, std::string tokenText)
    : type{tokenType}, value{std::move(tokenText)}
{
}


std::string Token::print() const
{
return typeName() + ": " + value;
}


/**
 * Maps this token's enum type to its lowercase human-readable name.
 * Feeds print(), and therefore parser diagnostics such as
 * "unexpected token: ...".
 *
 * @throws (via ThrowException) if the stored type is not a recognised
 *         TokenType value — defensive guard should the enum grow.
 */
std::string Token::typeName() const
{
    switch (type)
    {
    case EndOfFile:
        return "end-of-file";
    case Punctuation:
        return "punctuation";
    case Keyword:
        return "keyword";
    case Variable:
        return "variable";
    case String:
        return "string";
    case Operator:
        // Fix: previously returned "other", inconsistent with every other
        // case (each returns the lowercase name of its enumerator).
        return "operator";
    case Int:
        return "int";
    case Float:
        return "float";
    default:
        // Unreachable for well-formed tokens; std::to_string(type) relies on
        // the enum's implicit conversion to int.
        ThrowException("unexpected token type: " + std::to_string(type));
    }
}
38 changes: 38 additions & 0 deletions src/lexer/Token.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
* @file Token.hpp
* @author Edward Palmer
* @date 2025-01-14
*
* @copyright Copyright (c) 2025
*
*/

#pragma once
#include <string>

/**
 * A single lexical token: an immutable (type, value) pair produced by the
 * tokenizer and consumed by the parser.
 */
class Token
{
public:
    /// Categories a token can belong to.
    enum TokenType
    {
        EndOfFile,   ///< End of the input stream (tokenizer returns this when input is exhausted).
        Punctuation,
        Keyword,
        Variable,
        String,
        Operator,
        Int,
        Float
    };

    Token() = delete; // A token is meaningless without a type and value.

    /// Constructs a token; the value string is moved into the member.
    Token(TokenType type, std::string value);

    /// Returns a human-readable "<type-name>: <value>" description (for diagnostics).
    std::string print() const;

    // Both members are const: a token is immutable once constructed.
    const TokenType type;
    const std::string value;

private:
    /// Lowercase human-readable name for `type`; helper for print().
    std::string typeName() const;
};
7 changes: 4 additions & 3 deletions src/parser/EucleiaParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "EucleiaParser.hpp"
#include "EucleiaModules.hpp"
#include "Exceptions.hpp"
#include "Grammar.hpp"
#include "ObjectTypes.hpp"
#include "TestModule.hpp"
#include <assert.h>
Expand Down Expand Up @@ -46,7 +47,7 @@ FileNode *Parser::buildAST()
{
std::vector<BaseNode *> nodes;

while (!tokenizer.empty() && peekToken().type != Token::None)
while (!tokenizer.empty() && peekToken().type != Token::EndOfFile)
{
auto node = parseExpression();

Expand Down Expand Up @@ -954,7 +955,7 @@ bool Parser::isKeyword(const std::string &keyword)

bool Parser::isDataTypeKeyword()
{
return (tokenizer.isDataTypeToken());
return (Grammar::isDataType(peekToken().value));
}


Expand Down Expand Up @@ -1015,5 +1016,5 @@ void Parser::unexpectedToken()
{
Token &token = peekToken();

ThrowException("unexpected token of type " + token.description() + " and value " + token.value);
ThrowException("unexpected token: " + token.print());
}

0 comments on commit 08ea124

Please sign in to comment.