[lexer] lexer placeholder

minamoto79 · minamoto79 · commit 93273d7b8b6c · 2017-06-21T07:39:50.000+03:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,8 @@
+# LLVM components the tool links against.
+set(LLVM_LINK_COMPONENTS Support)
+
+# The `postfix` tool target is built from these two sources.
+add_llvm_tool(postfix
+  postfix.cpp
+  Lexer.cpp
+)
diff --git a/Lexer.cpp b/Lexer.cpp
@@ -0,0 +1,92 @@
+#include "Lexer.h"
+#include <map>
+
+using namespace lexer;
+
+// Lookup table for the single-character bracket tokens, keyed by the character itself.
+// DoOperator() consults it; a character that is absent from the table has no token
+// type assigned to it.
+static const std::map<char, Lexer::TokenType> tokens{
+        {'(', Lexer::TOK_LEFT_PARENTHESIS},   {')', Lexer::TOK_RIGHT_PARENTHESIS},
+        {'[', Lexer::TOK_LEFT_BRACKET},       {']', Lexer::TOK_RIGHT_BRACKET},
+        {'{', Lexer::TOK_LEFT_CURLY_BRACKET}, {'}', Lexer::TOK_RIGHT_CURLY_BRACKET},
+};
+
+const Lexer::Token& Lexer::Lex() {  // result aliases m_last_token; the next Lex() overwrites it
+    SkipWhiteSpace();
+    char *const token_begin = m_start;     // InternalLex() advances m_start past what it consumes
+    const TokenType kind = InternalLex();  // has to run before the length below is computed
+    m_last_token = Token(kind, llvm::StringRef(token_begin, m_start - token_begin));
+    return m_last_token;
+}
+
+void Lexer::SkipWhiteSpace() {  // moves m_start past leading spaces and tabs only
+    for (; *m_start == ' ' || *m_start == '\t'; ++m_start) {
+    }  // empty body; any other character, the terminating '\0' included, ends the scan
+}
+
+// Classifies the character at m_start and dispatches to the matching Do*() scanner.
+// Returns TOK_EOF when m_start has reached m_end, and TOK_INVALID for anything that is
+// not a printable, non-space ASCII character.
+Lexer::TokenType Lexer::InternalLex() {
+    if (m_start >= m_end)
+        return TOK_EOF;
+    /**
+     *  ascii (octal code followed by the character or its control-code name)
+     *     000 nul  001 soh  002 stx  003 etx  004 eot  005 enq  006 ack  007 bel
+     *     010 bs   011 ht   012 nl   013 vt   014 np   015 cr   016 so   017 si
+     *     020 dle  021 dc1  022 dc2  023 dc3  024 dc4  025 nak  026 syn  027 etb
+     *     030 can  031 em   032 sub  033 esc  034 fs   035 gs   036 rs   037 us
+     *     040 sp   041  !   042  "   043  #   044  $   045  %   046  &   047  '
+     *     050  (   051  )   052  *   053  +   054  ,   055  -   056  .   057  /
+     *     060  0   061  1   062  2   063  3   064  4   065  5   066  6   067  7
+     *     070  8   071  9   072  :   073  ;   074  <   075  =   076  >   077  ?
+     *     100  @   101  A   102  B   103  C   104  D   105  E   106  F   107  G
+     *     110  H   111  I   112  J   113  K   114  L   115  M   116  N   117  O
+     *     120  P   121  Q   122  R   123  S   124  T   125  U   126  V   127  W
+     *     130  X   131  Y   132  Z   133  [   134  \   135  ]   136  ^   137  _
+     *     140  `   141  a   142  b   143  c   144  d   145  e   146  f   147  g
+     *     150  h   151  i   152  j   153  k   154  l   155  m   156  n   157  o
+     *     160  p   161  q   162  r   163  s   164  t   165  u   166  v   167  w
+     *     170  x   171  y   172  z   173  {   174  |   175  }   176  ~   177 del
+     */
+    // Plain comparisons are used rather than GNU `case lo ... hi` ranges, which are a
+    // compiler extension and not part of ISO C++. The checks assume an ASCII charset.
+    const char c = *m_start;
+    // Either quote character opens a string literal.
+    if (c == '"' || c == '\'')
+        return DoStringLiteral();
+    if (c >= '0' && c <= '9')
+        return DoNumberLiteral();
+    const bool is_upper = c >= 'A' && c <= 'Z';
+    const bool is_lower = c >= 'a' && c <= 'z';
+    if (c == '_' || is_upper || is_lower)
+        return DoIdentifier();
+    // What is left of '!' .. '~' at this point is punctuation:
+    // ! # $ % & ( ) * + , - . / : ; < = > ? @ [ \ ] ^ ` { | } ~
+    if (c >= '!' && c <= '~')
+        return DoOperator();
+    // Control characters, space, DEL and bytes outside 7-bit ASCII.
+    return TOK_INVALID;
+}
+
+// Maps the punctuation character at m_start to its token type through `tokens`.
+Lexer::TokenType Lexer::DoOperator() {
+    const auto entry = tokens.find(*m_start);
+    if (entry == tokens.end())
+        return TOK_INVALID;  // not in the table: nothing is consumed
+    ++m_start;  // consume the single character that was matched
+    return entry->second;
+}
+
+Lexer::TokenType Lexer::DoNumberLiteral() {
+    return TokenType::TOK_INVALID;  // stub: numbers are not scanned yet, nothing is consumed
+}
+
+Lexer::TokenType Lexer::DoIdentifier() {
+    return TokenType::TOK_INVALID;  // stub: identifiers are not scanned yet, nothing is consumed
+}
+
+Lexer::TokenType Lexer::DoStringLiteral() {
+    return TOK_INVALID;  // stub: string literals are not scanned yet; a quote is not an end of line
+}
diff --git a/Lexer.h b/Lexer.h
@@ -0,0 +1,52 @@
+
+#ifndef LLVM_LEXER_H
+#define LLVM_LEXER_H
+
+#include <llvm/ADT/StringRef.h>
+
+namespace lexer {
+    // Scanner over a NUL-terminated character buffer. Tokens refer to that buffer through
+    // llvm::StringRef, so the buffer has to stay alive for as long as tokens are used.
+    class Lexer {
+    public:
+        // src must be NUL-terminated: the end of input is found with strlen(). explicit keeps a
+        // bare char* from converting silently. NOTE(review): <cstring> is not included directly.
+        explicit Lexer(char *src): m_start(src), m_end(src + strlen(src)), m_last_token(TOK_INVALID, llvm::StringRef()){}
+
+        enum TokenType {
+            TOK_INVALID,              // input the lexer does not recognise or support yet
+            TOK_EOF,                  // the end of the input buffer was reached
+            TOK_LEFT_PARENTHESIS,     // (
+            TOK_RIGHT_PARENTHESIS,    // )
+            TOK_LEFT_BRACKET,         // [
+            TOK_RIGHT_BRACKET,        // ]
+            TOK_LEFT_CURLY_BRACKET,   // {
+            TOK_RIGHT_CURLY_BRACKET,  // }
+            TOK_EOL                   // presumably end of line -- TODO confirm
+        };
+
+        // Kind of a token together with the slice of source text it spans.
+        struct Token {
+            Token(TokenType type, llvm::StringRef value):m_type(type),m_value(value){}
+            TokenType m_type;
+            llvm::StringRef m_value;
+        };
+
+        // Skips spaces and tabs, scans one token and returns a reference to it.
+        const Token& Lex();
+    private:
+        char *m_start;       // next unread character
+        char *m_end;         // one past the last character of the input
+        Token m_last_token;  // storage behind the reference returned by Lex()
+
+        void SkipWhiteSpace();               // skips spaces and tabs
+        TokenType InternalLex();             // classifies the next character and dispatches
+        Lexer::TokenType DoOperator();       // bracket lookup, consumes one character on a match
+        Lexer::TokenType DoNumberLiteral();  // placeholder
+        Lexer::TokenType DoIdentifier();     // placeholder
+        Lexer::TokenType DoStringLiteral();  // placeholder
+    };
+}  // namespace lexer
+
+
+#endif //LLVM_LEXER_H
diff --git a/postfix.cpp b/postfix.cpp
@@ -0,0 +1,49 @@
+#include <llvm/Support/raw_ostream.h>
+#include <iostream>
+#include "Lexer.h"
+
+using lexer::Lexer;
+
+// Reads whitespace-separated words from stdin, echoing each one after ">>", joins them
+// without separators and prints every bracket token the lexer finds in the result.
+// Returns 0 once TOK_EOF is reached and 1 on the first invalid or unhandled token.
+int
+main() {
+    // std::string grows on demand. A fixed char[1024] buffer could be overrun by one
+    // long word, since operator>> into a char array is unbounded before C++20.
+    std::string word;
+    std::string text;
+    llvm::raw_string_ostream ostream(text);
+    while (std::cin >> word) {
+        std::cout << ">>" << word;
+        ostream << word;
+    }
+
+    // Lexer wants a mutable char*; str() flushes the stream into `text`, which outlives `lex`.
+    Lexer lex(const_cast<char *>(ostream.str().c_str()));
+    // Print tokens until the lexer reports end of input or something it cannot handle.
+    while(true) {
+        const Lexer::Token& t = lex.Lex();
+        const char *prefix = nullptr;  // label printed in front of a recognised bracket token
+        switch (t.m_type) {
+            case Lexer::TokenType::TOK_EOF:
+                return 0;
+            case Lexer::TokenType::TOK_INVALID:
+                std::cout << "Unsupported token: " << t.m_value.str() << std::endl;
+                return 1;
+            case Lexer::TokenType::TOK_LEFT_PARENTHESIS:    prefix = "( -> "; break;
+            case Lexer::TokenType::TOK_RIGHT_PARENTHESIS:   prefix = ") -> "; break;
+            case Lexer::TokenType::TOK_LEFT_BRACKET:        prefix = "[ -> "; break;
+            case Lexer::TokenType::TOK_RIGHT_BRACKET:       prefix = "] -> "; break;
+            case Lexer::TokenType::TOK_LEFT_CURLY_BRACKET:  prefix = "{ -> "; break;
+            case Lexer::TokenType::TOK_RIGHT_CURLY_BRACKET: prefix = "} -> "; break;
+            // Any token type that has no case of its own above, e.g. TOK_EOL.
+            default:
+                std::cout << "Unknown token: " << t.m_value.str() << std::endl;
+                return 1;
+        }
+
+        // Only the six bracket cases leave the switch, so prefix is always set here.
+        std::cout << prefix << t.m_value.str() << std::endl;
+    }
+}