Skip to content

Commit ecc07ef

Browse files
committed
🐛 Add keyword support + fix number search
1 parent 7bd2965 commit ecc07ef

File tree

3 files changed

+48
-26
lines changed

3 files changed

+48
-26
lines changed

include/token.hpp

Lines changed: 32 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#ifndef TOKEN_HPP
22
#define TOKEN_HPP
33

4+
#include <iostream>
45
#include <regex>
56
#include <string>
67
#include <vector>
@@ -9,6 +10,7 @@ using namespace std;
910

1011
enum TokenType {
1112
NONE,
13+
KEYWORD,
1214
IDENTIFICATOR,
1315
OPERATOR,
1416
SEPARATOR,
@@ -26,9 +28,10 @@ const string LITERAL_INCOMPLETE_PATTERN = R"(^\"([^\"\\\n]|\\.)*$)";
2628
const string COMMENT_LINE_PATTERN = R"(^//.*)";
2729
const string COMMENT_BLOCK_PATTERN = R"(/\*[\s\S]*?\*/)";
2830
const string COMMENT_BLOCK_INCOMPLETE_PATTERN = R"(/\*[\s\S]*)";
29-
const vector<string> OPERATORS = {"*", "/", "%", "+", "-", "==", "!=", "&&", "||", "=", "+=", "-=", "*=", "/=", "%=", "&", "|", ".", ">", "<", ">=", "<=", "!"};
31+
const vector<string> OPERATORS = {"*", "/", "%", "+", "-", "==", "!=", "&&", "||", "=", "+=", "-=", "*=", "/=", "%=", "&", "|", ".", ">", "<", ">=", "<=", "!", "++", "--"};
3032
const vector<string> SEPARATORS_IGNORED = {" ", "\n", "\t"};
3133
const vector<string> SEPARATORS_IMPORTANT = {"(", ")", "{", "}", "[", "]", ";", ",", "."};
34+
const vector<string> KEYWORDS = {"break", "case", "char", "const", "continue", "default", "do", "double", "else", "enum", "float", "for", "goto", "if", "int", "long", "return", "short", "sizeof", "static", "struct", "switch", "typedef", "void", "while"};
3235

3336
class Token {
3437
public:
@@ -39,13 +42,40 @@ class Token {
3942

4043
Token() {};
4144
Token(string content, TokenType type, pair<int, int> position) : content(move(content)), type(type), position(position) {};
45+
46+
// Prints this token to standard output: first a header line with the position
// pair formatted as "(first,second) " followed by the type name obtained from
// enum_type_string, then the token's content followed by a blank line.
void print() {
47+
// endl terminates the header line and also flushes cout.
cout << "(" << position.first << "," << position.second << ") " << enum_type_string(type) << endl;
48+
cout << content << "\n\n";
49+
}
50+
51+
// Returns the name of a TokenType enumerator as a string (e.g. KEYWORD ->
// "KEYWORD"). Any value without an explicit case below yields "UNKNOWN".
// Declared static, so it can be called without a Token instance.
static string enum_type_string(TokenType type) {
52+
switch (type) {
53+
case NONE:
54+
return "NONE";
55+
case IDENTIFICATOR:
56+
return "IDENTIFICATOR";
57+
case NUMBER:
58+
return "NUMBER";
59+
case SEPARATOR:
60+
return "SEPARATOR";
61+
case OPERATOR:
62+
return "OPERATOR";
63+
case LITERAL:
64+
return "LITERAL";
65+
case COMMENT:
66+
return "COMMENT";
67+
case KEYWORD:
68+
return "KEYWORD";
69+
// Fallback for any enumerator not listed above.
default:
70+
return "UNKNOWN";
71+
}
72+
}
4273
};
4374

4475
void lexical_error(Token token);
4576
vector<Token>
4677
tokenize(string& buffer);
4778
Token get_token(string::iterator& sentinel, string::iterator end, pair<int, int>& position);
4879
regex create_regex(vector<string> tokens);
49-
string enum_type_string(TokenType type);
5080

5181
#endif

src/main.cpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,7 @@ int main(int argc, char* argv[]) {
2525
string buffer = get_content(argc, argv);
2626
vector<Token> tokens = tokenize(buffer);
2727
for (Token t : tokens) {
28-
cout << "(" << t.position.first << "," << t.position.second << ") " << enum_type_string(t.type) << endl;
29-
cout << t.content << "\n\n";
28+
t.print();
3029
}
3130
return 0;
3231
}

src/token.cpp

Lines changed: 15 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ const regex COMMENT_BLOCK_INCOMPLETE_REGEX = regex(COMMENT_BLOCK_INCOMPLETE_PATT
1818
const regex OPERATORS_REGEX = create_regex(OPERATORS);
1919
const regex SEPARATORS_IGNORED_REGEX = create_regex(SEPARATORS_IGNORED);
2020
const regex SEPARATORS_IMPORTANT_REGEX = create_regex(SEPARATORS_IMPORTANT);
21+
const regex KEYWORDS_REGEX = create_regex(KEYWORDS);
2122

2223
string escape_regex(const string& str) {
2324
static const regex special_chars(R"([.^$|()\\[*+?{}])");
@@ -67,7 +68,7 @@ Token get_token(string::iterator& sentinel, string::iterator end, pair<int, int>
6768
break;
6869
} else if (regex_search(actual_char, OPERATORS_REGEX)) {
6970
token.type = OPERATOR;
70-
} else if (regex_search(actual_char, NUMERIC_INCOMPLETE_REGEX)) {
71+
} else if (regex_search(actual_char, NUMERIC_REGEX)) {
7172
token.type = NUMBER;
7273
} else if (regex_search(actual_char, LITERAL_INCOMPLETE_REGEX)) {
7374
token.type = LITERAL;
@@ -79,6 +80,18 @@ Token get_token(string::iterator& sentinel, string::iterator end, pair<int, int>
7980
}
8081
token.content = content_updated;
8182
lexical_error(token);
83+
} else if (regex_search(content_updated, KEYWORDS_REGEX)) {
84+
token.type = KEYWORD;
85+
}
86+
} else if (token.type == KEYWORD) {
87+
if (!regex_search(content_updated, KEYWORDS_REGEX)) {
88+
if (regex_search(content_updated, ALPHA_NUMERIC_REGEX)) {
89+
token.type = IDENTIFICATOR;
90+
} else if (regex_search(actual_char, SEPARATORS_IMPORTANT_REGEX) || regex_search(actual_char, OPERATORS_REGEX)) {
91+
break;
92+
}
93+
token.content = content_updated;
94+
lexical_error(token);
8295
}
8396
} else if (token.type == OPERATOR) {
8497
if (regex_search(content_updated, COMMENT_LINE_REGEX) || regex_search(content_updated, COMMENT_BLOCK_INCOMPLETE_REGEX)) {
@@ -87,6 +100,7 @@ Token get_token(string::iterator& sentinel, string::iterator end, pair<int, int>
87100
if (regex_search(actual_char, SEPARATORS_IMPORTANT_REGEX) || regex_search(actual_char, ALPHA_REGEX) || regex_search(actual_char, NUMERIC_REGEX) || regex_search(actual_char, LITERAL_INCOMPLETE_REGEX)) {
88101
break;
89102
}
103+
break;
90104
token.content = content_updated;
91105
lexical_error(token);
92106
}
@@ -141,27 +155,6 @@ vector<Token> tokenize(string& buffer) {
141155
return tokens;
142156
}
143157

144-
string enum_type_string(TokenType type) {
145-
switch (type) {
146-
case NONE:
147-
return "NONE";
148-
case IDENTIFICATOR:
149-
return "IDENTIFICATOR";
150-
case NUMBER:
151-
return "NUMBER";
152-
case SEPARATOR:
153-
return "SEPARATOR";
154-
case OPERATOR:
155-
return "OPERATOR";
156-
case LITERAL:
157-
return "LITERAL";
158-
case COMMENT:
159-
return "COMMENT";
160-
default:
161-
return "UNKNOWN";
162-
}
163-
}
164-
165158
void lexical_error(Token token) {
166159
cout << "Invalid token " << token.content << " at line " << token.position.second << " (" << token.position.first << ")\n";
167160
exit(1);

0 commit comments

Comments
 (0)