|
|
|
@ -1,28 +1,81 @@ |
|
|
|
|
#include <fstream>
#include <iostream>
#include <numeric>
#include <stdexcept>

#include "Lexer.hpp"
|
|
|
|
|
|
|
|
|
namespace dumb { |
|
|
|
|
|
|
|
|
|
// Printable names of the token types. The table is indexed with the integer
// value of a Token::Type (see the TokenStream operator<<), so the entries
// presumably have to stay in the same order as the enumerators, which are
// declared elsewhere.
const std::vector<std::string> Token::ReverseType = {
    // words
    "KEYWORD", "IDENTIFIER",
    // operators
    "OPERATOR_PLUS", "OPERATOR_MINUS", "OPERATOR_ASTERISK", "OPERATOR_SLASH",
    "OPERATOR_EQUALS", "OPERATOR_DOT", "OPERATOR_COLON", "OPERATOR_ARROW",
    "OPERATOR_DOUBLE_COLON",
    // literals
    "INTEGER_LITERAL", "FLOAT_LITERAL", "STRING_LITERAL",
    // punctuation
    "SEMICOLON", "COMMA",
    // brackets
    "OPEN_PAREN", "CLOSE_PAREN",
    "OPEN_BRACE", "CLOSE_BRACE",
    "OPEN_BRACKET", "CLOSE_BRACKET"
};
|
|
|
|
|
|
|
|
|
// Reserved words of the language (presumably what Lexer::IsKeyword consults
// when readIdentifier() decides between KEYWORD and IDENTIFIER).
//
// Every entry must be followed by a comma: two adjacent string literals are
// silently concatenated by the compiler into a single, wrong keyword.
const std::vector<std::string> Lexer::Keywords = {
    // primitive types
    "int",
    "float",
    "u8",
    "u16",
    "u32",
    "u64",
    "s8",
    "s16",
    "s32",
    "s64",

    "string",

    // loops
    "while",
    "for",

    // branching
    "if",
    "else",

    "this",
    "namespace"
};
|
|
|
|
|
|
|
|
|
dumb::Lexer::Lexer(std::filesystem::path sourceFile) : line(1), column(1), pointer(0) { |
|
|
|
|
const std::unordered_map<std::string, Token::Type> Lexer::Operators = { |
|
|
|
|
{"+", Token::Type::OPERATOR_PLUS}, |
|
|
|
|
{"-", Token::Type::OPERATOR_MINUS}, |
|
|
|
|
{"/", Token::Type::OPERATOR_SLASH}, |
|
|
|
|
{"*", Token::Type::OPERATOR_ASTERISK}, |
|
|
|
|
{"=", Token::Type::OPERATOR_EQUALS}, |
|
|
|
|
{":", Token::Type::OPERATOR_COLON}, |
|
|
|
|
{".", Token::Type::OPERATOR_DOT} |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
/**
 * Builds a lexer over the contents of `sourceFile`.
 *
 * The whole file is read into `sourceCode` up front; `line` and `column`
 * start at 1 and `pointer` at 0.
 *
 * @param compiler   Compiler instance stored in the `compiler` member.
 * @param sourceFile Path of the source file to read.
 *
 * NOTE(review): a file that cannot be opened is not reported here; in that
 * case `sourceCode` is simply left empty.
 */
dumb::Lexer::Lexer(Compiler& compiler, std::filesystem::path sourceFile) : compiler(compiler), line(1), column(1), pointer(0) {
    std::ifstream file(sourceFile);
    std::stringstream buffer;

    // Slurp the file in one go through its stream buffer.
    buffer << file.rdbuf();

    this->sourceCode = buffer.str();
}
|
|
|
|
|
|
|
|
|
void Lexer::tokenize() { |
|
|
|
|
const Lexer::TokenStream& Lexer::tokenize() { |
|
|
|
|
while (this->current() != Lexer::EoF) { |
|
|
|
|
if (this->current() == Lexer::EOL) { |
|
|
|
|
this->consume(); |
|
|
|
@ -40,6 +93,10 @@ namespace dumb { |
|
|
|
|
char last = this->current(); |
|
|
|
|
while (true) { |
|
|
|
|
const char& cur = this->consume(); |
|
|
|
|
if (cur == Lexer::EOL) { |
|
|
|
|
this->line++; |
|
|
|
|
this->column = 1; |
|
|
|
|
} |
|
|
|
|
if (cur == '/' && last == '*' || cur == Lexer::EoF) |
|
|
|
|
break; |
|
|
|
|
last = cur; |
|
|
|
@ -56,25 +113,54 @@ namespace dumb { |
|
|
|
|
} else if (std::isalpha(this->current())) { |
|
|
|
|
this->readIdentifier(); |
|
|
|
|
continue; |
|
|
|
|
} else if (this->current() == '0' && (this->next() == 'x' || this->next() == 'b') && std::isalnum(this->peek(2))) { |
|
|
|
|
this->readBaseNumber(); |
|
|
|
|
continue; |
|
|
|
|
} else if (std::isdigit(this->current()) || this->current() == '.') { |
|
|
|
|
this->readNumber(); |
|
|
|
|
continue; |
|
|
|
|
} else if (Lexer::IsOperator(this->current())) { |
|
|
|
|
this->readOperator(); |
|
|
|
|
continue; |
|
|
|
|
} else if (this->current() == '"') { |
|
|
|
|
this->readStringLiteral(); |
|
|
|
|
continue; |
|
|
|
|
} else if (std::isdigit(this->current()) || this->current() == '.') { |
|
|
|
|
this->readNumber(); |
|
|
|
|
continue; |
|
|
|
|
} else { |
|
|
|
|
std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl; |
|
|
|
|
exit(1); |
|
|
|
|
Token::Type type; |
|
|
|
|
|
|
|
|
|
switch (this->current()) { |
|
|
|
|
case '{': |
|
|
|
|
type = Token::Type::OPEN_BRACE; |
|
|
|
|
break; |
|
|
|
|
case '}': |
|
|
|
|
type = Token::Type::CLOSE_BRACE; |
|
|
|
|
break; |
|
|
|
|
case '(': |
|
|
|
|
type = Token::Type::OPEN_PAREN; |
|
|
|
|
break; |
|
|
|
|
case ')': |
|
|
|
|
type = Token::Type::CLOSE_PAREN; |
|
|
|
|
break; |
|
|
|
|
case '[': |
|
|
|
|
type = Token::Type::OPEN_BRACKET; |
|
|
|
|
break; |
|
|
|
|
case ']': |
|
|
|
|
type = Token::Type::CLOSE_BRACKET; |
|
|
|
|
break; |
|
|
|
|
default: |
|
|
|
|
std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl; |
|
|
|
|
exit(1); |
|
|
|
|
} |
|
|
|
|
std::string value = std::string(1, this->current()); |
|
|
|
|
this->tokenStream.emplace_back(type, value, this->line, this->column); |
|
|
|
|
this->consume(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (auto& token : this->tokenStream) { |
|
|
|
|
std::cout << "type: " << static_cast<int>(token.type) << ", data: " << token.data << ", line: " << token.line << ", column: " << token.column << std::endl; |
|
|
|
|
} |
|
|
|
|
std::cout << this->tokenStream << std::endl; |
|
|
|
|
|
|
|
|
|
return this->tokenStream; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void Lexer::readIdentifier() { |
|
|
|
@ -90,12 +176,33 @@ namespace dumb { |
|
|
|
|
Token::Type type = Lexer::IsKeyword(data) ? Token::Type::KEYWORD : Token::Type::IDENTIFIER; |
|
|
|
|
|
|
|
|
|
this->tokenStream.emplace_back(type, data, line, column); |
|
|
|
|
this->column += data.length(); |
|
|
|
|
this->resetBuffer(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/**
 * Reads one operator at the current position and appends its token to the
 * token stream.
 *
 * The two-character operators "->" and "::" are matched first; anything else
 * is looked up as a single character in Lexer::Operators. An operator that is
 * not in the table prints a diagnostic and terminates the process.
 *
 * Exactly the characters that make up the operator are consumed, so the
 * character following it is left for the next tokenize() iteration. The token
 * carries no data; its type alone identifies the operator.
 */
void Lexer::readOperator() {
    // Record the position before consuming anything so the token points at
    // the operator's first character (same approach as readBaseNumber()).
    const auto line = this->line;
    const auto column = this->column;

    Token::Type type;

    if (this->current() == '-' && this->next() == '>') {
        type = Token::Type::OPERATOR_ARROW;
        this->consume();
        this->consume();
    } else if (this->current() == ':' && this->next() == ':') {
        type = Token::Type::OPERATOR_DOUBLE_COLON;
        this->consume();
        this->consume();
    } else {
        // One lookup serves both the "is it known?" check and the type fetch.
        const auto found = Lexer::Operators.find(std::string(1, this->current()));
        if (found == Lexer::Operators.end()) {
            std::cout << "unknown operator '" << this->current() << "' on line " << this->line << " column " << this->column << "." << std::endl;
            exit(1);
        }
        type = found->second;
        this->consume();
    }

    this->tokenStream.emplace_back(type, "", line, column);
}
|
|
|
|
|
|
|
|
@ -126,7 +233,7 @@ namespace dumb { |
|
|
|
|
std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl; |
|
|
|
|
exit(1); |
|
|
|
|
} |
|
|
|
|
floatingPoint = cur == '.'; |
|
|
|
|
floatingPoint = floatingPoint || cur == '.'; |
|
|
|
|
this->buffer << cur; |
|
|
|
|
} |
|
|
|
|
Token::Type type = floatingPoint ? Token::Type::FLOAT_LITERAL : Token::Type::INTEGER_LITERAL; |
|
|
|
@ -136,4 +243,66 @@ namespace dumb { |
|
|
|
|
this->resetBuffer(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
/**
 * Reads a prefixed integer literal ("0x..." hexadecimal or "0b..." binary) at
 * the current position and appends it to the token stream as an
 * INTEGER_LITERAL whose data is the value rendered in decimal.
 *
 * The first character is consumed as the leading '0' without being checked;
 * the second selects the base ('x' or 'b'). Any other selector is reported as
 * an unexpected token.
 *
 * A prefix that is not followed by at least one valid digit (e.g. "0xZ" or
 * "0b2"), or a literal too large for an unsigned long long, prints a
 * diagnostic and terminates the process rather than letting the conversion
 * throw out of the lexer.
 */
void Lexer::readBaseNumber() {
    // Remember where the literal starts so the token points at its first character.
    const size_t line = this->line;
    const size_t column = this->column;

    this->consume();                    // leading '0'
    const char kind = this->consume();  // base selector

    int base = 0;
    switch (kind) {
        case 'x':
            base = 16;
            // The cast keeps std::isxdigit well-defined for negative char values.
            while (std::isxdigit(static_cast<unsigned char>(this->current())))
                this->buffer << this->consume();
            break;
        case 'b':
            base = 2;
            while (this->current() == '0' || this->current() == '1')
                this->buffer << this->consume();
            break;
        default:
            std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
            exit(1);
    }

    const std::string digits = this->buffer.str();
    this->resetBuffer();

    // No digit after the prefix: the character we stopped on is the culprit.
    if (digits.empty()) {
        std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
        exit(1);
    }

    // std::stoull covers the full unsigned range; std::stoi would reject
    // anything above INT_MAX (e.g. 0xFFFFFFFF).
    unsigned long long value = 0;
    try {
        value = std::stoull(digits, nullptr, base);
    } catch (const std::out_of_range&) {
        std::cout << "Integer literal out of range on line " << line << " column " << column << std::endl;
        exit(1);
    }

    this->tokenStream.emplace_back(Token::Type::INTEGER_LITERAL, std::to_string(value), line, column);
}
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/**
 * Writes a token stream as an aligned table, one token per line:
 * the type name, the token data wrapped in "<...>" (left blank when the token
 * has no data), then "line:column" with the line number right-aligned.
 *
 * Column widths are derived from the longest type name in
 * Token::ReverseType, the longest data string and the largest line number
 * found in the stream.
 *
 * @param os          Stream to write to.
 * @param tokenStream Tokens to print.
 * @return `os`, to allow chaining.
 */
std::ostream& operator <<(std::ostream& os, const dumb::Lexer::TokenStream& tokenStream) {
    // Type column: longest known type name plus four characters of padding.
    size_t typeWidth = 0;
    for (const auto& name : dumb::Token::ReverseType)
        typeWidth = name.length() > typeWidth ? name.length() : typeWidth;
    typeWidth += 4;

    // Data column and line-number column are sized from the tokens themselves.
    size_t dataWidth = 0;
    size_t highestLine = 0;
    for (const auto& token : tokenStream) {
        dataWidth = token.data.length() > dataWidth ? token.data.length() : dataWidth;
        if (token.line > highestLine)
            highestLine = token.line;
    }
    dataWidth += 6;
    const size_t lineWidth = std::to_string(highestLine).length();

    for (const auto& token : tokenStream) {
        const std::string& typeName = dumb::Token::ReverseType.at(static_cast<int>(token.type));
        const std::string shownData = token.data.empty() ? "" : "<" + token.data + ">";

        os << std::left << std::setfill(' ') << std::setw(typeWidth) << typeName
           << std::setw(dataWidth) << shownData
           << std::setw(lineWidth) << std::right << token.line
           << ":" << std::left << std::setw(0) << token.column << '\n';
    }

    return os;
}
|
|
|
|