diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 4613fef..3c364ff 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -1,28 +1,81 @@
 #include <fstream>
 #include <iostream>
+#include <iomanip>
 #include "Lexer.hpp"

 namespace dumb {
+    const std::vector<std::string> Token::ReverseType = {
+        "KEYWORD",
+        "IDENTIFIER",
+        "OPERATOR_PLUS",
+        "OPERATOR_MINUS",
+        "OPERATOR_ASTERISK",
+        "OPERATOR_SLASH",
+        "OPERATOR_EQUALS",
+        "OPERATOR_DOT",
+        "OPERATOR_COLON",
+        "OPERATOR_ARROW",
+        "OPERATOR_DOUBLE_COLON",
+        "INTEGER_LITERAL",
+        "FLOAT_LITERAL",
+        "STRING_LITERAL",
+        "SEMICOLON",
+        "COMMA",
+        "OPEN_PAREN",
+        "CLOSE_PAREN",
+        "OPEN_BRACE",
+        "CLOSE_BRACE",
+        "OPEN_BRACKET",
+        "CLOSE_BRACKET"
+
+    };
+
     const std::vector<std::string> Lexer::Keywords = {
         "int",
         "float",
-        "string"
+        "u8",
+        "u16",
+        "u32",
+        "u64",
+        "s8",
+        "s16",
+        "s32",
+        "s64",
+
+        "string",
+
+        "while",
+        "for",
+
+        "if",
+        "else",
+
+        "this",
+        "namespace"
     };

-    dumb::Lexer::Lexer(std::filesystem::path sourceFile) : line(1), column(1), pointer(0) {
+    const std::unordered_map<std::string, Token::Type> Lexer::Operators = {
+        {"+", Token::Type::OPERATOR_PLUS},
+        {"-", Token::Type::OPERATOR_MINUS},
+        {"/", Token::Type::OPERATOR_SLASH},
+        {"*", Token::Type::OPERATOR_ASTERISK},
+        {"=", Token::Type::OPERATOR_EQUALS},
+        {":", Token::Type::OPERATOR_COLON},
+        {".", Token::Type::OPERATOR_DOT}
+    };
+
+    dumb::Lexer::Lexer(Compiler& compiler, std::filesystem::path sourceFile) : compiler(compiler), line(1), column(1), pointer(0) {
         std::ifstream file(sourceFile);
         std::stringstream buffer;
         buffer << file.rdbuf();
         this->sourceCode = buffer.str();
-
-        std::cout << this->sourceCode << std::endl;
     }

-    void Lexer::tokenize() {
+    const Lexer::TokenStream& Lexer::tokenize() {
         while (this->current() != Lexer::EoF) {
             if (this->current() == Lexer::EOL) {
                 this->consume();
@@ -40,6 +93,10 @@ namespace dumb {
                 char last = this->current();
                 while (true) {
                     const char& cur = this->consume();
+                    if (cur == Lexer::EOL) {
+                        this->line++;
+                        this->column = 1;
+                    }
                     if (cur == '/' && last == '*' || cur == Lexer::EoF)
                         break;
                     last = cur;
@@ -56,25 +113,54 @@ namespace dumb {
             } else if (std::isalpha(this->current())) {
                 this->readIdentifier();
                 continue;
+            } else if (this->current() == '0' && (this->next() == 'x' || this->next() == 'b') && std::isalnum(this->peek(2))) {
+                this->readBaseNumber();
+                continue;
+            } else if (std::isdigit(this->current()) || this->current() == '.') {
+                this->readNumber();
+                continue;
             } else if (Lexer::IsOperator(this->current())) {
                 this->readOperator();
                 continue;
             } else if (this->current() == '"') {
                 this->readStringLiteral();
                 continue;
-            } else if (std::isdigit(this->current()) || this->current() == '.') {
-                this->readNumber();
-                continue;
             } else {
-                std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
-                exit(1);
+                Token::Type type;
+
+                switch (this->current()) {
+                    case '{':
+                        type = Token::Type::OPEN_BRACE;
+                        break;
+                    case '}':
+                        type = Token::Type::CLOSE_BRACE;
+                        break;
+                    case '(':
+                        type = Token::Type::OPEN_PAREN;
+                        break;
+                    case ')':
+                        type = Token::Type::CLOSE_PAREN;
+                        break;
+                    case '[':
+                        type = Token::Type::OPEN_BRACKET;
+                        break;
+                    case ']':
+                        type = Token::Type::CLOSE_BRACKET;
+                        break;
+                    default:
+                        std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
+                        exit(1);
+                }
+                std::string value = std::string(1, this->current());
+                this->tokenStream.emplace_back(type, value, this->line, this->column);
+                this->consume();
             }
         }

-        for (auto& token : this->tokenStream) {
-            std::cout << "type: " << static_cast<int>(token.type) << ", data: " << token.data << ", line: " << token.line << ", column: " << token.column << std::endl;
-        }
+        std::cout << this->tokenStream << std::endl;
+
+        return this->tokenStream;

     }

     void Lexer::readIdentifier() {
@@ -90,12 +176,33 @@ namespace dumb {
         Token::Type type = Lexer::IsKeyword(data) ? Token::Type::KEYWORD : Token::Type::IDENTIFIER;

         this->tokenStream.emplace_back(type, data, line, column);
-        this->column += data.length();
         this->resetBuffer();
     }

     void Lexer::readOperator() {
-        this->tokenStream.emplace_back(Token::Type::OPERATOR, std::string(1, this->current()), this->line, this->column);
+        Token::Type type;
+
+        if (this->current() == '-' && this->next() == '>') {
+            type = Token::Type::OPERATOR_ARROW;
+            this->consume();
+            this->consume();
+        }
+        else if (this->current() == ':' && this->next() == ':') {
+            type = Token::Type::OPERATOR_DOUBLE_COLON;
+            this->consume();
+            this->consume();
+        }
+        else {
+            if (Lexer::Operators.find(std::string(1, this->current())) == Lexer::Operators.end()) {
+                std::cout << "unknown operator '" << this->current() << "' on line " << this->line << " column " << this->column << "." << std::endl;
+                exit(1);
+            }
+            type = Lexer::Operators.at(std::string(1, this->current()));
+            this->consume();
+        }
+
+
+        this->tokenStream.emplace_back(type, "", this->line, this->column);
         this->consume();
     }

@@ -126,7 +233,7 @@ namespace dumb {
                 std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
                 exit(1);
             }
-            floatingPoint = cur == '.';
+            floatingPoint = floatingPoint || cur == '.';
             this->buffer << cur;
         }
         Token::Type type = floatingPoint ? Token::Type::FLOAT_LITERAL : Token::Type::INTEGER_LITERAL;
@@ -136,4 +243,66 @@ namespace dumb {

         this->resetBuffer();
     }
-}
\ No newline at end of file
+    void Lexer::readBaseNumber() {
+        size_t line = this->line;
+        size_t column = this->column;
+
+        this->consume();
+        const char& kind = this->consume();
+
+        size_t value;
+
+        switch (kind) {
+            case 'x':
+                while (std::isdigit(this->current()) || (this->current() >= 'A' && this->current() <= 'F') || (this->current() >= 'a' && this->current() <= 'f'))
+                    this->buffer << this->consume();
+                value = std::stoi(this->buffer.str(), nullptr, 16);
+                break;
+            case 'b':
+                while (this->current() == '0' || this->current() == '1')
+                    this->buffer << this->consume();
+                value = std::stoi(this->buffer.str(), nullptr, 2);
+                break;
+            default:
+                std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
+                exit(1);
+        }
+        this->tokenStream.emplace_back(Token::Type::INTEGER_LITERAL, std::to_string(value), line, column);
+        this->resetBuffer();
+    }
+
+}
+
+std::ostream& operator <<(std::ostream& os, const dumb::Lexer::TokenStream& tokenStream) {
+    size_t maxLength = 0;
+    size_t maxDataLength = 0;
+    size_t maxLine = 0;
+
+    for (auto& string : dumb::Token::ReverseType) {
+        if (string.length() > maxLength)
+            maxLength = string.length();
+    }
+    maxLength += 4;
+
+    for (auto& token : tokenStream) {
+        if (token.data.length() > maxDataLength)
+            maxDataLength = token.data.length();
+        if (token.line > maxLine)
+            maxLine = token.line;
+    }
+    maxDataLength += 6;
+    maxLine = std::to_string(maxLine).length();
+
+
+    for (auto& token : tokenStream) {
+        const std::string& type = dumb::Token::ReverseType.at(static_cast<size_t>(token.type));
"" : "<" + token.data + ">"; + + os << std::left << std::setfill(' ') << std::setw(maxLength); + os << type << std::setw(maxDataLength); + os << data << std::setw(maxLine) << std::right; + os << token.line << ":" << std::left << std::setw(0) << token.column << '\n'; + } + + return os; +} diff --git a/src/Lexer.hpp b/src/Lexer.hpp index cc1fba1..89dbc62 100644 --- a/src/Lexer.hpp +++ b/src/Lexer.hpp @@ -4,14 +4,24 @@ #include #include #include +#include namespace dumb { + class Compiler; struct Token { enum class Type { KEYWORD, IDENTIFIER, - OPERATOR, + OPERATOR_PLUS, + OPERATOR_MINUS, + OPERATOR_ASTERISK, + OPERATOR_SLASH, + OPERATOR_EQUALS, + OPERATOR_DOT, + OPERATOR_COLON, + OPERATOR_ARROW, + OPERATOR_DOUBLE_COLON, INTEGER_LITERAL, FLOAT_LITERAL, STRING_LITERAL, @@ -25,6 +35,8 @@ namespace dumb { CLOSE_BRACKET }; + static const std::vector ReverseType; + Type type; std::string data; size_t line; @@ -34,15 +46,19 @@ namespace dumb { }; class Lexer { + public: + typedef std::vector TokenStream; private: + Compiler& compiler; std::string sourceCode; - std::vector tokenStream; + TokenStream tokenStream; std::stringstream buffer; size_t line; size_t column; size_t pointer; static const std::vector Keywords; + static const std::unordered_map Operators; void resetBuffer() { this->buffer.str(""); @@ -95,13 +111,16 @@ namespace dumb { void readOperator(); void readStringLiteral(); void readNumber(); + void readBaseNumber(); public: constexpr static char EoF = '\0'; constexpr static char EOL = '\n'; - explicit Lexer(std::filesystem::path sourceFile); + explicit Lexer(Compiler& compiler, std::filesystem::path sourceFile); - void tokenize(); + const TokenStream& tokenize(); }; } +std::ostream& operator <<(std::ostream& os, const dumb::Lexer::TokenStream& tokenStream); + diff --git a/tests/test.dmb b/tests/test.dmb index 0f469f9..7b0b978 100644 --- a/tests/test.dmb +++ b/tests/test.dmb @@ -1,4 +1,4 @@ -foo : int = 55; +foo : int = 55 + 2; myFloat : float = 3.1415; // this is a comment @@ -8,4 +8,12 @@ stupid block comment that goes for ages */ -bar : string = "lol"; // more comments \ No newline at end of file +binary := 0b01110; +hex := 0xFF; + + +bar : string = "lol"; // more comments + +main :: (args : int, argv : string[]) -> int { + return 0; +} \ No newline at end of file