now lexing keywords, operator `::` and operator `->`

also implemented `operator <<` for `dumb::Lexer::TokenStream`
development
Michael Ochmann 3 years ago
parent 8366eadb62
commit a1391708b2
  1. 203
      src/Lexer.cpp
  2. 27
      src/Lexer.hpp
  3. 12
      tests/test.dmb

@ -1,28 +1,81 @@
#include <fstream>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include "Lexer.hpp"
namespace dumb {
// Printable names of the token types, used when dumping a token stream.
// The stream output operator looks a name up with
// `ReverseType.at(static_cast<int>(token.type))`, so the entries must stay in
// exactly the same order as the enumerators of `Token::Type`; adding or
// reordering an enumerator requires the matching change here.
const std::vector<std::string> Token::ReverseType = {
"KEYWORD",
"IDENTIFIER",
"OPERATOR_PLUS",
"OPERATOR_MINUS",
"OPERATOR_ASTERISK",
"OPERATOR_SLASH",
"OPERATOR_EQUALS",
"OPERATOR_DOT",
"OPERATOR_COLON",
"OPERATOR_ARROW",
"OPERATOR_DOUBLE_COLON",
"INTEGER_LITERAL",
"FLOAT_LITERAL",
"STRING_LITERAL",
"SEMICOLON",
"COMMA",
"OPEN_PAREN",
"CLOSE_PAREN",
"OPEN_BRACE",
"CLOSE_BRACE",
"OPEN_BRACKET",
"CLOSE_BRACKET"
};
// Reserved words of the language. readIdentifier() classifies a word as
// KEYWORD instead of IDENTIFIER via Lexer::IsKeyword(), which presumably
// searches this list (its definition is not visible here).
//
// Every entry needs its own trailing comma: two adjacent string literals are
// silently concatenated by the compiler (e.g. "string" "u8" becomes the single
// entry "stringu8"), which would drop a keyword without any diagnostic.
const std::vector<std::string> Lexer::Keywords = {
	"int",
	"float",
	"u8",
	"u16",
	"u32",
	"u64",
	"s8",
	"s16",
	"s32",
	"s64",
	"string",
	"while",
	"for",
	"if",
	"else",
	"this",
	"namespace"
};
dumb::Lexer::Lexer(std::filesystem::path sourceFile) : line(1), column(1), pointer(0) {
// Single-character operators and the token type each one produces.
// readOperator() recognises the two-character operators `->` and `::` on its
// own before falling back to this table, so they are intentionally absent.
const std::unordered_map<std::string, Token::Type> Lexer::Operators = {
{"+", Token::Type::OPERATOR_PLUS},
{"-", Token::Type::OPERATOR_MINUS},
{"/", Token::Type::OPERATOR_SLASH},
{"*", Token::Type::OPERATOR_ASTERISK},
{"=", Token::Type::OPERATOR_EQUALS},
{":", Token::Type::OPERATOR_COLON},
{".", Token::Type::OPERATOR_DOT}
};
/**
 * Creates a lexer positioned at line 1, column 1 of the given source file.
 *
 * The whole file is read into `sourceCode` up front and echoed to stdout.
 * If the file cannot be opened a diagnostic is printed and the process exits
 * with status 1, instead of silently lexing an empty source.
 *
 * @param compiler   compiler instance this lexer keeps a reference to
 * @param sourceFile path of the file to lex
 */
dumb::Lexer::Lexer(Compiler& compiler, std::filesystem::path sourceFile) : compiler(compiler), line(1), column(1), pointer(0) {
	std::ifstream file(sourceFile);
	if (!file.is_open()) {
		std::cout << "Could not open source file '" << sourceFile.string() << "'" << std::endl;
		exit(1);
	}

	// Named `contents` so it does not shadow the lexer's own `buffer` member.
	std::stringstream contents;
	contents << file.rdbuf();
	this->sourceCode = contents.str();

	std::cout << this->sourceCode << std::endl;
}
void Lexer::tokenize() {
const Lexer::TokenStream& Lexer::tokenize() {
while (this->current() != Lexer::EoF) {
if (this->current() == Lexer::EOL) {
this->consume();
@ -40,6 +93,10 @@ namespace dumb {
char last = this->current();
while (true) {
const char& cur = this->consume();
if (cur == Lexer::EOL) {
this->line++;
this->column = 1;
}
if (cur == '/' && last == '*' || cur == Lexer::EoF)
break;
last = cur;
@ -56,25 +113,54 @@ namespace dumb {
} else if (std::isalpha(this->current())) {
this->readIdentifier();
continue;
} else if (this->current() == '0' && (this->next() == 'x' || this->next() == 'b') && std::isalnum(this->peek(2))) {
this->readBaseNumber();
continue;
} else if (std::isdigit(this->current()) || this->current() == '.') {
this->readNumber();
continue;
} else if (Lexer::IsOperator(this->current())) {
this->readOperator();
continue;
} else if (this->current() == '"') {
this->readStringLiteral();
continue;
} else if (std::isdigit(this->current()) || this->current() == '.') {
this->readNumber();
continue;
} else {
std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
exit(1);
Token::Type type;
switch (this->current()) {
case '{':
type = Token::Type::OPEN_BRACE;
break;
case '}':
type = Token::Type::CLOSE_BRACE;
break;
case '(':
type = Token::Type::OPEN_PAREN;
break;
case ')':
type = Token::Type::CLOSE_PAREN;
break;
case '[':
type = Token::Type::OPEN_BRACKET;
break;
case ']':
type = Token::Type::CLOSE_BRACKET;
break;
default:
std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
exit(1);
}
std::string value = std::string(1, this->current());
this->tokenStream.emplace_back(type, value, this->line, this->column);
this->consume();
}
}
for (auto& token : this->tokenStream) {
std::cout << "type: " << static_cast<int>(token.type) << ", data: " << token.data << ", line: " << token.line << ", column: " << token.column << std::endl;
}
std::cout << this->tokenStream << std::endl;
return this->tokenStream;
}
void Lexer::readIdentifier() {
@ -90,12 +176,33 @@ namespace dumb {
Token::Type type = Lexer::IsKeyword(data) ? Token::Type::KEYWORD : Token::Type::IDENTIFIER;
this->tokenStream.emplace_back(type, data, line, column);
this->column += data.length();
this->resetBuffer();
}
/**
 * Lexes the operator at the current position and appends exactly one token
 * for it to the token stream.
 *
 * The two-character operators `->` and `::` are matched first; any other
 * character is looked up in `Lexer::Operators`. An operator that is in neither
 * set is reported on stdout and terminates the process with status 1.
 *
 * The token carries no data (its type already identifies the operator) and is
 * stamped with the position at which the operator starts. Only the characters
 * that belong to the operator are consumed, so an operator directly followed
 * by another token (e.g. the `=` in `:=`) does not swallow that token's first
 * character.
 */
void Lexer::readOperator() {
	// Remember where the operator begins before consuming anything.
	const size_t line = this->line;
	const size_t column = this->column;

	Token::Type type;
	if (this->current() == '-' && this->next() == '>') {
		type = Token::Type::OPERATOR_ARROW;
		this->consume();
		this->consume();
	}
	else if (this->current() == ':' && this->next() == ':') {
		type = Token::Type::OPERATOR_DOUBLE_COLON;
		this->consume();
		this->consume();
	}
	else {
		// One lookup serves both the existence check and the type retrieval.
		const auto entry = Lexer::Operators.find(std::string(1, this->current()));
		if (entry == Lexer::Operators.end()) {
			std::cout << "unknown operator '" << this->current() << "' on line " << this->line << " column " << this->column << "." << std::endl;
			exit(1);
		}
		type = entry->second;
		this->consume();
	}

	this->tokenStream.emplace_back(type, "", line, column);
}
@ -126,7 +233,7 @@ namespace dumb {
std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
exit(1);
}
floatingPoint = cur == '.';
floatingPoint = floatingPoint || cur == '.';
this->buffer << cur;
}
Token::Type type = floatingPoint ? Token::Type::FLOAT_LITERAL : Token::Type::INTEGER_LITERAL;
@ -136,4 +243,66 @@ namespace dumb {
this->resetBuffer();
}
}
/**
 * Lexes a prefixed integer literal (`0x…` hexadecimal or `0b…` binary) that
 * starts at the current position and appends it to the token stream as an
 * INTEGER_LITERAL whose data is the value rendered in decimal.
 *
 * The token is stamped with the position of the leading `0`.
 *
 * A diagnostic is printed to stdout and the process exits with status 1 when
 *  - the prefix letter is neither `x` nor `b`,
 *  - no valid digit follows the prefix (e.g. `0xg`), or
 *  - the value does not fit into an unsigned 64-bit conversion.
 */
void Lexer::readBaseNumber() {
	const size_t line = this->line;
	const size_t column = this->column;

	this->consume(); // skip the leading '0'
	const char kind = this->consume();

	int base = 0;
	switch (kind) {
		case 'x':
			base = 16;
			while (std::isxdigit(static_cast<unsigned char>(this->current())))
				this->buffer << this->consume();
			break;
		case 'b':
			base = 2;
			while (this->current() == '0' || this->current() == '1')
				this->buffer << this->consume();
			break;
		default:
			// The prefix letter has already been consumed, so report it rather
			// than whatever character happens to follow it.
			std::cout << "Unexpected token '" << kind << "' on line " << this->line << " column " << this->column << std::endl;
			exit(1);
	}

	const std::string digits = this->buffer.str();
	if (digits.empty()) {
		// The prefix was followed by something that is not a digit of this
		// base; the offending character is the one we stopped at.
		std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
		exit(1);
	}

	// stoull instead of stoi: literals such as 0xFFFFFFFF exceed the range of int.
	unsigned long long value = 0;
	try {
		value = std::stoull(digits, nullptr, base);
	} catch (const std::out_of_range&) {
		std::cout << "Integer literal out of range on line " << line << " column " << column << std::endl;
		exit(1);
	}

	this->tokenStream.emplace_back(Token::Type::INTEGER_LITERAL, std::to_string(value), line, column);
	this->resetBuffer();
}
}
/**
 * Pretty-prints a token stream, one token per line, in aligned columns:
 * the token type name, the token data wrapped in `<…>` (left blank when the
 * token carries no data) and the `line:column` position.
 *
 * Column widths are derived from the longest type name, the longest token
 * data and the number of digits of the highest line number.
 *
 * The stream's formatting flags and fill character are restored before
 * returning, so the alignment manipulators used here do not leak into
 * whatever the caller prints next on the same stream.
 *
 * @param os          stream to write to
 * @param tokenStream tokens to print
 * @return `os`, to allow chaining
 */
std::ostream& operator <<(std::ostream& os, const dumb::Lexer::TokenStream& tokenStream) {
	// Width of the type column: longest known type name plus padding.
	size_t typeWidth = 0;
	for (const auto& name : dumb::Token::ReverseType) {
		if (name.length() > typeWidth)
			typeWidth = name.length();
	}
	typeWidth += 4;

	// Width of the data column and the highest line number in the stream.
	size_t dataWidth = 0;
	size_t highestLine = 0;
	for (const auto& token : tokenStream) {
		if (token.data.length() > dataWidth)
			dataWidth = token.data.length();
		if (token.line > highestLine)
			highestLine = token.line;
	}
	dataWidth += 6;
	const size_t lineWidth = std::to_string(highestLine).length();

	// std::left / std::right / std::setfill are sticky; remember the caller's state.
	const std::ios_base::fmtflags savedFlags = os.flags();
	const char savedFill = os.fill();

	for (const auto& token : tokenStream) {
		const std::string& type = dumb::Token::ReverseType.at(static_cast<size_t>(token.type));
		const std::string data = token.data.empty() ? "" : "<" + token.data + ">";

		os << std::left << std::setfill(' ') << std::setw(static_cast<int>(typeWidth)) << type;
		os << std::setw(static_cast<int>(dataWidth)) << data;
		os << std::right << std::setw(static_cast<int>(lineWidth)) << token.line;
		os << ":" << std::left << token.column << '\n';
	}

	os.flags(savedFlags);
	os.fill(savedFill);
	return os;
}

@ -4,14 +4,24 @@
#include <string>
#include <sstream>
#include <vector>
#include <unordered_map>
namespace dumb {
class Compiler;
struct Token {
enum class Type {
KEYWORD,
IDENTIFIER,
OPERATOR,
OPERATOR_PLUS,
OPERATOR_MINUS,
OPERATOR_ASTERISK,
OPERATOR_SLASH,
OPERATOR_EQUALS,
OPERATOR_DOT,
OPERATOR_COLON,
OPERATOR_ARROW,
OPERATOR_DOUBLE_COLON,
INTEGER_LITERAL,
FLOAT_LITERAL,
STRING_LITERAL,
@ -25,6 +35,8 @@ namespace dumb {
CLOSE_BRACKET
};
static const std::vector<std::string> ReverseType;
Type type;
std::string data;
size_t line;
@ -34,15 +46,19 @@ namespace dumb {
};
class Lexer {
public:
typedef std::vector<Token> TokenStream;
private:
Compiler& compiler;
std::string sourceCode;
std::vector<Token> tokenStream;
TokenStream tokenStream;
std::stringstream buffer;
size_t line;
size_t column;
size_t pointer;
static const std::vector<std::string> Keywords;
static const std::unordered_map<std::string, Token::Type> Operators;
void resetBuffer() {
this->buffer.str("");
@ -95,13 +111,16 @@ namespace dumb {
void readOperator();
void readStringLiteral();
void readNumber();
void readBaseNumber();
public:
constexpr static char EoF = '\0';
constexpr static char EOL = '\n';
explicit Lexer(std::filesystem::path sourceFile);
explicit Lexer(Compiler& compiler, std::filesystem::path sourceFile);
void tokenize();
const TokenStream& tokenize();
};
}
std::ostream& operator <<(std::ostream& os, const dumb::Lexer::TokenStream& tokenStream);

@ -1,4 +1,4 @@
foo : int = 55;
foo : int = 55 + 2;
myFloat : float = 3.1415;
// this is a comment
@ -8,4 +8,12 @@ stupid block comment
that goes for ages
*/
bar : string = "lol"; // more comments
binary := 0b01110;
hex := 0xFF;
bar : string = "lol"; // more comments
main :: (args : int, argv : string[]) -> int {
return 0;
}
Loading…
Cancel
Save