now lexing keywords, operator `::` and operator `->`

also implemented `operator <<` for `dumb::Lexer::TokenStream`
3 years ago · a1391708b2
parent 8366eadb62
commit a1391708b2
3 changed files with 219 additions and 23 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -1,28 +1,81 @@
 #include <fstream>
 #include <iostream>
+#include <numeric>

 #include "Lexer.hpp"

 namespace dumb {

+	const std::vector<std::string> Token::ReverseType = {
+		"KEYWORD",
+		"IDENTIFIER",
+		"OPERATOR_PLUS",
+		"OPERATOR_MINUS",
+		"OPERATOR_ASTERISK",
+		"OPERATOR_SLASH",
+		"OPERATOR_EQUALS",
+		"OPERATOR_DOT",
+		"OPERATOR_COLON",
+		"OPERATOR_ARROW",
+		"OPERATOR_DOUBLE_COLON",
+		"INTEGER_LITERAL",
+		"FLOAT_LITERAL",
+		"STRING_LITERAL",
+		"SEMICOLON",
+		"COMMA",
+		"OPEN_PAREN",
+		"CLOSE_PAREN",
+		"OPEN_BRACE",
+		"CLOSE_BRACE",
+		"OPEN_BRACKET",
+		"CLOSE_BRACKET"
+
+	};
+
 	const std::vector<std::string> Lexer::Keywords = {
 			"int",
 			"float",
-			"string"
+			"u8",
+			"u16",
+			"u32",
+			"u64",
+			"s8",
+			"s16",
+			"s32",
+			"s64",
+
+			"string",
+
+			"while",
+			"for",
+
+			"if",
+			"else",
+
+			"this",
+			"namespace"
 	};

-	dumb::Lexer::Lexer(std::filesystem::path sourceFile) : line(1), column(1), pointer(0) {
+	const std::unordered_map<std::string, Token::Type> Lexer::Operators = {
+			{"+", Token::Type::OPERATOR_PLUS},
+			{"-", Token::Type::OPERATOR_MINUS},
+			{"/", Token::Type::OPERATOR_SLASH},
+			{"*", Token::Type::OPERATOR_ASTERISK},
+			{"=", Token::Type::OPERATOR_EQUALS},
+			{":", Token::Type::OPERATOR_COLON},
+			{".", Token::Type::OPERATOR_DOT}
+	};
+
+	dumb::Lexer::Lexer(Compiler& compiler, std::filesystem::path sourceFile) : compiler(compiler), line(1), column(1), pointer(0) {
 		std::ifstream file(sourceFile);
 		std::stringstream buffer;

 		buffer << file.rdbuf();

 		this->sourceCode = buffer.str();
-
-		std::cout << this->sourceCode << std::endl;
 	}

-	void Lexer::tokenize() {
+	const Lexer::TokenStream& Lexer::tokenize() {
 		while (this->current() != Lexer::EoF) {
 			if (this->current() == Lexer::EOL) {
 				this->consume();
@ -40,6 +93,10 @@ namespace dumb {
 				char last = this->current();
 				while (true) {
 					const char& cur = this->consume();
+					if (cur == Lexer::EOL) {
+						this->line++;
+						this->column = 1;
+					}
 					if (cur == '/' && last == '*' || cur == Lexer::EoF)
 						break;
 					last = cur;
@ -56,25 +113,54 @@ namespace dumb {
 			} else if (std::isalpha(this->current())) {
 				this->readIdentifier();
 				continue;
+			} else if (this->current() == '0' && (this->next() == 'x' || this->next() == 'b') && std::isalnum(this->peek(2))) {
+				this->readBaseNumber();
+				continue;
+			} else if (std::isdigit(this->current()) || this->current() == '.') {
+				this->readNumber();
+				continue;
 			} else if (Lexer::IsOperator(this->current())) {
 				this->readOperator();
 				continue;
 			} else if (this->current() == '"') {
 				this->readStringLiteral();
 				continue;
-			} else if (std::isdigit(this->current()) || this->current() == '.') {
-				this->readNumber();
-				continue;
 			} else {
-				std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
-				exit(1);
+				Token::Type type;
+
+				switch (this->current()) {
+					case '{':
+						type = Token::Type::OPEN_BRACE;
+						break;
+					case '}':
+						type = Token::Type::CLOSE_BRACE;
+						break;
+					case '(':
+						type = Token::Type::OPEN_PAREN;
+						break;
+					case ')':
+						type = Token::Type::CLOSE_PAREN;
+						break;
+					case '[':
+						type = Token::Type::OPEN_BRACKET;
+						break;
+					case ']':
+						type = Token::Type::CLOSE_BRACKET;
+						break;
+					default:
+						std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
+						exit(1);
+				}
+				std::string value = std::string(1, this->current());
+				this->tokenStream.emplace_back(type, value, this->line, this->column);
+				this->consume();
 			}

 		}

-		for (auto& token : this->tokenStream) {
-			std::cout << "type: " << static_cast<int>(token.type) << ", data: " << token.data << ", line: " << token.line << ", column: " << token.column << std::endl;
-		}
+		std::cout << this->tokenStream << std::endl;
+
+		return this->tokenStream;
 	}

 	void Lexer::readIdentifier() {
@ -90,12 +176,33 @@ namespace dumb {
 		Token::Type type = Lexer::IsKeyword(data) ? Token::Type::KEYWORD : Token::Type::IDENTIFIER;

 		this->tokenStream.emplace_back(type, data, line, column);
-		this->column += data.length();
 		this->resetBuffer();
 	}

 	void Lexer::readOperator() {
-		this->tokenStream.emplace_back(Token::Type::OPERATOR, std::string(1, this->current()), this->line, this->column);
+		Token::Type type;
+
+		if (this->current() == '-' && this->next() == '>') {
+			type = Token::Type::OPERATOR_ARROW;
+			this->consume();
+			this->consume();
+		}
+		else if (this->current() == ':' && this->next() == ':') {
+			type = Token::Type::OPERATOR_DOUBLE_COLON;
+			this->consume();
+			this->consume();
+		}
+		else {
+			if (Lexer::Operators.find(std::string(1, this->current())) == Lexer::Operators.end()) {
+				std::cout << "unknown operator '" << this->current() << "' on line " << this->line << " column " << this->column << "." << std::endl;
+				exit(1);
+			}
+			type = Lexer::Operators.at(std::string(1, this->current()));
+			this->consume();
+		}
+
+
+		this->tokenStream.emplace_back(type, "", this->line, this->column);
 		this->consume();
 	}

@ -126,7 +233,7 @@ namespace dumb {
 				std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
 				exit(1);
 			}
-			floatingPoint = cur == '.';
+			floatingPoint = floatingPoint || cur == '.';
 			this->buffer << cur;
 		}
 		Token::Type type = floatingPoint ? Token::Type::FLOAT_LITERAL : Token::Type::INTEGER_LITERAL;
@ -136,4 +243,66 @@ namespace dumb {
 		this->resetBuffer();
 	}

-}
+	void Lexer::readBaseNumber() {
+		size_t line   = this->line;
+		size_t column = this->column;
+
+		this->consume();
+		const char& kind = this->consume();
+
+		size_t value;
+
+		switch (kind) {
+			case 'x':
+				while (std::isdigit(this->current()) || (this->current() >= 'A' && this->current() <= 'F') || (this->current() >= 'a' && this->current() <= 'f'))
+					this->buffer << this->consume();
+				value = std::stoi(this->buffer.str(), nullptr, 16);
+				break;
+			case 'b':
+				while (this->current() == '0' || this->current() == '1')
+					this->buffer << this->consume();
+				value = std::stoi(this->buffer.str(), nullptr, 2);
+				break;
+			default:
+				std::cout << "Unexpected token '" << this->current() << "' on line " << this->line << " column " << this->column << std::endl;
+				exit(1);
+		}
+		this->tokenStream.emplace_back(Token::Type::INTEGER_LITERAL, std::to_string(value), line, column);
+		this->resetBuffer();
+	}
+
+}
+
+std::ostream& operator <<(std::ostream& os, const dumb::Lexer::TokenStream& tokenStream) {
+	size_t maxLength     = 0;
+	size_t maxDataLength = 0;
+	size_t maxLine       = 0;
+
+	for (auto& string : dumb::Token::ReverseType) {
+		if (string.length() > maxLength)
+			maxLength = string.length();
+	}
+	maxLength += 4;
+
+	for (auto& token : tokenStream) {
+		if (token.data.length() > maxDataLength)
+			maxDataLength = token.data.length();
+		if (token.line > maxLine)
+			maxLine = token.line;
+	}
+	maxDataLength += 6;
+	maxLine = std::to_string(maxLine).length();
+
+
+	for (auto& token : tokenStream) {
+		const std::string& type = dumb::Token::ReverseType.at(static_cast<int>(token.type));
+		std::string data = token.data.empty() ? "" : "<" + token.data + ">";
+
+		os << std::left << std::setfill(' ') << std::setw(maxLength);
+		os << type << std::setw(maxDataLength);
+		os << data << std::setw(maxLine) << std::right;
+		os << token.line << ":" << std::left << std::setw(0) << token.column << '\n';
+	}
+
+	return os;
+}
--- a/src/Lexer.hpp
+++ b/src/Lexer.hpp
@ -4,14 +4,24 @@
 #include <string>
 #include <sstream>
 #include <vector>
+#include <unordered_map>

 namespace dumb {
+	class Compiler;

 	struct Token {
 		enum class Type {
 			KEYWORD,
 			IDENTIFIER,
-			OPERATOR,
+			OPERATOR_PLUS,
+			OPERATOR_MINUS,
+			OPERATOR_ASTERISK,
+			OPERATOR_SLASH,
+			OPERATOR_EQUALS,
+			OPERATOR_DOT,
+			OPERATOR_COLON,
+			OPERATOR_ARROW,
+			OPERATOR_DOUBLE_COLON,
 			INTEGER_LITERAL,
 			FLOAT_LITERAL,
 			STRING_LITERAL,
@ -25,6 +35,8 @@ namespace dumb {
 			CLOSE_BRACKET
 		};

+		static const std::vector<std::string> ReverseType;
+
 		Type type;
 		std::string data;
 		size_t line;
@ -34,15 +46,19 @@ namespace dumb {
 	};

 	class Lexer {
+		public:
+			typedef std::vector<Token> TokenStream;
 		private:
+			Compiler& compiler;
 			std::string sourceCode;
-			std::vector<Token> tokenStream;
+			TokenStream tokenStream;
 			std::stringstream buffer;
 			size_t line;
 			size_t column;
 			size_t pointer;

 			static const std::vector<std::string> Keywords;
+			static const std::unordered_map<std::string, Token::Type> Operators;

 			void resetBuffer() {
 				this->buffer.str("");
@ -95,13 +111,16 @@ namespace dumb {
 			void readOperator();
 			void readStringLiteral();
 			void readNumber();
+			void readBaseNumber();
 		public:
 			constexpr static char EoF = '\0';
 			constexpr static char EOL = '\n';
-			explicit Lexer(std::filesystem::path sourceFile);
+			explicit Lexer(Compiler& compiler, std::filesystem::path sourceFile);

-			void tokenize();
+			const TokenStream& tokenize();
 	};

 }

+std::ostream& operator <<(std::ostream& os, const dumb::Lexer::TokenStream& tokenStream);
+
--- a/tests/test.dmb
+++ b/tests/test.dmb
@ -1,4 +1,4 @@
-foo : int = 55;
+foo : int = 55 + 2;
 myFloat : float = 3.1415;

 // this is a comment
@ -8,4 +8,12 @@ stupid block comment
 that goes for ages
 */

-bar : string = "lol"; // more comments
+binary := 0b01110;
+hex := 0xFF;
+
+
+bar : string = "lol"; // more comments
+
+main :: (args : int, argv : string[]) -> int {
+    return 0;
+}