From 4eb24000e1704edcea960d190764e234441f9319 Mon Sep 17 00:00:00 2001
From: Michael Ochmann
Date: Tue, 23 Aug 2022 15:18:31 +0200
Subject: [PATCH] better asserts; start of implementing strict/non strict mode

---
Review notes (this section is ignored by `git am`):

* What the patch does: adds a `$strict` flag to Parser, an `insert()` helper
  that splices a token into the stream at the current pointer, and the
  `LOC()` / `Assert()` helpers that prefix assertion messages with
  "file:row:col: ERROR: ". In non-strict mode parseBold(), parseItalic() and
  parseCode() insert the missing closing token when they hit EOL/EOF instead
  of consuming whatever comes next.
* Changed relative to the submitted patch (added lines only, hunk line
  counts are untouched):
    - __construct(): `$strict` is declared `bool`, matching the typed
      property it is assigned to.
    - LOC(): dropped the unreachable second `return`; the file-name
      fallback uses `?:`, which is equivalent to the original ternary.
    - parseCode(): fixed the typo "autmatically" in the assertion message.
  The post-image blob id on the `index` line predates these edits.
* NOTE(review): Assert() delegates to PHP's assert(), which can be disabled
  by configuration. With assertions off, strict mode on malformed input
  consumes the unexpected token without reporting anything. Consider
  throwing a dedicated exception in strict mode.
* NOTE(review): this copy of the patch had its whitespace collapsed. Line
  breaks, tab indentation and blank context lines were reconstructed so that
  every hunk matches its @@ counts; confirm with `git apply --check`
  against blob 11ab17e before applying.

 src/Parser.php | 95 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 15 deletions(-)

diff --git a/src/Parser.php b/src/Parser.php
index 11ab17e..893e3b9 100644
--- a/src/Parser.php
+++ b/src/Parser.php
@@ -24,12 +24,14 @@ class Parser {
 	private int $pointer;
 	private DOMDocument $document;
 	private array $references;
+	private bool $strict;
 
-	public function __construct(array $tokenStream) {
+	public function __construct(array $tokenStream, bool $strict = false) {
 		$this->tokenStream = $tokenStream;
 		$this->pointer = 0;
 		$this->document = new DOMDocument();
 		$this->references = [];
+		$this->strict = $strict;
 	}
 
 	private function current() : Token {
@@ -44,6 +46,11 @@ class Parser {
 		return $this->peek(-1);
 	}
 
+	private function insert(Token $token, int $offset = 0) : void {
+		$newElement = [$token];
+		array_splice($this->tokenStream, $this->pointer + $offset, 0, $newElement);
+	}
+
 	private function peek(int $amount = 0) : Token {
 		$amount += $this->pointer;
 		if ($amount < 0 || $amount >= count($this->tokenStream))
@@ -63,6 +70,18 @@ class Parser {
 		return stripslashes($text);
 	}
 
+	private static function LOC(array $loc) : string {
+		[$col, $row, $fileName] = $loc;
+
+		// fall back to a placeholder when the location carries no file name
+		$file = $fileName ?: "INPUT_STRING";
+		return "$file:$row:$col: ERROR: ";
+	}
+
+	private static function Assert(bool $assertion, Token $token, string $message = "") : void {
+		assert($assertion, self::LOC($token->location).$message);
+	}
+
 	private function resolveReferences(DOMElement $node) : void {
 		if (count($this->references) < 1)
 			return;
@@ -92,32 +111,66 @@ class Parser {
 	private function parseBold() : DOMNode {
 		$buffer = "";
 
-		while ($this->current()->type !== TokenType::ASTERISK && $this->current()->type !== TokenType::EOL) {
+		while ($this->current()->type !== TokenType::ASTERISK) {
+			if ($this->current()->type === TokenType::EOL || $this->current()->type === TokenType::EOF) {
+				if (!$this->strict) {
+					[$col, $row, $fileName] = $this->current()->location;
+					$this->insert(new Token(TokenType::ASTERISK, "*", [$col + 1, $row, $fileName]));
+					$this->insert(new Token(TokenType::ASTERISK, "*", [$col + 2, $row, $fileName]));
+				}
+				break;
+			}
+
 			$buffer .= $this->consume()->data;
 		}
-		$this->consume();
-		$this->consume();
+		if (!$this->strict && $this->current()->type !== TokenType::ASTERISK)
+			$this->insert(new Token(TokenType::ASTERISK, "*", $this->current()->location));
+		$asterisk = $this->consume();
+		self::Assert($asterisk->type === TokenType::ASTERISK, $asterisk, "expected asterisk, got ".$asterisk->type->name);
+
+		if (!$this->strict && $this->current()->type !== TokenType::ASTERISK)
+			$this->insert(new Token(TokenType::ASTERISK, "*", $this->current()->location));
+		$asterisk = $this->consume();
+		self::Assert($asterisk->type === TokenType::ASTERISK, $asterisk, "expected asterisk, got ".$asterisk->type->name);
 
 		return $this->document->createElement("b", $buffer);
 	}
 
 	private function parseItalic() : DOMNode {
 		$buffer = "";
 
-		while ($this->current()->type !== TokenType::ASTERISK && $this->current()->type !== TokenType::EOL) {
+		while ($this->current()->type !== TokenType::ASTERISK) {
+			if ($this->current()->type === TokenType::EOL || $this->current()->type === TokenType::EOF) {
+				if (!$this->strict) {
+					[$col, $row, $fileName] = $this->current()->location;
+					$this->insert(new Token(TokenType::ASTERISK, "*", [$col + 1, $row, $fileName]));
+				}
+				break;
+			}
 			$buffer .= $this->consume()->data;
 		}
-		$this->consume();
+		$asterisk = $this->consume();
+		self::Assert($asterisk->type === TokenType::ASTERISK, $asterisk, "expected asterisk, got ".$asterisk->type->name);
 
 		return $this->document->createElement("i", $buffer);
 	}
 
 	private function parseCode() : DOMNode {
 		$buffer = "";
-		$this->consume();
-		while ($this->current()->type !== TokenType::BACKTICK && $this->current()->type !== TokenType::EOL)
+		$backtick = $this->consume();
+		self::Assert($backtick->type === TokenType::BACKTICK, $backtick, "expected backtick, got ".$backtick->type->name);
+
+		while ($this->current()->type !== TokenType::BACKTICK) {
+			// we need to recover, if input is malformed
+			if ($this->current()->type === TokenType::EOL || $this->current()->type === TokenType::EOF) {
+				if (!$this->strict)
+					$this->insert(new Token(TokenType::BACKTICK, "`", $this->current()->location));
+				break;
+			}
 			$buffer .= $this->consume()->data;
+		}
-		$this->consume();
+		$backtick = $this->consume();
+		self::Assert($backtick->type === TokenType::BACKTICK, $backtick, "inline code expression not automatically closed (expected backtick)");
 
 		return $this->document->createElement("code", $buffer);
 	}
@@ -127,11 +180,13 @@ class Parser {
 		$consumption = 1;
 
 		$lbracket = $this->consume();
+		self::Assert($lbracket->type === TokenType::LBRACKET, $lbracket, "expected left bracket, got ".$lbracket->type->name);
 		while (!($this->current()->type === TokenType::RBRACKET || $this->current()->type === TokenType::EOL)) {
 			$text .= $this->consume()->data;
 			$consumption++;
 		}
 		$rbracket = $this->consume();
+		self::Assert($rbracket->type === TokenType::RBRACKET, $rbracket, "expected right bracket, got ".$rbracket->type->name);
 		$consumption++;
 
 		if ($this->current()->type !== TokenType::LBRACKET && $this->current()->type !== TokenType::LPAREN) {
@@ -139,6 +194,7 @@ class Parser {
 			return null;
 		}
 		$lbracketOrParen = $this->consume();
+		self::Assert($lbracketOrParen->type === TokenType::LBRACKET || $lbracketOrParen->type === TokenType::LPAREN, $lbracketOrParen, "expected left bracket or left parenthesis, got ".$lbracketOrParen->type->name);
 
 		$index = "";
 		while (!($this->current()->type === TokenType::RBRACKET || $this->current()->type === TokenType::RPAREN || $this->current()->type === TokenType::EOL))
@@ -203,11 +259,14 @@ class Parser {
 				continue;
 			} elseif ($this->current()->type === TokenType::BANG) {
 				$bang = $this->consume();
+				self::Assert($bang->type === TokenType::BANG, $bang, "expected exclamation mark, got ".$bang->type->name.", this may be a parser bug");
 				if ($this->current()->type !== TokenType::LBRACKET) {
 					$buffer .= self::StripBackslashes($this->consume()->data);
 					continue;
 				}
 				$lbracket = $this->consume();
+				self::Assert($lbracket->type === TokenType::LBRACKET, $lbracket, "expected left bracket, got ".$lbracket->type->name);
+
 				$alt = "";
 				while ($this->current()->type !== TokenType::RBRACKET && $this->current()->type !== TokenType::EOL)
 					$alt .= self::StripBackslashes($this->consume()->data);
@@ -218,7 +277,10 @@ class Parser {
 					continue;
 				}
 				$rbracket = $this->consume();
+				self::Assert($rbracket->type === TokenType::RBRACKET, $rbracket, "expected right bracket, got ".$rbracket->type->name);
 				$lparen = $this->consume();
+				self::Assert($lparen->type === TokenType::LPAREN, $lparen, "expected left parenthesis, got ".$lparen->type->name);
+
 				$src = "";
 				while ($this->current()->type !== TokenType::RPAREN && $this->current()->type !== TokenType::EOL)
 					$src .= $this->consume()->data;
@@ -228,6 +290,8 @@ class Parser {
 					continue;
 				}
 				$rparen = $this->consume();
+				self::Assert($rparen->type === TokenType::RPAREN, $rparen, "expected right parenthesis, got ".$rparen->type->name);
+
 				$elm = $this->document->createElement("img");
 				if (strlen($alt) > 0)
 					$elm->setAttribute("alt", $alt);
@@ -235,9 +299,10 @@ class Parser {
 				$clearBuffer();
 				array_push($elms, $elm);
 				continue;
+
 			} elseif ($this->current()->type === TokenType::LBRACE) {
 				$lbrace = $this->consume();
-				assert($lbrace->type === TokenType::LBRACE, "expected left brace, got ".$lbrace->type->name);
+				self::Assert($lbrace->type === TokenType::LBRACE, $lbrace, "expected left brace, got ".$lbrace->type->name);
 
 				$content = "";
 				while ($this->current()->type !== TokenType::EOF &&
@@ -246,7 +311,7 @@ class Parser {
 					$content .= $this->consume()->data;
 				}
 				$rbrace = $this->consume();
-				assert($rbrace->type === TokenType::RBRACE, "expected right brace, got ".$rbrace->type->name);
+				self::Assert($rbrace->type === TokenType::RBRACE, $rbrace, "expected right brace, got ".$rbrace->type->name);
 
 				$attributes = array_map(function($element) {
 					return trim($element);
@@ -299,12 +364,12 @@ class Parser {
 			// then we except an asterisk or a number followed by a period
 			if ($type === ListType::UNORDERED) {
 				$asterisk = $this->consume();
-				assert($asterisk->type === TokenType::ASTERISK, "expected asterisk, got ".$asterisk->type->name);
+				self::Assert($asterisk->type === TokenType::ASTERISK, $asterisk, "expected asterisk, got ".$asterisk->type->name);
 			} else {
 				$number = $this->consume();
-				assert($number->type === TokenType::NUMBER, "expected number, got ".$number->type->name);
+				self::Assert($number->type === TokenType::NUMBER, $number, "expected number, got ".$number->type->name);
 				$period = $this->consume();
-				assert($period->type === TokenType::DOT, "expected period, got ".$period->type->name);
+				self::Assert($period->type === TokenType::DOT, $period, "expected period, got ".$period->type->name);
 			}
 
 			// then we parse the node content
@@ -317,7 +382,7 @@ class Parser {
 			// if so, we want to append a sub list to the current item
 
 			// here should be a EOL
-			assert($this->current()->type === TokenType::EOL, "expected EOL, got ".$this->current()->type->name);
+			self::Assert($this->current()->type === TokenType::EOL, $this->current(), "expected EOL, got ".$this->current()->type->name);
 			$this->consume();
 
 			$nextLevel = 0;