a recursive descent markdown parser in PHP
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

548 lines
16 KiB

<?php declare(strict_types=1);
namespace parkdown;
use DOMDocument;
use DOMElement;
use DOMNode;
/**
 * The two kinds of list Parser::parseList() can build.
 */
enum ListType
{
    /** Numbered list, emitted as an "ol" element. */
    case ORDERED;

    /** Bulleted list, emitted as a "ul" element. */
    case UNORDERED;
}
class Parser {
const MAGIC_CHAR = "*";
private array $tokenStream;
private int $pointer;
private DOMDocument $document;
private array $references;
/**
 * Sets up a parser for an already tokenized document.
 *
 * Starts with the read pointer at the first token, an empty output
 * document and no known link references.
 *
 * @param array $tokenStream the tokens to parse, accessed by integer index
 */
public function __construct(array $tokenStream) {
    $this->references = [];
    $this->document = new DOMDocument();
    $this->pointer = 0;
    $this->tokenStream = $tokenStream;
}
/**
 * Returns the token at the read pointer without consuming it
 * (an EOF token once the pointer is outside the stream, see peek()).
 */
private function current() : Token {
    $token = $this->peek(0);
    return $token;
}
/**
 * Returns the token directly after the current one without consuming anything.
 */
private function next() : Token {
    $lookahead = 1;
    return $this->peek($lookahead);
}
/**
 * Returns the token directly before the current one without moving the pointer.
 */
private function last() : Token {
    $lookbehind = -1;
    return $this->peek($lookbehind);
}
/**
 * Looks at a token relative to the read pointer without consuming anything.
 *
 * @param int $amount offset from the current position; negative values look backwards
 * @return Token the token at that position, or a freshly created EOF token
 *               when the position lies outside the token stream
 */
private function peek(int $amount = 0) : Token {
    $index = $this->pointer + $amount;
    $inRange = $index >= 0 && $index < count($this->tokenStream);
    return $inRange ? $this->tokenStream[$index] : new Token(TokenType::EOF);
}
/**
 * Returns the current token and advances the read pointer by one.
 * The pointer is advanced even when the stream is already exhausted.
 */
private function consume() : Token {
    $token = $this->peek();
    ++$this->pointer;
    return $token;
}
/**
 * Removes escaping backslashes from a piece of text using stripslashes().
 *
 * @param string $text raw text, possibly containing backslash escapes
 * @return string the text with the escaping backslashes removed
 */
private static function StripBackslashes(string $text) : string {
    $unescaped = stripslashes($text);
    return $unescaped;
}
/**
 * Replaces reference placeholders in "href" attributes with their targets.
 *
 * parseLink() stores MAGIC_CHAR . index . MAGIC_CHAR as href when a reference
 * style link is parsed before its definition is known. This walks $node and
 * all descendant elements and swaps such placeholders for the matching entry
 * of $this->references. Placeholders without a matching entry stay untouched.
 *
 * @param DOMElement $node root of the subtree to process
 */
private function resolveReferences(DOMElement $node) : void {
    if (count($this->references) < 1)
        return;

    if ($node->hasAttribute("href")) {
        $href = $node->getAttribute("href");
        // a placeholder is wrapped in the magic char on BOTH sides; checking only
        // the first character would also rewrite ordinary hrefs that start with it
        $isPlaceholder = strlen($href) >= 2
            && str_starts_with($href, self::MAGIC_CHAR)
            && str_ends_with($href, self::MAGIC_CHAR);
        if ($isPlaceholder) {
            $index = substr($href, 1, -1);
            if (array_key_exists($index, $this->references))
                $node->setAttribute("href", $this->references[$index]);
        }
    }

    foreach ($node->childNodes as $child) {
        if ($child instanceof DOMElement)
            $this->resolveReferences($child);
    }
}
/**
 * Prints a print_r() dump of the token stream wrapped in a "pre" element.
 */
public function debug() : void {
    $dump = print_r($this->tokenStream, true);
    echo "<pre>" . $dump . "</pre>";
}
// PARSING
/**
 * Parses the rest of a bold span; the opening "**" has already been consumed.
 *
 * Token data is collected up to the next asterisk or the end of the line or
 * input. The closing asterisks are consumed only if they are really there.
 *
 * @return DOMNode a "b" element holding the collected text
 */
private function parseBold() : DOMNode {
    $buffer = "";
    // also stop at EOF: peek() keeps returning EOF tokens once the stream is
    // exhausted, so an unclosed span on the last line would never terminate
    while (!in_array($this->current()->type, [TokenType::ASTERISK, TokenType::EOL, TokenType::EOF], true))
        $buffer .= $this->consume()->data;

    // take at most the two closing asterisks; an unclosed span must not eat
    // the EOL and the token following it
    for ($i = 0; $i < 2 && $this->current()->type === TokenType::ASTERISK; $i++)
        $this->consume();

    // createElement() does not escape "&" in its value argument, a text node does
    $elm = $this->document->createElement("b");
    $elm->appendChild($this->document->createTextNode($buffer));
    return $elm;
}
/**
 * Parses the rest of an italic span; the opening "*" has already been consumed.
 *
 * Token data is collected up to the next asterisk or the end of the line or
 * input. The closing asterisk is consumed only if it is really there.
 *
 * @return DOMNode an "i" element holding the collected text
 */
private function parseItalic() : DOMNode {
    $buffer = "";
    // also stop at EOF: peek() keeps returning EOF tokens once the stream is
    // exhausted, so an unclosed span on the last line would never terminate
    while (!in_array($this->current()->type, [TokenType::ASTERISK, TokenType::EOL, TokenType::EOF], true))
        $buffer .= $this->consume()->data;

    // an unclosed span must leave the EOL in place for the caller
    if ($this->current()->type === TokenType::ASTERISK)
        $this->consume();

    // createElement() does not escape "&" in its value argument, a text node does
    $elm = $this->document->createElement("i");
    $elm->appendChild($this->document->createTextNode($buffer));
    return $elm;
}
/**
 * Parses an inline code span; the current token is the opening backtick.
 *
 * Token data is collected verbatim up to the closing backtick or the end of
 * the line or input. The closing backtick is consumed only if it is present.
 *
 * @return DOMNode a "code" element holding the collected text
 */
private function parseCode() : DOMNode {
    $buffer = "";
    $this->consume(); // opening backtick

    // also stop at EOF: peek() keeps returning EOF tokens once the stream is
    // exhausted, so an unclosed span on the last line would never terminate
    while (!in_array($this->current()->type, [TokenType::BACKTICK, TokenType::EOL, TokenType::EOF], true))
        $buffer .= $this->consume()->data;

    // an unclosed span must leave the EOL in place for the caller
    if ($this->current()->type === TokenType::BACKTICK)
        $this->consume();

    // createElement() does not escape "&" in its value argument, a text node does
    $elm = $this->document->createElement("code");
    $elm->appendChild($this->document->createTextNode($buffer));
    return $elm;
}
/**
 * Tries to parse a link starting at the current "[" token.
 *
 * Two forms are recognised on a single line:
 *  - "[text](target)"  inline link, the href is the target as written
 *  - "[text][index]"   reference link, the href is looked up in
 *                      $this->references; if the index is not known yet a
 *                      MAGIC_CHAR . index . MAGIC_CHAR placeholder is stored
 *                      so resolveReferences() can fill it in later
 *
 * @return DOMNode|null the "a" element, or null when the tokens do not form a
 *                      complete link; in that case the read pointer is put
 *                      back onto the opening bracket
 */
private function parseLink() : ?DOMNode {
    $start = $this->pointer;
    $this->consume(); // [

    $text = "";
    while (!in_array($this->current()->type, [TokenType::RBRACKET, TokenType::EOL, TokenType::EOF], true))
        $text .= $this->consume()->data;

    // the link text has to be closed on the same line
    if ($this->current()->type !== TokenType::RBRACKET) {
        $this->pointer = $start;
        return null;
    }
    $this->consume(); // ]

    $opener = $this->current()->type;
    if ($opener !== TokenType::LBRACKET && $opener !== TokenType::LPAREN) {
        $this->pointer = $start;
        return null;
    }
    $this->consume(); // ( or [
    $isInline = $opener === TokenType::LPAREN;
    // only the closer matching the opener ends the target
    $closer = $isInline ? TokenType::RPAREN : TokenType::RBRACKET;

    $index = "";
    while (!in_array($this->current()->type, [$closer, TokenType::EOL, TokenType::EOF], true))
        $index .= $this->consume()->data;

    // an unterminated target is not a link; never swallow the EOL
    if ($this->current()->type !== $closer) {
        $this->pointer = $start;
        return null;
    }
    $this->consume(); // ) or ]

    // createElement() does not escape "&" in its value argument, a text node does
    $elm = $this->document->createElement("a");
    $elm->appendChild($this->document->createTextNode($text));

    if ($isInline)
        $href = $index;
    elseif (array_key_exists($index, $this->references))
        $href = $this->references[$index];
    else
        $href = self::MAGIC_CHAR.$index.self::MAGIC_CHAR;
    $elm->setAttribute("href", $href);
    return $elm;
}
/**
 * Parses inline content into a flat list of DOM nodes.
 *
 * Recognises backslash escapes for "`", "*", "[" and "!", bold and italic
 * spans, inline code, links and images. Everything else is collected as
 * plain text with backslashes stripped.
 *
 * @param bool $paragraph false: stop at the first EOL; true: stop only at an
 *                        empty line (two consecutive EOLs). EOF always stops.
 *                        The terminating EOL is not consumed.
 * @return array the parsed nodes in document order
 */
private function parseText(bool $paragraph = false) : array {
    $elms = [];
    $buffer = "";
    // moves the plain text collected so far into $elms as a text node
    $clearBuffer = function() use (&$elms, &$buffer) {
        $elms[] = $this->document->createTextNode($buffer);
        $buffer = "";
    };

    while ($this->current()->type !== TokenType::EOF) {
        $type = $this->current()->type;

        // line mode ends at the first EOL, paragraph mode only at an empty line
        if ($type === TokenType::EOL && (!$paragraph || $this->next()->type === TokenType::EOL))
            break;

        // escaped markup character: drop the backslash, keep the character as text
        if ($type === TokenType::BACKSLASH && in_array($this->next()->type, [
            TokenType::BACKTICK,
            TokenType::ASTERISK,
            TokenType::LBRACKET,
            TokenType::BANG,
        ], true)) {
            $this->consume();
            $buffer .= $this->consume()->data;
            continue;
        }

        if ($type === TokenType::ASTERISK) {
            $clearBuffer();
            if ($this->next()->type === TokenType::ASTERISK) {
                $this->consume();
                $this->consume();
                $elms[] = $this->parseBold();
            } else {
                $this->consume();
                $elms[] = $this->parseItalic();
            }
            continue;
        }

        if ($type === TokenType::BACKTICK) {
            $clearBuffer();
            $elms[] = $this->parseCode();
            continue;
        }

        if ($type === TokenType::LBRACKET) {
            $link = $this->parseLink();
            if ($link !== null) {
                $clearBuffer();
                $elms[] = $link;
            } else {
                // not a link after all: keep the bracket as plain text
                $buffer .= self::StripBackslashes($this->consume()->data);
            }
            continue;
        }

        if ($type === TokenType::BANG) {
            $this->consume(); // !
            $image = $this->current()->type === TokenType::LBRACKET ? $this->parseImage() : null;
            if ($image === null) {
                // a plain exclamation mark: keep it and let the following
                // token be handled by the next iteration
                $buffer .= "!";
                continue;
            }
            $clearBuffer();
            $elms[] = $image;
            continue;
        }

        $buffer .= self::StripBackslashes($this->consume()->data);
    }

    if (strlen($buffer) > 0)
        $elms[] = $this->document->createTextNode($buffer);
    return $elms;
}

/**
 * Tries to parse the "[alt](src)" part of an image; the leading "!" has
 * already been consumed and the current token is the opening bracket.
 *
 * @return DOMNode|null the "img" element, or null when the tokens do not form
 *                      a complete image on the current line; in that case the
 *                      read pointer is put back onto the opening bracket
 */
private function parseImage() : ?DOMNode {
    // remember the position as a token index; the length of the collected
    // text says nothing about how many tokens were consumed
    $start = $this->pointer;
    $this->consume(); // [

    $alt = "";
    while (!in_array($this->current()->type, [TokenType::RBRACKET, TokenType::EOL, TokenType::EOF], true))
        $alt .= self::StripBackslashes($this->consume()->data);

    if ($this->current()->type !== TokenType::RBRACKET || $this->next()->type !== TokenType::LPAREN) {
        $this->pointer = $start;
        return null;
    }
    $this->consume(); // ]
    $this->consume(); // (

    $src = "";
    while (!in_array($this->current()->type, [TokenType::RPAREN, TokenType::EOL, TokenType::EOF], true))
        $src .= $this->consume()->data;

    if ($this->current()->type !== TokenType::RPAREN) {
        $this->pointer = $start;
        return null;
    }
    $this->consume(); // )

    $elm = $this->document->createElement("img");
    if (strlen($alt) > 0)
        $elm->setAttribute("alt", $alt);
    $elm->setAttribute("src", $src);
    return $elm;
}
/**
 * Parses consecutive list items, including nested lists, into a list element.
 *
 * An item is a line of optional leading tabs, a marker ("*" for unordered
 * lists, a number followed by "." for ordered lists) and inline content.
 * A following line that is indented by more tabs than $level starts a nested
 * list inside the item just parsed. The list ends at an empty line, at EOF,
 * at a line that does not start with a marker, or when the indentation drops
 * below $level.
 *
 * @param ListType $type  kind of list; selects the element ("ul" or "ol")
 *                        and the expected marker
 * @param int      $level number of leading tabs of the items of this list
 * @return DOMNode the "ul" or "ol" element
 */
private function parseList(ListType $type = ListType::UNORDERED, int $level = 0) : DOMNode {
    $list = $this->document->createElement($type === ListType::UNORDERED ? "ul" : "ol");
    $entry = $this->pointer;
    // counts the tabs at the read pointer without consuming them
    $indentAhead = function() : int {
        $tabs = 0;
        while ($this->peek($tabs)->type === TokenType::TAB)
            $tabs++;
        return $tabs;
    };

    // the EOL ending an item is consumed below, so an EOL seen here is an empty line
    while ($this->current()->type !== TokenType::EOF && $this->current()->type !== TokenType::EOL) {
        // first we skip the leading tabs
        $indent = $indentAhead();
        $this->pointer += $indent;

        // then we expect an asterisk or a number followed by a period
        $marker = $this->current()->type;
        if ($type === ListType::UNORDERED) {
            $isItem = $marker === TokenType::ASTERISK;
        } else {
            // a bare number is tolerated only as the very first token of this
            // call, so that the caller is guaranteed to see progress
            $isItem = $marker === TokenType::NUMBER
                && ($this->next()->type === TokenType::DOT || $this->pointer === $entry);
        }
        if (!$isItem) {
            // not a list item: hand the untouched line back to the caller
            $this->pointer -= $indent;
            break;
        }
        $this->consume(); // asterisk or number
        if ($type === ListType::ORDERED && $this->current()->type === TokenType::DOT)
            $this->consume();

        // then we parse the item content
        $elm = $this->document->createElement("li");
        foreach ($this->parseText() as $node)
            $elm->appendChild($node);

        // parseText() stops at EOL or EOF; only an EOL is there to be consumed
        if ($this->current()->type === TokenType::EOL)
            $this->consume();

        // if the next line is indented deeper, it starts a sub list of this item
        $nextLevel = $indentAhead();
        if ($nextLevel > $level) {
            $sub = $this->parseList($type, $nextLevel);
            if ($sub->hasChildNodes())
                $elm->appendChild($sub);
            // the sub list consumed lines, so look at the indentation again
            $nextLevel = $indentAhead();
        }

        $list->appendChild($elm);

        // if the next level is lower than the current one, this (sub) list is done
        if ($nextLevel < $level)
            break;
    }
    return $list;
}
/**
 * Wraps the given nodes in a "p" element and appends it to the document.
 *
 * Text nodes consisting only of whitespace are dropped; if nothing remains,
 * no paragraph is created.
 *
 * @param array $elms DOM nodes created by $this->document
 */
private function buildParagraph(array $elms) : void {
    $kept = array_filter(
        $elms,
        static fn ($node) => !($node->nodeName === "#text" && trim($node->textContent) === "")
    );
    if (count($kept) < 1)
        return;

    $paragraph = $this->document->createElement("p");
    foreach ($kept as $node)
        $paragraph->appendChild($node);
    $this->document->appendChild($paragraph);
}
/**
 * Parses a heading line and appends the heading element to the document.
 *
 * The number of leading HASH tokens selects the level. The rest of the line
 * is parsed as inline content; the terminating EOL is left for the caller.
 */
private function parseHeading() : void {
    $level = 0;
    while ($this->current()->type === TokenType::HASH) {
        $level++;
        $this->consume();
    }
    // HTML only defines h1 through h6, so deeper levels are clamped
    $level = max(1, min($level, 6));
    $elm = $this->document->createElement("h".$level);
    foreach ($this->parseText() as $node)
        $elm->appendChild($node);
    $this->document->appendChild($elm);
}
/**
 * Parses a fenced code block and appends it to the document.
 *
 * The block is opened and closed by three backticks. Text following the
 * opening fence on the same line is stored in the "data-lang" attribute of
 * the "pre" container; the content goes verbatim into a nested "code"
 * element. A line that starts with fewer than three backticks is handled as
 * an ordinary paragraph line instead.
 */
private function parseCodeBlock() : void {
    if (!($this->next()->type === TokenType::BACKTICK && $this->peek(2)->type === TokenType::BACKTICK)) {
        $this->buildParagraph($this->parseText());
        return;
    }
    $this->consume();
    $this->consume();
    $this->consume(); // ```

    // the rest of the fence line names the language
    $lang = $this->parseText();
    $lang = count($lang) > 0 ? trim($lang[0]->data) : null;
    $container = $this->document->createElement("pre");
    if ($lang !== null && $lang !== "")
        $container->setAttribute("data-lang", $lang);

    // the line break ending the fence line is not part of the code
    if ($this->current()->type === TokenType::EOL)
        $this->consume();

    $buffer = "";
    while (!($this->current()->type === TokenType::BACKTICK &&
        $this->next()->type === TokenType::BACKTICK &&
        $this->peek(2)->type === TokenType::BACKTICK) && $this->current()->type !== TokenType::EOF) {
        // code is kept verbatim; backslashes are content here, not escapes
        $buffer .= $this->consume()->data;
    }
    if ($this->current()->type !== TokenType::EOF) {
        $this->consume();
        $this->consume();
        $this->consume(); // closing ```
    }

    // a text node escapes the content itself
    $elm = $this->document->createElement("code");
    $elm->appendChild($this->document->createTextNode($buffer));
    $container->appendChild($elm);
    $this->document->appendChild($container);

    // drop the line break after the closing fence, but nothing else
    if ($this->current()->type === TokenType::EOL)
        $this->consume();
}
/**
 * Parses a reference definition of the form "[index]: target".
 *
 * The index must be a single NUMBER or TEXT token. The remainder of the line
 * is trimmed and stored in $this->references under that index. A line that
 * does not match this shape is handled as an ordinary paragraph line.
 */
private function parseReference() : void {
    $labelType = $this->next()->type;
    $hasLabel = $labelType === TokenType::NUMBER || $labelType === TokenType::TEXT;
    $isDefinition = $hasLabel
        && $this->peek(2)->type === TokenType::RBRACKET
        && $this->peek(3)->type === TokenType::COLON;

    if (!$isDefinition) {
        $this->buildParagraph($this->parseText());
        return;
    }

    $this->consume(); // [
    $index = $this->consume()->data;
    $this->consume(); // ]
    $this->consume(); // :

    // everything up to the end of the line is the target
    $target = "";
    for (;;) {
        $type = $this->current()->type;
        if ($type === TokenType::EOL || $type === TokenType::EOF)
            break;
        $target .= $this->consume()->data;
    }
    $this->consume(); // line terminator

    $this->references[$index] = trim($target);
}
/**
 * Parses one table line into a "tr" element.
 *
 * Cells are separated by PIPE tokens; empty cells (such as the one after a
 * trailing pipe) are skipped. The terminating EOL is left for the caller.
 *
 * @param string     $nodeName element name used for the cells ("th" or "td")
 * @param array|null $props    text alignment per column; when given, each cell
 *                             gets a matching "text-align" style, falling back
 *                             to "left" for columns without an entry
 * @return DOMNode the "tr" element
 */
private function parseTableHead(string $nodeName = "th", ?array $props = null) : DOMNode {
    $elm = $this->document->createElement("tr");
    $i = 0;
    while ($this->current()->type !== TokenType::EOL && $this->current()->type !== TokenType::EOF) {
        $this->consume(); // pipe in front of the cell
        $buffer = "";
        // stop at EOF as well, otherwise a table on the last line without a
        // trailing line break would never terminate
        while (!in_array($this->current()->type, [TokenType::PIPE, TokenType::EOL, TokenType::EOF], true))
            $buffer .= $this->consume()->data;
        if ($buffer === "")
            continue;

        // createElement() does not escape "&" in its value argument, a text node does
        $col = $this->document->createElement($nodeName);
        $col->appendChild($this->document->createTextNode($buffer));
        if ($props)
            $col->setAttribute("style", "text-align: ".($props[$i] ?? "left"));
        $elm->appendChild($col);
        $i++;
    }
    return $elm;
}
/**
 * Parses a table body line: same as parseTableHead(), but with "td" cells.
 *
 * @param array $props text alignment per column
 * @return DOMNode the "tr" element
 */
private function parseTableRow(array $props) : DOMNode {
    $row = $this->parseTableHead(nodeName: "td", props: $props);
    return $row;
}
/**
 * Parses the alignment line following a table head.
 *
 * Each non-empty cell yields one entry: "center" when it starts and ends with
 * a colon, "right" when it only ends with one, "left" otherwise. The
 * terminating EOL is left for the caller.
 *
 * @return array alignment keywords, one per column
 */
private function parseTableAlignment() : array {
    $props = [];
    // the line break between head and alignment line
    if ($this->current()->type === TokenType::EOL)
        $this->consume();

    while ($this->current()->type !== TokenType::EOL && $this->current()->type !== TokenType::EOF) {
        $this->consume(); // pipe in front of the cell
        $buffer = "";
        // stop at EOF as well, otherwise an alignment line at the very end of
        // the input would never terminate
        while (!in_array($this->current()->type, [TokenType::PIPE, TokenType::EOL, TokenType::EOF], true))
            $buffer .= $this->consume()->data;
        if ($buffer === "")
            continue;

        // padding around the dashes must not hide the colons
        $spec = trim($buffer);
        $left = str_starts_with($spec, ":");
        $right = str_ends_with($spec, ":");
        if ($left && $right)
            $props[] = "center";
        elseif ($right)
            $props[] = "right";
        else
            $props[] = "left";
    }
    return $props;
}
/**
 * Parses a table (head line, alignment line, body lines) and appends the
 * "table" element to the document.
 *
 * Body lines are consumed for as long as a line break is directly followed
 * by a PIPE token.
 */
private function parseTable() : void {
    $elm = $this->document->createElement("table");
    $head = $this->parseTableHead();
    $props = $this->parseTableAlignment();

    $i = 0;
    foreach ($head->childNodes as $col) {
        // the alignment line may have fewer cells than the head
        if ($col instanceof DOMElement)
            $col->setAttribute("style", "text-align: ".($props[$i] ?? "left"));
        $i++;
    }
    $elm->appendChild($head);

    while ($this->current()->type === TokenType::EOL && $this->next()->type === TokenType::PIPE) {
        $this->consume(); // EOL
        $elm->appendChild($this->parseTableRow($props));
    }
    $this->document->appendChild($elm);
}
/**
 * Parses a block quote and appends the "blockquote" element to the document.
 *
 * The quote consists of consecutive lines that start with a GT token. The
 * raw text of each line is added as a text node followed by a "br" element.
 * If the token after the first GT does not start with a space, the line is
 * handled as an ordinary paragraph line instead.
 */
private function parseBlockQuote() : void {
    if (!str_starts_with((string) $this->next()->data, " ")) {
        $this->buildParagraph($this->parseText());
        return;
    }

    $elm = $this->document->createElement("blockquote");
    // only lines that start with ">" belong to the quote
    while ($this->current()->type === TokenType::GT) {
        $this->consume(); // >

        $line = "";
        while (!in_array($this->current()->type, [TokenType::EOL, TokenType::EOF], true))
            $line .= $this->consume()->data;

        // emit the line even when the input ends without a final line break
        $elm->appendChild($this->document->createTextNode($line));
        $elm->appendChild($this->document->createElement("br"));

        if ($this->current()->type === TokenType::EOL)
            $this->consume();
    }
    $this->document->appendChild($elm);
}
/**
 * Parses a horizontal rule and appends an "hr" element to the document.
 *
 * A rule is a line of exactly three MINUS tokens, ended by a line break or
 * by the end of the input. Any other line starting with a MINUS token is
 * handled as an ordinary paragraph line.
 */
private function parseHorizontalRule() : void {
    $terminator = $this->peek(3)->type;
    $isRule = $this->next()->type === TokenType::MINUS
        && $this->peek(2)->type === TokenType::MINUS
        // the last line of a document need not end with a line break
        && ($terminator === TokenType::EOL || $terminator === TokenType::EOF);

    if (!$isRule) {
        $this->buildParagraph($this->parseText());
        return;
    }

    $this->consume(); // -
    $this->consume(); // -
    $this->consume(); // -
    if ($this->current()->type === TokenType::EOL)
        $this->consume(); // EOL

    $elm = $this->document->createElement("hr");
    $this->document->appendChild($elm);
}
/**
 * Parses the whole token stream into the output document.
 *
 * The first token of each block decides which block parser runs; anything
 * not recognised becomes a paragraph. After all blocks are parsed, link
 * references that were used before their definition are resolved.
 *
 * @return DOMDocument the document holding the generated elements
 */
public function parse() : DOMDocument {
    while ($this->current()->type !== TokenType::EOF) {
        $before = $this->pointer;
        switch($this->current()->type) {
            case TokenType::ASTERISK:
                // "**" opens bold text, not a list item
                if ($this->next()->type === TokenType::ASTERISK) {
                    $this->buildParagraph($this->parseText(true));
                    break;
                }
                $list = $this->parseList();
                if ($list->hasChildNodes())
                    $this->document->appendChild($list);
                break;
            case TokenType::HASH:
                $this->parseHeading();
                break;
            case TokenType::NUMBER:
                // only "<number>." starts an ordered list; a line that merely
                // begins with a number is ordinary text
                if ($this->next()->type !== TokenType::DOT) {
                    $this->buildParagraph($this->parseText(true));
                    break;
                }
                $list = $this->parseList(ListType::ORDERED);
                if ($list->hasChildNodes())
                    $this->document->appendChild($list);
                break;
            case TokenType::BACKTICK:
                $this->parseCodeBlock();
                break;
            case TokenType::EOL:
                $this->consume();
                break;
            case TokenType::LBRACKET:
                $this->parseReference();
                break;
            case TokenType::PIPE:
                $this->parseTable();
                break;
            case TokenType::GT:
                $this->parseBlockQuote();
                break;
            case TokenType::MINUS:
                $this->parseHorizontalRule();
                break;
            case TokenType::TEXT:
            default:
                $this->buildParagraph($this->parseText(true));
                break;
        }
        // safety net: a block parser that consumed nothing would otherwise
        // make this loop spin forever on the same token
        if ($this->pointer === $before)
            $this->consume();
    }

    foreach($this->document->childNodes as $node) {
        if ($node instanceof DOMElement)
            $this->resolveReferences($node);
    }
    return $this->document;
}
}