From abaa2201856749d677379591e503cf830401da78 Mon Sep 17 00:00:00 2001 From: Giuseppe Criscione <18699708+giuscris@users.noreply.github.com> Date: Sat, 8 Jun 2024 23:26:22 +0200 Subject: [PATCH 1/2] Add Sanitizer classes --- composer.json | 3 +- composer.lock | 69 +++- formwork/src/Sanitizer/DomSanitizer.php | 213 ++++++++++ formwork/src/Sanitizer/HtmlSanitizer.php | 26 ++ .../Sanitizer/Parser/DomParserInterface.php | 13 + formwork/src/Sanitizer/Parser/Html5Parser.php | 27 ++ .../src/Sanitizer/Parser/PhpDomParser.php | 36 ++ .../src/Sanitizer/Reference/HtmlReference.php | 363 ++++++++++++++++++ .../src/Sanitizer/Reference/SvgReference.php | 306 +++++++++++++++ .../src/Sanitizer/SanitizeElementsMethod.php | 9 + formwork/src/Sanitizer/SvgSanitizer.php | 78 ++++ 11 files changed, 1141 insertions(+), 2 deletions(-) create mode 100644 formwork/src/Sanitizer/DomSanitizer.php create mode 100644 formwork/src/Sanitizer/HtmlSanitizer.php create mode 100644 formwork/src/Sanitizer/Parser/DomParserInterface.php create mode 100644 formwork/src/Sanitizer/Parser/Html5Parser.php create mode 100644 formwork/src/Sanitizer/Parser/PhpDomParser.php create mode 100644 formwork/src/Sanitizer/Reference/HtmlReference.php create mode 100644 formwork/src/Sanitizer/Reference/SvgReference.php create mode 100644 formwork/src/Sanitizer/SanitizeElementsMethod.php create mode 100644 formwork/src/Sanitizer/SvgSanitizer.php diff --git a/composer.json b/composer.json index 00a75990..06f58dc1 100644 --- a/composer.json +++ b/composer.json @@ -25,7 +25,8 @@ "symfony/polyfill-ctype": "^1.23", "symfony/yaml": "^7.0.3", "league/commonmark": "^2.4", - "jaybizzle/crawler-detect": "^1.2" + "jaybizzle/crawler-detect": "^1.2", + "masterminds/html5": "^2.9" }, "require-dev": { "friendsofphp/php-cs-fixer": "^3.0", diff --git a/composer.lock b/composer.lock index 4bfa3fe3..8b12f224 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "55305a014811d06459a046c0f5edf7a1", + "content-hash": "47adfbe835636af680d4bcf21bbfe617", "packages": [ { "name": "dflydev/dot-access-data", @@ -321,6 +321,73 @@ ], "time": "2022-12-11T20:36:23+00:00" }, + { + "name": "masterminds/html5", + "version": "2.9.0", + "source": { + "type": "git", + "url": "https://github.com/Masterminds/html5-php.git", + "reference": "f5ac2c0b0a2eefca70b2ce32a5809992227e75a6" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Masterminds/html5-php/zipball/f5ac2c0b0a2eefca70b2ce32a5809992227e75a6", + "reference": "f5ac2c0b0a2eefca70b2ce32a5809992227e75a6", + "shasum": "" + }, + "require": { + "ext-dom": "*", + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7 || ^8 || ^9" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.7-dev" + } + }, + "autoload": { + "psr-4": { + "Masterminds\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Matt Butcher", + "email": "technosophos@gmail.com" + }, + { + "name": "Matt Farina", + "email": "matt@mattfarina.com" + }, + { + "name": "Asmir Mustafic", + "email": "goetas@gmail.com" + } + ], + "description": "An HTML5 parser and serializer.", + "homepage": "http://masterminds.github.io/html5-php", + "keywords": [ + "HTML5", + "dom", + "html", + "parser", + "querypath", + "serializer", + "xml" + ], + "support": { + "issues": "https://github.com/Masterminds/html5-php/issues", + "source": "https://github.com/Masterminds/html5-php/tree/2.9.0" + }, + "time": "2024-03-31T07:05:07+00:00" + }, { "name": "nette/schema", "version": "v1.3.0", diff --git a/formwork/src/Sanitizer/DomSanitizer.php b/formwork/src/Sanitizer/DomSanitizer.php new file mode 100644 index 00000000..6b42fe9e --- /dev/null +++ b/formwork/src/Sanitizer/DomSanitizer.php @@ -0,0 +1,213 @@ + + */ + protected array $allowedElements = []; + + /** + * @var list + */ + protected array $allowedAttributes = []; + + /** + * @var list + */ + protected array $uriAttributes = []; + + /** + * @var list + */ + protected array $allowedUriSchemes = ['http', 'https']; + + /** + * @var array> + */ + protected array $elementSanitizers = []; + + protected SanitizeElementsMethod $sanitizeElementsMethod = SanitizeElementsMethod::Remove; + + protected bool $allowExternalUris = true; + + public function __construct(protected DomParserInterface $domParser = new Html5Parser()) + { + } + + public function sanitizeElementsMethod(SanitizeElementsMethod $sanitizeElementsMethod): static + { + $this->sanitizeElementsMethod = $sanitizeElementsMethod; + return $this; + } + + public function disallowExternalUris(): static + { + $this->allowExternalUris = false; + return $this; + } + + /** + * @param list $elements + */ + public function allowedElements(array $elements): static + { + $this->allowedElements = $elements; + return $this; + } + + /** + * @param list $attributes + */ + public function allowedAttributes(array $attributes): static + { + $this->allowedAttributes = $attributes; + return $this; + } + + /** + * @param list $schemes + */ + public function allowedUriSchemes(array $schemes): static + { + $this->allowedUriSchemes = $schemes; + return $this; + } + + public function sanitize(string $data): string + { + if (!$this->isValidData($data)) { + throw new InvalidArgumentException('Invalid data to sanitize'); + } + + $dom = $this->domParser->parse($data); + + if ($dom === null || !$this->isValidDocument($dom)) { + throw new UnexpectedValueException('Invalid parsed DOM document'); + } + + $this->sanitizeDocumentFragment($dom); + + return $this->domParser->serialize($dom); + } + + protected function isValidData(string $data): bool + { + return $data === '' || preg_match('//u', $data); + } + + protected function isValidDocument(?DOMDocumentFragment $domDocumentFragment): bool + { + return $domDocumentFragment !== null; + } + + protected function sanitizeDocumentFragment(DOMDocumentFragment $domDocumentFragment): void + { + $this->sanitizeNodes($domDocumentFragment->childNodes); + } + + /** + * @param DOMNodeList $domNodeList + */ + protected function sanitizeNodes(DOMNodeList $domNodeList): void + { + for ($i = $domNodeList->length; $i >= 0; $i--) { + $node = $domNodeList->item($i); + + if (!($node instanceof DOMElement)) { + continue; + } + + $this->sanitizeNode($node); + } + } + + protected function sanitizeNode(DOMElement $domElement): void + { + if (!in_array($domElement->nodeName, $this->allowedElements, true)) { + if ($this->sanitizeElementsMethod === SanitizeElementsMethod::Escape) { + if ($domElement->parentNode === null) { + throw new UnexpectedValueException('Missing parent node'); + } + $domElement->parentNode->replaceChild(new DOMText($this->domParser->serialize($domElement)), $domElement); + } else { + $domElement->remove(); + } + return; + } + + if (isset($this->elementSanitizers[$domElement->nodeName])) { + $sanitizer = $this->elementSanitizers[$domElement->nodeName]; + (new $sanitizer())->sanitizeNode($domElement); + return; + } + + if ($domElement->hasAttributes()) { + $this->sanitizeNodeAttributes($domElement); + } + + if ($domElement->hasChildNodes()) { + $this->sanitizeNodes($domElement->childNodes); + } + } + + protected function sanitizeNodeAttributes(DOMElement $domElement): void + { + $attributes = $domElement->attributes; + + for ($i = $attributes->length; $i >= 0; $i--) { + $attribute = $attributes->item($i); + + if (!($attribute instanceof DOMAttr)) { + continue; + } + + if (!in_array($attribute->nodeName, $this->allowedAttributes, true)) { + $domElement->removeAttribute($attribute->nodeName); + } + + if (in_array($attribute->nodeName, $this->uriAttributes, true)) { + $uri = $this->sanitizeUri((string) $attribute->nodeValue); + + $scheme = Uri::scheme($uri); + + if ($scheme === null && !Str::startsWith($uri, '//')) { + continue; + } + + if (!$this->allowExternalUris || !in_array($scheme, $this->allowedUriSchemes, true)) { + $domElement->removeAttribute($attribute->nodeName); + } + } + } + } + + protected function sanitizeUri(string $uri): string + { + $uri = urldecode($uri); + + $uri = preg_replace('/&(?:#\d+|#[xX][a-fA-F0-9]+|AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml)(?!;)/', '$0;', $uri) + ?? throw new RuntimeException('Cannot replace malformed HTML entities'); + + $uri = html_entity_decode($uri); + + return filter_var($uri, FILTER_SANITIZE_URL) + ?: throw new RuntimeException('Cannot sanitize URI'); + } +} diff --git a/formwork/src/Sanitizer/HtmlSanitizer.php b/formwork/src/Sanitizer/HtmlSanitizer.php new file mode 100644 index 00000000..5f89da44 --- /dev/null +++ b/formwork/src/Sanitizer/HtmlSanitizer.php @@ -0,0 +1,26 @@ + + */ + protected array $allowedUriSchemes = ['http', 'https', 'mailto']; + + /** + * @var array> + */ + protected array $elementSanitizers = [ + 'svg' => SvgSanitizer::class, + ]; +} diff --git a/formwork/src/Sanitizer/Parser/DomParserInterface.php b/formwork/src/Sanitizer/Parser/DomParserInterface.php new file mode 100644 index 00000000..6dbd1f24 --- /dev/null +++ b/formwork/src/Sanitizer/Parser/DomParserInterface.php @@ -0,0 +1,13 @@ +dom = new HTML5(); + } + + public function parse(string $string): ?DOMDocumentFragment + { + return $this->dom->loadHTMLFragment($string); + } + + public function serialize(?DOMNode $domNode = null): string + { + return $this->dom->saveHTML($domNode); + } +} diff --git a/formwork/src/Sanitizer/Parser/PhpDomParser.php b/formwork/src/Sanitizer/Parser/PhpDomParser.php new file mode 100644 index 00000000..c1c9d2da --- /dev/null +++ b/formwork/src/Sanitizer/Parser/PhpDomParser.php @@ -0,0 +1,36 @@ +dom = new DOMDocument(); + } + + public function parse(string $string): ?DOMDocumentFragment + { + $domDocumentFragment = $this->dom->createDocumentFragment(); + + if (!@$domDocumentFragment->appendXML($string)) { + return null; + } + + return $domDocumentFragment; + } + + public function serialize(?DOMNode $domNode = null): string + { + if ($domNode instanceof DOMDocument) { + $domNode = $domNode->firstElementChild; + } + return (string) $this->dom->saveXML($domNode); + } +} diff --git a/formwork/src/Sanitizer/Reference/HtmlReference.php b/formwork/src/Sanitizer/Reference/HtmlReference.php new file mode 100644 index 00000000..c6813c9e --- /dev/null +++ b/formwork/src/Sanitizer/Reference/HtmlReference.php @@ -0,0 +1,363 @@ +childElementCount > 1) { + return false; + } + + return $domDocumentFragment->firstElementChild?->nodeName === 'svg'; + } + + protected function sanitizeDocumentFragment(DOMDocumentFragment $domDocumentFragment): void + { + parent::sanitizeDocumentFragment($domDocumentFragment); + $this->addExplicitSvgNamespace($domDocumentFragment); + } + + protected function addExplicitSvgNamespace(DOMDocumentFragment $domDocumentFragment): void + { + $svg = $domDocumentFragment->firstElementChild; + + if (!($svg instanceof DOMElement)) { + throw new UnexpectedValueException('Invalid SVG document'); + } + + if ($svg->namespaceURI === SvgReference::NAMESPACE_URI) { + return; + } + + $document = $domDocumentFragment->ownerDocument; + + if (!($document instanceof DOMDocument)) { + throw new UnexpectedValueException('Unexpected missing SVG DOM document'); + } + + $root = $document->createElementNS(SvgReference::NAMESPACE_URI, 'svg'); + + $attributes = $svg->attributes; + + for ($i = $attributes->length; $i >= 0; $i--) { + $attribute = $attributes->item($i); + if ($attribute instanceof DOMAttr) { + $root->setAttributeNode($attribute); + } + } + + $root->append(...$svg->childNodes); + + $svg->replaceWith($root); + } +} From 6ddbbd3fb934efee92ae835c6bb4e8e474b44a36 Mon Sep 17 00:00:00 2001 From: Giuseppe Criscione <18699708+giuscris@users.noreply.github.com> Date: Sat, 15 Jun 2024 14:27:33 +0200 Subject: [PATCH 2/2] Sanitize markdown output --- formwork/src/Parsers/Markdown.php | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/formwork/src/Parsers/Markdown.php b/formwork/src/Parsers/Markdown.php index 100d34cf..a60938c0 100644 --- a/formwork/src/Parsers/Markdown.php +++ b/formwork/src/Parsers/Markdown.php @@ -4,9 +4,9 @@ use Formwork\App; use Formwork\Parsers\Extensions\CommonMark\LinkBaseExtension; +use Formwork\Sanitizer\HtmlSanitizer; use League\CommonMark\Environment\Environment; use League\CommonMark\Extension\CommonMark\CommonMarkCoreExtension; -use League\CommonMark\Extension\DisallowedRawHtml\DisallowedRawHtmlExtension; use League\CommonMark\Extension\Table\TableExtension; use League\CommonMark\MarkdownConverter; @@ -22,19 +22,21 @@ public static function parse(string $input, array $options = []): string $safeMode = App::instance()->config()->get('system.content.safeMode', true); $environment = new Environment([ - 'html_input' => $safeMode ? 'escape' : 'allow', - 'allow_unsafe_links' => false, - 'max_nesting_level' => 10, - 'formwork' => $options, + 'html_input' => $safeMode ? 'escape' : 'allow', + 'max_nesting_level' => 10, + 'formwork' => $options, ]); $environment->addExtension(new CommonMarkCoreExtension()); $environment->addExtension(new TableExtension()); $environment->addExtension(new LinkBaseExtension()); - $environment->addExtension(new DisallowedRawHtmlExtension()); $markdownConverter = new MarkdownConverter($environment); - return $markdownConverter->convert($input); + $html = $markdownConverter->convert($input); + + $htmlSanitizer = new HtmlSanitizer(); + + return $htmlSanitizer->sanitize($html); } }