Skip to content

Commit

Permalink
Merge pull request #527 from getformwork/feature/sanitizer
Browse files Browse the repository at this point in the history
HTML/SVG Sanitizer
  • Loading branch information
giuscris authored Jun 23, 2024
2 parents 221aad5 + 6ddbbd3 commit 9f8fb4c
Show file tree
Hide file tree
Showing 12 changed files with 1,150 additions and 9 deletions.
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
"symfony/polyfill-ctype": "^1.23",
"symfony/yaml": "^7.0.3",
"league/commonmark": "^2.4",
"jaybizzle/crawler-detect": "^1.2"
"jaybizzle/crawler-detect": "^1.2",
"masterminds/html5": "^2.9"
},
"require-dev": {
"friendsofphp/php-cs-fixer": "^3.0",
Expand Down
69 changes: 68 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 9 additions & 7 deletions formwork/src/Parsers/Markdown.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

use Formwork\App;
use Formwork\Parsers\Extensions\CommonMark\LinkBaseExtension;
use Formwork\Sanitizer\HtmlSanitizer;
use League\CommonMark\Environment\Environment;
use League\CommonMark\Extension\CommonMark\CommonMarkCoreExtension;
use League\CommonMark\Extension\DisallowedRawHtml\DisallowedRawHtmlExtension;
use League\CommonMark\Extension\Table\TableExtension;
use League\CommonMark\MarkdownConverter;

Expand All @@ -22,19 +22,21 @@ public static function parse(string $input, array $options = []): string
$safeMode = App::instance()->config()->get('system.content.safeMode', true);

$environment = new Environment([
'html_input' => $safeMode ? 'escape' : 'allow',
'allow_unsafe_links' => false,
'max_nesting_level' => 10,
'formwork' => $options,
'html_input' => $safeMode ? 'escape' : 'allow',
'max_nesting_level' => 10,
'formwork' => $options,
]);

$environment->addExtension(new CommonMarkCoreExtension());
$environment->addExtension(new TableExtension());
$environment->addExtension(new LinkBaseExtension());
$environment->addExtension(new DisallowedRawHtmlExtension());

$markdownConverter = new MarkdownConverter($environment);

return $markdownConverter->convert($input);
$html = $markdownConverter->convert($input);

$htmlSanitizer = new HtmlSanitizer();

return $htmlSanitizer->sanitize($html);
}
}
213 changes: 213 additions & 0 deletions formwork/src/Sanitizer/DomSanitizer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
<?php

namespace Formwork\Sanitizer;

use DOMAttr;
use DOMDocumentFragment;
use DOMElement;
use DOMNode;
use DOMNodeList;
use DOMText;
use Formwork\Sanitizer\Parser\DomParserInterface;
use Formwork\Sanitizer\Parser\Html5Parser;
use Formwork\Utils\Str;
use Formwork\Utils\Uri;
use InvalidArgumentException;
use RuntimeException;
use UnexpectedValueException;

/**
 * Allow-list based sanitizer working on a parsed DOM fragment.
 *
 * Elements and attributes that are not explicitly allowed are dropped (or,
 * for elements, optionally escaped), and attributes listed as URI attributes
 * are additionally checked against the allowed URI schemes.
 */
class DomSanitizer
{
    /**
     * Names of the elements that are kept in the output
     *
     * @var list<string>
     */
    protected array $allowedElements = [];

    /**
     * Names of the attributes that are kept on allowed elements
     *
     * @var list<string>
     */
    protected array $allowedAttributes = [];

    /**
     * Names of the attributes whose value is validated as a URI
     *
     * @var list<string>
     */
    protected array $uriAttributes = [];

    /**
     * URI schemes accepted in URI attributes
     *
     * @var list<string>
     */
    protected array $allowedUriSchemes = ['http', 'https'];

    /**
     * Sanitizer classes that take over the handling of specific elements, keyed by element name
     *
     * @var array<string, class-string<DomSanitizer>>
     */
    protected array $elementSanitizers = [];

    /**
     * What to do with elements that are not allowed
     */
    protected SanitizeElementsMethod $sanitizeElementsMethod = SanitizeElementsMethod::Remove;

    /**
     * Whether URIs having a scheme (or starting with `//`) may be kept at all
     */
    protected bool $allowExternalUris = true;

    public function __construct(protected DomParserInterface $domParser = new Html5Parser())
    {
    }

    /**
     * Set how disallowed elements are handled
     */
    public function sanitizeElementsMethod(SanitizeElementsMethod $sanitizeElementsMethod): static
    {
        $this->sanitizeElementsMethod = $sanitizeElementsMethod;
        return $this;
    }

    /**
     * Remove every URI attribute whose value has a scheme or starts with `//`
     */
    public function disallowExternalUris(): static
    {
        $this->allowExternalUris = false;
        return $this;
    }

    /**
     * Replace the list of allowed element names
     *
     * @param list<string> $elements
     */
    public function allowedElements(array $elements): static
    {
        $this->allowedElements = $elements;
        return $this;
    }

    /**
     * Replace the list of allowed attribute names
     *
     * @param list<string> $attributes
     */
    public function allowedAttributes(array $attributes): static
    {
        $this->allowedAttributes = $attributes;
        return $this;
    }

    /**
     * Replace the list of allowed URI schemes
     *
     * @param list<string> $schemes
     */
    public function allowedUriSchemes(array $schemes): static
    {
        $this->allowedUriSchemes = $schemes;
        return $this;
    }

    /**
     * Parse the given markup, sanitize the resulting DOM and serialize it back
     *
     * @throws InvalidArgumentException If the data is not valid UTF-8
     * @throws UnexpectedValueException If the parser does not return a valid document
     * @throws RuntimeException         If a URI attribute cannot be normalized
     */
    public function sanitize(string $data): string
    {
        if (!$this->isValidData($data)) {
            throw new InvalidArgumentException('Invalid data to sanitize');
        }

        $dom = $this->domParser->parse($data);

        if ($dom === null || !$this->isValidDocument($dom)) {
            throw new UnexpectedValueException('Invalid parsed DOM document');
        }

        $this->sanitizeDocumentFragment($dom);

        return $this->domParser->serialize($dom);
    }

    /**
     * Return whether the data is empty or a valid UTF-8 string
     */
    protected function isValidData(string $data): bool
    {
        // An empty pattern with the `u` modifier matches any valid UTF-8
        // subject and fails (returns false) on malformed byte sequences
        return $data === '' || preg_match('//u', $data) === 1;
    }

    /**
     * Return whether the parsed document can be sanitized
     */
    protected function isValidDocument(?DOMDocumentFragment $domDocumentFragment): bool
    {
        return $domDocumentFragment !== null;
    }

    /**
     * Sanitize all the nodes of a document fragment
     */
    protected function sanitizeDocumentFragment(DOMDocumentFragment $domDocumentFragment): void
    {
        $this->sanitizeNodes($domDocumentFragment->childNodes);
    }

    /**
     * Sanitize every element of a node list, non-element nodes are left untouched
     *
     * @param DOMNodeList<DOMNode> $domNodeList
     */
    protected function sanitizeNodes(DOMNodeList $domNodeList): void
    {
        // Walk backwards from the last valid index: the list is affected by
        // removals/replacements, and going in reverse keeps the indices of the
        // nodes still to be visited stable
        for ($i = $domNodeList->length - 1; $i >= 0; $i--) {
            $node = $domNodeList->item($i);

            if (!($node instanceof DOMElement)) {
                continue;
            }

            $this->sanitizeNode($node);
        }
    }

    /**
     * Sanitize a single element together with its attributes and descendants
     *
     * @throws UnexpectedValueException If an element to escape has no parent node
     */
    protected function sanitizeNode(DOMElement $domElement): void
    {
        if (!in_array($domElement->nodeName, $this->allowedElements, true)) {
            if ($this->sanitizeElementsMethod === SanitizeElementsMethod::Escape) {
                if ($domElement->parentNode === null) {
                    throw new UnexpectedValueException('Missing parent node');
                }
                // Replace the element with a text node holding its markup, so
                // that it is serialized as text instead of being interpreted
                $domElement->parentNode->replaceChild(new DOMText($this->domParser->serialize($domElement)), $domElement);
            } else {
                $domElement->remove();
            }
            return;
        }

        if (isset($this->elementSanitizers[$domElement->nodeName])) {
            // The whole subtree is handed over to the dedicated sanitizer.
            // NOTE: the delegate is built with its own defaults, settings
            // changed on this instance through the setters are not forwarded
            $sanitizer = $this->elementSanitizers[$domElement->nodeName];
            (new $sanitizer())->sanitizeNode($domElement);
            return;
        }

        if ($domElement->hasAttributes()) {
            $this->sanitizeNodeAttributes($domElement);
        }

        if ($domElement->hasChildNodes()) {
            $this->sanitizeNodes($domElement->childNodes);
        }
    }

    /**
     * Remove disallowed attributes and URI attributes with a disallowed target
     *
     * @throws RuntimeException If the value of a kept URI attribute cannot be normalized
     */
    protected function sanitizeNodeAttributes(DOMElement $domElement): void
    {
        $attributes = $domElement->attributes;

        // Reverse iteration from the last valid index, attributes may be
        // removed while looping
        for ($i = $attributes->length - 1; $i >= 0; $i--) {
            $attribute = $attributes->item($i);

            if (!($attribute instanceof DOMAttr)) {
                continue;
            }

            if (!in_array($attribute->nodeName, $this->allowedAttributes, true)) {
                $domElement->removeAttribute($attribute->nodeName);
                // The attribute is gone, validating its value would only risk
                // throwing for something that is not part of the output anymore
                continue;
            }

            if (!in_array($attribute->nodeName, $this->uriAttributes, true)) {
                continue;
            }

            $uri = $this->sanitizeUri((string) $attribute->nodeValue);

            $scheme = Uri::scheme($uri);

            // No scheme and not protocol-relative: the URI is kept as is
            if ($scheme === null && !Str::startsWith($uri, '//')) {
                continue;
            }

            // Protocol-relative URIs get here with a null scheme, which never
            // matches an allowed scheme, so they are always removed
            if (!$this->allowExternalUris || !in_array($scheme, $this->allowedUriSchemes, true)) {
                $domElement->removeAttribute($attribute->nodeName);
            }
        }
    }

    /**
     * Normalize a URI before inspecting its scheme: percent-encoding and HTML
     * entities are decoded and characters not allowed in URLs are stripped
     *
     * @throws RuntimeException If the entities cannot be fixed or the URI cannot be filtered
     */
    protected function sanitizeUri(string $uri): string
    {
        $uri = urldecode($uri);

        // Append the missing `;` to numeric references and to the listed named
        // entities, otherwise `html_entity_decode()` would leave them encoded
        $uri = preg_replace('/&(?:#\d+|#[xX][a-fA-F0-9]+|AElig|AMP|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|GT|Iacute|Icirc|Igrave|Iuml|LT|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|QUOT|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|amp|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|gt|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|lt|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|quot|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml)(?!;)/', '$0;', $uri)
            ?? throw new RuntimeException('Cannot replace malformed HTML entities');

        $uri = html_entity_decode($uri);

        $sanitized = filter_var($uri, FILTER_SANITIZE_URL);

        // Only an actual filter failure is an error: an empty string or "0"
        // are valid results (e.g. `href=""`) and must not be treated as such
        if (!is_string($sanitized)) {
            throw new RuntimeException('Cannot sanitize URI');
        }

        return $sanitized;
    }
}
26 changes: 26 additions & 0 deletions formwork/src/Sanitizer/HtmlSanitizer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?php

namespace Formwork\Sanitizer;

use Formwork\Sanitizer\Reference\HtmlReference;

/**
 * DomSanitizer preset for HTML markup: the element, attribute and URI
 * attribute lists are taken from HtmlReference.
 */
class HtmlSanitizer extends DomSanitizer
{
    /**
     * Elements handled by a dedicated sanitizer class instead of this one
     *
     * @var array<string, class-string<DomSanitizer>>
     */
    protected array $elementSanitizers = [
        'svg' => SvgSanitizer::class,
    ];

    /**
     * Schemes accepted in URI attributes
     *
     * @var list<string>
     */
    protected array $allowedUriSchemes = ['http', 'https', 'mailto'];

    /**
     * Attributes whose value is checked as a URI
     *
     * @var list<string>
     */
    protected array $uriAttributes = HtmlReference::URI_ATTRIBUTES;

    /**
     * Attributes kept on allowed elements
     *
     * @var list<string>
     */
    protected array $allowedAttributes = HtmlReference::ALLOWED_ATTRIBUTES;

    /**
     * Elements kept in the sanitized output
     *
     * @var list<string>
     */
    protected array $allowedElements = HtmlReference::ALLOWED_ELEMENTS;
}
13 changes: 13 additions & 0 deletions formwork/src/Sanitizer/Parser/DomParserInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?php

namespace Formwork\Sanitizer\Parser;

use DOMDocumentFragment;
use DOMNode;

/**
 * Contract for the parsers used by the sanitizers to turn markup into a DOM
 * fragment and to turn DOM nodes back into markup.
 */
interface DomParserInterface
{
    /**
     * Parse a markup string into a document fragment
     *
     * @return ?DOMDocumentFragment The parsed fragment, or null when no fragment could be produced
     */
    public function parse(string $string): ?DOMDocumentFragment;

    /**
     * Serialize a DOM node to a markup string
     *
     * NOTE(review): what is serialized when `$domNode` is null is left to the
     * implementation, confirm against the concrete parsers
     */
    public function serialize(?DOMNode $domNode = null): string;
}
Loading

0 comments on commit 9f8fb4c

Please sign in to comment.