-
Notifications
You must be signed in to change notification settings - Fork 199
Examples
By default, only URIs that begin with http
or https
are allowed. This means that markup containing mailto links will have their URIs stripped when sanitized. To avoid this, add mailto
as an allowed scheme:
// Create a sanitizer whose allowed-scheme set additionally contains "mailto".
var sanitizer = new HtmlSanitizer
{
    AllowedSchemes = { "mailto" }
};
By default, only URIs that begin with http
or https
are allowed. This means that markup containing data-URI links will have their URIs stripped when sanitized. To avoid this, add data
as an allowed scheme:
// Create a sanitizer whose allowed-scheme set additionally contains "data".
var sanitizer = new HtmlSanitizer
{
    AllowedSchemes = { "data" }
};
The default render mode for sanitized markup is HTML, which means void tags will be normalized to their non-self-closing version. If you require XML-style void tags, use an XhtmlMarkupFormatter.
You can specify a formatter for a specific call to Sanitize()
:
// The formatter is passed as the second argument, so it is supplied for this call explicitly.
const string content = @"<p>This image is self-closing: <img src=""some-image.png"" /></p>";
var sanitizer = new HtmlSanitizer();
sanitizer.Sanitize(content, AngleSharp.Xhtml.XhtmlMarkupFormatter.Instance);
// Result with the default formatter:
// -> <p>This image is self-closing: <img src="some-image.png"></p>
// Result with XhtmlMarkupFormatter:
// -> <p>This image is self-closing: <img src="some-image.png" /></p>
or you can set the formatter used for all calls to Sanitize()
:
// Assign the static default first, then sanitize without passing a formatter.
HtmlSanitizer.DefaultOutputFormatter = AngleSharp.Xhtml.XhtmlMarkupFormatter.Instance;
const string content = @"<p>This image is self-closing: <img src=""some-image.png"" /></p>";
var sanitizer = new HtmlSanitizer();
sanitizer.Sanitize(content);
// Result with the default formatter:
// -> <p>This image is self-closing: <img src="some-image.png"></p>
// Result with XhtmlMarkupFormatter:
// -> <p>This image is self-closing: <img src="some-image.png" /></p>
Sometimes you have to deal with arbitrary input that's not HTML but rather some form of tag soup. In these cases you can pre-process your input to encode all opening angle brackets which are not part of actual HTML tags. See also the discussion at #91 and #126.
/// <summary>
/// Matches the start of anything shaped like an opening or closing tag:
/// "&lt;" or "&lt;/", followed by letters and an optional digit 1-6.
/// Group 1 captures the candidate tag name. Matching is case-insensitive.
/// </summary>
private static readonly Regex HtmlRegex = new Regex(@"</?([a-z]+[1-6]?)", RegexOptions.IgnoreCase);

/// <summary>
/// Element names that are treated as real HTML tags. Lookups ignore case.
/// </summary>
private static readonly HashSet<string> HtmlTags = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "a", "abbr", "acronym", "address", "applet", "area", "article", "aside", "audio",
    "b", "base", "bdi", "bdo", "big", "blockquote", "body", "br", "button",
    "canvas", "caption", "center", "cite", "code", "col", "colgroup", "command",
    "datalist", "dd", "del", "details", "dfn", "dir", "div", "dl", "dt",
    "em", "embed",
    "fieldset", "figcaption", "figure", "font", "footer", "form", "frame",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
    "i", "iframe", "img", "input", "ins", "isindex",
    "kbd", "keygen",
    "label", "legend", "li", "link",
    "map", "mark", "menu", "meta", "meter",
    "nav", "noscript",
    "object", "ol", "optgroup", "option", "output",
    "p", "param", "pre", "progress",
    "q",
    "rp", "rt", "ruby",
    "s", "samp", "script", "section", "select", "small", "source", "span", "strike", "strong", "style", "sub", "summary", "sup",
    "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track", "tt",
    "u", "ul",
    "var", "video",
    "wbr"
};
/// <summary>
/// Pre-processes tag-soup input and then sanitizes it.
/// Every "&lt;" that starts something shaped like a tag whose name is not in
/// <c>HtmlTags</c> is encoded as <c>&amp;lt;</c>, so it reaches the sanitizer as text.
/// After sanitizing, each "\n" in the result is replaced with "&lt;br&gt;".
/// </summary>
/// <param name="text">The raw input to sanitize.</param>
/// <returns>The sanitized markup with newlines converted to line breaks.</returns>
private string Sanitize(string text)
{
    var encoded = HtmlRegex.Replace(text, match =>
    {
        if (HtmlTags.Contains(match.Groups[1].Value))
        {
            // Known HTML element: leave the match untouched.
            return match.Value;
        }

        // Unknown name: encode the leading '<'. The match always starts with '<',
        // so Substring(1) is the rest of the match ("/name" or "name").
        return "&lt;" + match.Value.Substring(1);
    });

    return Sanitizer.Sanitize(encoded).Replace("\n", "<br>");
}
Similar code can be used to add, remove, or modify other attributes, e.g. add rel="nofollow"
etc. See also the discussion at #96.
// Set target="_blank" on every anchor element.
// A pattern match is used instead of "(e.Node as IHtmlAnchorElement)?.Target = ...",
// because assigning through "?." is rejected by compilers older than C# 14.
sanitizer.PostProcessNode += (sender, e) =>
{
    if (e.Node is IHtmlAnchorElement anchor)
    {
        anchor.Target = "_blank";
    }
};
Other rels which may be of interest include external
, sponsored
and ugc
(user-generated content).
Keeps all the CSS classes prefixed with h-
or p-
which is useful for preserving CSS classes used for microformats such as h-card.
// Cancel the removal of (i.e. keep) CSS classes starting with "h-" or "p-".
// Ordinal comparison is used because class names are identifiers, not linguistic text.
sanitizer.RemovingCssClass += (sender, e) =>
    e.Cancel = e.CssClass.StartsWith("h-", StringComparison.Ordinal)
        || e.CssClass.StartsWith("p-", StringComparison.Ordinal);
Keeps all the data attributes prefixed with bs-
which is useful for preserving the data attributes used by Bootstrap.
// Cancel the removal of (i.e. keep) attributes whose name starts with "data-bs-".
// Ordinal comparison is used because attribute names are identifiers, not linguistic text.
sanitizer.RemovingAttribute += (sender, e) =>
    e.Cancel = e.Attribute.Name.StartsWith("data-bs-", StringComparison.Ordinal);
Add "external" and "nofollow" rels to external links.
// Mark anchors that point to a host other than www.example.com as external and nofollow.
sanitizer.PostProcessNode += (sender, e) =>
{
    if (!(e.Node is IHtmlAnchorElement anchor))
    {
        return;
    }

    // An anchor without a host name (e.g. mailto: or a fragment-only link) does not
    // point to another site, so it is left alone.
    if (string.IsNullOrEmpty(anchor.HostName))
    {
        return;
    }

    if (!string.Equals(anchor.HostName, "www.example.com", StringComparison.OrdinalIgnoreCase))
    {
        anchor.RelationList.Add("external");
        anchor.RelationList.Add("nofollow");
    }
};
Remove query parameters used for referral tracking in affiliate programs. DuckDuckGo have a list of tracking parameters.
// Strip the "ref" and "tag" query parameters from anchors whose host contains "amazon.".
sanitizer.PostProcessNode += (sender, e) =>
{
    if (e.Node is IHtmlAnchorElement anchor
        && anchor.HostName != null
        && anchor.HostName.Contains("amazon."))
    {
        var query = new UrlSearchParams(anchor.Search);
        query.Delete("ref");
        query.Delete("tag");

        // Search is a string property, so the parameters are serialized explicitly.
        // NOTE(review): confirm UrlSearchParams accepts the leading '?' that Search may carry.
        anchor.Search = query.ToString();
    }
};
Remove query parameters used by Google Analytics.
// Strip the utm_* query parameters from every anchor.
sanitizer.PostProcessNode += (sender, e) =>
{
    if (e.Node is IHtmlAnchorElement anchor)
    {
        var query = new UrlSearchParams(anchor.Search);

        foreach (var name in new[] { "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content" })
        {
            query.Delete(name);
        }

        // Search is a string property, so the parameters are serialized explicitly.
        // NOTE(review): confirm UrlSearchParams accepts the leading '?' that Search may carry.
        anchor.Search = query.ToString();
    }
};
Prevents the use of the img
, audio
and video
elements to load resources that are not on your CDN.
// Keep a video element that is about to be removed only when its source is on the trusted CDN.
// Setting Cancel = true cancels the removal; any other video is left to be removed.
// Only video elements are handled here; img and audio need analogous checks.
sanitizer.RemovingTag += (sender, e) =>
{
    if (e.Tag is IHtmlVideoElement video
        && !string.IsNullOrEmpty(video.Source)
        && video.Source.StartsWith("https://your-trusted-cdn.example/", StringComparison.Ordinal))
    {
        e.Cancel = true;
    }
};
Keeps HTML comments that fit certain criteria. This example keeps the <!-- pagebreak -->
comment.
// Keep the <!-- pagebreak --> comment by cancelling its removal.
sanitizer.RemovingComment += (sender, e) =>
{
    // The data of "<!-- pagebreak -->" includes the surrounding spaces, so trim before comparing.
    if (e.Comment.Data.Trim() == "pagebreak")
    {
        e.Cancel = true;
    }
};
Replace links with alternative services such as Twitter with Nitter or YouTube with Piped.
// Point twitter.com links at nitter.net and youtube.com links at piped.kavin.rocks.
sanitizer.PostProcessNode += (sender, e) =>
{
    if (!(e.Node is IHtmlAnchorElement anchor) || anchor.HostName == null)
    {
        return;
    }

    // String.Replace(string, string) is an ordinal, case-sensitive replacement.
    if (anchor.HostName.Contains("twitter.com"))
    {
        anchor.Href = anchor.Href.Replace("twitter.com", "nitter.net");
    }
    else if (anchor.HostName.Contains("youtube.com"))
    {
        anchor.Href = anchor.Href.Replace("youtube.com", "piped.kavin.rocks");
    }
};