Skip to content

Commit

Permalink
tool: scrub: implement interpret_noscript option, enabled by default
Browse files Browse the repository at this point in the history
This option inlines the contents of `noscript` tags when `scrub` is running
with `-scripts`.
  • Loading branch information
oxij committed Nov 19, 2024
1 parent 84a7c8a commit 54cbc93
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 0 deletions.
1 change: 1 addition & 0 deletions tool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ Compute output values by evaluating expressions `EXPR`s on a given reqres stored
- `(+|-)unknown` controls if the data with unknown content types should be passed to the output unchanged or censored out (respectively); the default is `+unknown`, which keeps data of unknown content `MIME` types as-is;
- `(+|-)(styles|scripts|iepragmas|iframes|prefetches|tracking|navigations)` control which things should be kept in or censored out from `HTML`, `CSS`, and `JavaScript`; i.e. these options control whether `CSS` stylesheets (both separate files and `HTML` tags and attributes), `JavaScript` (both separate files and `HTML` tags and attributes), `HTML` Internet Explorer pragmas, `<iframe>` `HTML` tags, `HTML` content prefetch `link` tags, other tracking `HTML` tags and attributes (like `a ping` attributes), and automatic navigations (`Refresh` `HTTP` headers and `<meta http-equiv>` `HTML` tags) should be respectively kept in or censored out from the input; the default is `+styles,-scripts,-iepragmas,+iframes,-prefetches,-tracking,-navigations` which ensures the result does not contain `JavaScript` and will not produce any prefetch, tracking requests, or re-navigations elsewhere, when loaded in a web browser; `-iepragmas` is the default because censoring for contents of such pragmas is not supported yet;
- `(+|-)all_dyns` is equivalent to enabling or disabling all of the options listed in the previous item simultaneously;
- `(+|-)interpret_noscript` controls whether the contents of `noscript` tags should be inlined when `-scripts` is set; the default is `+interpret_noscript`;
- `(+|-)verbose` controls whether tag censoring controlled by the above options is to be reported in the output (as comments) or stuff should be wiped from existence without evidence instead; the default is `-verbose`;
- `(+|-)whitespace` controls whether `HTML` and `CSS` renderers should keep the original whitespace as-is or collapse it away (respectively); the default is `-whitespace`, which produces somewhat minimized outputs (because it saves a lot of space);
- `(+|-)optional_tags` controls whether `HTML` renderer should put optional `HTML` tags into the output or skip them (respectively); the default is `+optional_tags` (because many tools fail to parse minimized `HTML` properly);
Expand Down
16 changes: 16 additions & 0 deletions tool/hoardy_web/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def debug_walker(walker : _t.Iterator[HTML5Node]) -> _t.Iterator[HTML5Node]:
htmlns_ins = (htmlns, "ins")
htmlns_link = (htmlns, "link")
htmlns_meta= (htmlns, "meta")
htmlns_noscript = (htmlns, "noscript")
htmlns_object = (htmlns, "object")
htmlns_q = (htmlns, "q")
htmlns_script = (htmlns, "script")
Expand Down Expand Up @@ -255,6 +256,8 @@ class ScrubbingOptions:
prefetches : bool = _dc.field(default=False)
tracking : bool = _dc.field(default=False)
navigations : bool = _dc.field(default=False)
interpret_noscript : bool = _dc.field(default=True)

verbose : bool = _dc.field(default=False)
whitespace : bool = _dc.field(default=False)
optional_tags : bool = _dc.field(default=True)
Expand Down Expand Up @@ -377,6 +380,7 @@ def make_scrubbers(opts : ScrubbingOptions) -> Scrubbers:
not_iepragmas = not opts.iepragmas
not_iframes = not opts.iframes
yes_navigations = opts.navigations
yes_interpret_noscript = opts.interpret_noscript
yes_verbose = opts.verbose
not_verbose = not yes_verbose
not_whitespace = not opts.whitespace
Expand Down Expand Up @@ -630,6 +634,11 @@ def emit_censored_other(what : str) -> _t.Iterator[HTML5Node]:
nn = (token["namespace"], token["name"])
attrs : HTML5NodeAttrValues = token["data"]

if not_scripts and yes_interpret_noscript and nn == htmlns_noscript:
# ignore this
yield from emit_censored_token(typ, token)
continue

# Handle HTTP header inlining.
# Put them after `<base>`, `<title>`, and charset-controlling `<meta>` headers.
if inline_headers_undone and \
Expand Down Expand Up @@ -818,6 +827,13 @@ def emit_censored_other(what : str) -> _t.Iterator[HTML5Node]:
continue
base_url_unset = False

nn = (token["namespace"], token["name"])

if not_scripts and yes_interpret_noscript and nn == htmlns_noscript:
# ignore this
yield from emit_censored_token(typ, token)
continue

stack_len = len(stack)
if censor:
censor_lvl -= 1
Expand Down
4 changes: 4 additions & 0 deletions tool/hoardy_web/wrr.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ def envfunc(rrexpr : _t.Any, v : _t.Any) -> _t.Any:
- `(+|-)unknown` controls if the data with unknown content types should passed to the output unchanged or censored out (respectively); the default is `+unknown`, which keeps data of unknown content `MIME` types as-is;
- `(+|-)(styles|scripts|iepragmas|iframes|prefetches|tracking|navigations)` control which things should be kept in or censored out from `HTML`, `CSS`, and `JavaScript`; i.e. these options control whether `CSS` stylesheets (both separate files and `HTML` tags and attributes), `JavaScript` (both separate files and `HTML` tags and attributes), `HTML` Internet Explorer pragmas, `<iframe>` `HTML` tags, `HTML` content prefetch `link` tags, other tracking `HTML` tags and attributes (like `a ping` attributes), and automatic navigations (`Refresh` `HTTP` headers and `<meta http-equiv>` `HTML` tags) should be respectively kept in or censored out from the input; the default is `+styles,-scripts,-iepragmas,+iframes,-prefetches,-tracking,-navigations` which ensures the result does not contain `JavaScript` and will not produce any prefetch, tracking requests, or re-navigations elsewhere, when loaded in a web browser; `-iepragmas` is the default because censoring for contents of such pragmas is not supported yet;
- `(+|-)all_dyns` is equivalent to enabling or disabling all of the options listed in the previous item simultaneously;
- `(+|-)interpret_noscript` controls whether the contents of `noscript` tags should be inlined when `-scripts` is set, the default is `+interpret_noscript`;
- `(+|-)verbose` controls whether tag censoring controlled by the above options is to be reported in the output (as comments) or stuff should be wiped from existence without evidence instead; the default is `-verbose`;
- `(+|-)whitespace` controls whether `HTML` and `CSS` renderers should keep the original whitespace as-is or collapse it away (respectively); the default is `-whitespace`, which produces somewhat minimized outputs (because it saves a lot of space);
- `(+|-)optional_tags` controls whether `HTML` renderer should put optional `HTML` tags into the output or skip them (respectively); the default is `+optional_tags` (because many tools fail to parse minimized `HTML` properly);
Expand Down Expand Up @@ -901,6 +902,7 @@ def test_ReqresExpr_scrub_css() -> None:
<style>
{test_css_in1}
</style>
<noscript><link rel=stylesheet href="noscript.css"></noscript>
<script>x = 1;</script>
<script src="https://asset.example.com/inc1-asset.js"></script>
<script src="inc1-base.js"></script>
Expand Down Expand Up @@ -944,6 +946,7 @@ def test_ReqresExpr_scrub_html() -> None:
}
</style>
<!-- hoardy-web censored out StartTag noscript from here --><!-- hoardy-web censored out EmptyTag link stylesheet from here --><!-- hoardy-web censored out EndTag noscript from here -->
<!-- hoardy-web censored out AssembledTag script from here -->
<!-- hoardy-web censored out AssembledTag script from here -->
<!-- hoardy-web censored out AssembledTag script from here -->
Expand Down Expand Up @@ -999,6 +1002,7 @@ def test_ReqresExpr_scrub_html() -> None:
background: url(https://base.example.com/background.jpg); *zoom: 1;
}
</style>
<noscript><link rel=stylesheet href="https://base.example.com/noscript.css"></noscript>
<script>
x = 1;
</script>
Expand Down

0 comments on commit 54cbc93

Please sign in to comment.