From 46e0e552e59b93177a55d299d9075c7a07d85f7f Mon Sep 17 00:00:00 2001 From: Valian Date: Tue, 5 Dec 2023 17:05:08 +0100 Subject: [PATCH 1/3] improved performance by x100 for bigger pages by caching queries --- lib/readability/article_builder.ex | 20 ++++--- lib/readability/candidate/scoring.ex | 27 ++++----- lib/readability/candidate_finder.ex | 12 +++- lib/readability/helper.ex | 19 ------ lib/readability/queries.ex | 87 ++++++++++++++++++++++++++++ lib/readability/sanitizer.ex | 17 +++--- test/readability/helper_test.exs | 4 -- test/readability/queries_test.exs | 29 ++++++++++ 8 files changed, 158 insertions(+), 57 deletions(-) create mode 100644 lib/readability/queries.ex create mode 100644 test/readability/queries_test.exs diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex index 56d2817..948f76d 100644 --- a/lib/readability/article_builder.ex +++ b/lib/readability/article_builder.ex @@ -8,6 +8,7 @@ defmodule Readability.ArticleBuilder do alias Readability.Candidate.Scoring alias Readability.CandidateFinder alias Readability.Helper + alias Readability.Queries alias Readability.Sanitizer @type html_tree :: tuple | list @@ -35,19 +36,23 @@ defmodule Readability.ArticleBuilder do html_tree = Cleaner.transform_misused_div_to_p(html_tree) - candidates = CandidateFinder.find(html_tree, opts) + candidates = + html_tree + |> Queries.cache_stats_in_attributes() + |> CandidateFinder.find(opts) + article = find_article(candidates, html_tree) html_tree = Sanitizer.sanitize(article, candidates, opts) - if Helper.text_length(html_tree) < opts[:retry_length] do + if Queries.text_length(html_tree) < opts[:retry_length] do if opts = next_try_opts(opts) do build(origin_tree, opts) else - html_tree + Queries.clear_stats_from_attributes(html_tree) end else - html_tree + Queries.clear_stats_from_attributes(html_tree) end end @@ -75,7 +80,7 @@ defmodule Readability.ArticleBuilder do find_article_trees(best_candidate, candidates) else fallback_candidate = - case html_tree |> Floki.find("body") do + case html_tree |> Queries.find_tag("body") do [tree | _] -> %Candidate{html_tree: tree} _ -> %Candidate{html_tree: {}} end @@ -99,11 +104,10 @@ defmodule Readability.ArticleBuilder do defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do link_density = Scoring.calc_link_density(html_tree) - inner_text = html_tree |> Floki.text() - inner_length = inner_text |> String.length() + inner_length = Queries.text_length(html_tree) (inner_length > 80 && link_density < 0.25) || - (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/) + (inner_length < 80 && link_density == 0 && Floki.text(html_tree) =~ ~r/\.( |$)/) end defp append?(_), do: false diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex index 8e4e06a..ad55ab9 100644 --- a/lib/readability/candidate/scoring.ex +++ b/lib/readability/candidate/scoring.ex @@ -2,7 +2,7 @@ defmodule Readability.Candidate.Scoring do @moduledoc """ Score HTML tree. 
""" - alias Readability.Helper + alias Readability.Queries @element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5} @@ -27,9 +27,8 @@ defmodule Readability.Candidate.Scoring do defp calc_content_score(html_tree) do score = 1 - inner_text = html_tree |> Floki.text() - split_score = inner_text |> String.split(",") |> length - length_score = [String.length(inner_text) / 100, 3] |> Enum.min() + split_score = Queries.count_character(html_tree, ",") + 1 + length_score = min(Queries.text_length(html_tree) / 100, 3) score + split_score + length_score end @@ -58,27 +57,23 @@ defmodule Readability.Candidate.Scoring do end def calc_link_density(html_tree) do - link_length = - html_tree - |> Floki.find("a") - |> Floki.text() - |> String.length() - - text_length = - html_tree - |> Floki.text() - |> String.length() + text_length = Queries.text_length(html_tree) if text_length == 0 do 0 else + link_length = + html_tree + |> Queries.find_tag("a") + |> Queries.text_length() + link_length / text_length end end defp calc_children_content_score({_, _, children_tree}) do children_tree - |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1))) + |> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1))) |> calc_content_score end @@ -88,7 +83,7 @@ defmodule Readability.Candidate.Scoring do |> Enum.filter(&is_tuple(&1)) |> Enum.map(&elem(&1, 2)) |> List.flatten() - |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1))) + |> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1))) |> calc_content_score score / 2 diff --git a/lib/readability/candidate_finder.ex b/lib/readability/candidate_finder.ex index 3099892..8d10653 100644 --- a/lib/readability/candidate_finder.ex +++ b/lib/readability/candidate_finder.ex @@ -7,7 +7,7 @@ defmodule Readability.CandidateFinder do alias Readability.Candidate alias Readability.Candidate.Scoring - alias Readability.Helper + alias Readability.Queries @type html_tree :: tuple | list @type options :: list @@ -53,6 +53,14 @@ defmodule Readability.CandidateFinder do |> Enum.max_by(fn candidate -> candidate.score end) end + @doc """ + Check `html_tree` can be candidate or not. + """ + @spec candidate_tag?(html_tree) :: boolean + def candidate_tag?({tag, _, _} = html_tree) do + (tag == "p" || tag == "td") && Queries.text_length(html_tree) >= 25 + end + defp candidate?(_, depth \\ 0) defp candidate?(_, depth) when depth > 2, do: false defp candidate?([h | t], depth), do: candidate?(h, depth) || candidate?(t, depth) @@ -60,7 +68,7 @@ defmodule Readability.CandidateFinder do defp candidate?(text, _) when is_binary(text), do: false defp candidate?({_, _, inner_tree} = html_tree, depth) do - if Helper.candidate_tag?(html_tree) do + if candidate_tag?(html_tree) do true else candidate?(inner_tree, depth + 1) diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index 315de37..1f594c3 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -80,25 +80,6 @@ defmodule Readability.Helper do end end - @doc """ - Count only text length. - """ - @spec text_length(html_tree) :: number - def text_length(html_tree) do - html_tree |> Floki.text() |> String.trim() |> String.length() - end - - @doc """ - Check `html_tree` can be candidate or not. 
- """ - @spec candidate_tag?(html_tree) :: boolean - def candidate_tag?({tag, _, _} = html_tree) do - Enum.any?(["p", "td"], fn candidate_tag -> - tag == candidate_tag && - text_length(html_tree) >= Readability.default_options()[:min_text_length] - end) - end - @doc """ Normalizes and parses to HTML tree (tuple or list)) from binary HTML. """ diff --git a/lib/readability/queries.ex b/lib/readability/queries.ex new file mode 100644 index 0000000..45d4eee --- /dev/null +++ b/lib/readability/queries.ex @@ -0,0 +1,87 @@ +defmodule Readability.Queries do + @moduledoc """ + Highly-optimized utilities for quick answers about HTML tree + """ + + @type html_tree :: tuple | list + @type options :: list + + def cache_stats_in_attributes(html_tree) do + Floki.traverse_and_update(html_tree, fn + {tag, attrs, nodes} -> + attrs = + Keyword.put_new_lazy(attrs, :text_length, fn -> text_length({tag, attrs, nodes}) end) + + attrs = + Keyword.put_new_lazy(attrs, :commas, fn -> count_character({tag, attrs, nodes}, ",") end) + + {tag, attrs, nodes} + + other -> + other + end) + end + + def clear_stats_from_attributes(html_tree) do + Floki.traverse_and_update(html_tree, fn + {tag, attrs, nodes} -> + {tag, Keyword.drop(attrs, [:text_length, :commas]), nodes} + + other -> + other + end) + end + + @doc """ + Count only text length. + """ + @spec text_length(html_tree) :: number + def text_length(html_tree) + def text_length(text) when is_binary(text), do: String.length(text) + def text_length(nodes) when is_list(nodes), do: Enum.reduce(nodes, 0, &(&2 + text_length(&1))) + def text_length({:comment, _}), do: 0 + def text_length({"br", _, _}), do: 1 + + def text_length({_tag, attrs, nodes}) do + # we precompute that value + Keyword.get_lazy(attrs, :text_length, fn -> text_length(nodes) end) + end + + @doc """ + Finds number of occurences of a given character, much faster than converting to text + """ + @spec count_character(html_tree, binary) :: number + def count_character(<>, <> = char) do + 1 + count_character(rest, char) + end + + def count_character(<<_::utf8, rest::binary>>, char) do + count_character(rest, char) + end + + def count_character(nodes, char) when is_list(nodes) do + Enum.reduce(nodes, 0, &(&2 + count_character(&1, char))) + end + + def count_character({_tag, attrs, nodes}, ",") do + Keyword.get_lazy(attrs, :commas, fn -> count_character(nodes, ",") end) + end + + def count_character({_tag, _attrs, nodes}, char), do: count_character(nodes, char) + def count_character(_node, _char), do: 0 + + @doc """ + Finds given tags in HTML tree, much faster than using generic selector + """ + @spec find_tag(html_tree, binary) :: list + def find_tag(html_tree, tag), do: html_tree |> find_tag_internal(tag) |> List.flatten() + + def find_tag_internal(nodes, tag) when is_list(nodes), + do: Enum.map(nodes, &find_tag_internal(&1, tag)) + + def find_tag_internal({tag, _, children} = node, tag), + do: [node | find_tag_internal(children, tag)] + + def find_tag_internal({_, _, children}, tag), do: find_tag_internal(children, tag) + def find_tag_internal(_, _), do: [] +end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index b68fda6..26f134b 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -9,6 +9,7 @@ defmodule Readability.Sanitizer do alias Readability.Candidate alias Readability.Candidate.Scoring alias Readability.Helper + alias Readability.Queries @type html_tree :: tuple | list @@ -45,23 +46,23 @@ defmodule Readability.Sanitizer do weight + same_tree.score < 0 
-> true - length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 -> + Queries.count_character(tree, ",") < 10 -> # If there are not very many commas, and the number of # non-paragraph elements is more than paragraphs or other # ominous signs, remove the element. - p_len = tree |> Floki.find("p") |> length - img_len = tree |> Floki.find("img") |> length - li_len = tree |> Floki.find("li") |> length - input_len = tree |> Floki.find("input") |> length + p_len = tree |> Queries.find_tag("p") |> length + img_len = tree |> Queries.find_tag("img") |> length + li_len = tree |> Queries.find_tag("li") |> length + input_len = tree |> Queries.find_tag("input") |> length embed_len = tree - |> Floki.find("embed") + |> Queries.find_tag("embed") |> Enum.reject(&(&1 =~ Readability.regexes(:video))) |> length link_density = Scoring.calc_link_density(tree) - conent_len = Helper.text_length(tree) + conent_len = Queries.text_length(tree) # too many image # more
<li>s than <p>

    s @@ -93,6 +94,6 @@ defmodule Readability.Sanitizer do end defp clean_empty_p?({tag, _, _} = html_tree) do - tag == "p" && Helper.text_length(html_tree) == 0 + tag == "p" && Queries.text_length(html_tree) == 0 end end diff --git a/test/readability/helper_test.exs b/test/readability/helper_test.exs index abd919a..0602310 100644 --- a/test/readability/helper_test.exs +++ b/test/readability/helper_test.exs @@ -43,10 +43,6 @@ defmodule Readability.HelperTest do assert result == expected end - test "inner text length" do - assert Helper.text_length(@html_tree) == 5 - end - test "strips out special case tags" do html = "

    Hello

    " diff --git a/test/readability/queries_test.exs b/test/readability/queries_test.exs new file mode 100644 index 0000000..16749aa --- /dev/null +++ b/test/readability/queries_test.exs @@ -0,0 +1,29 @@ +defmodule Readability.QueriesTest do + use ExUnit.Case, async: true + + alias Readability.Queries + + @sample """ + + +

+  <p>
+    <font>a</font>
+  </p>
+  <p>
+    abc
+  </p>
+  <div>
+    b
+    <img alt="alt" />
+  </div>

    + + + """ + + @html_tree Floki.parse_fragment!(@sample) + + test "inner text length" do + assert Queries.text_length(@html_tree) == 5 + end +end From 88b29edbc30b4ede2756640c30a020a66b87ca3e Mon Sep 17 00:00:00 2001 From: Valian Date: Wed, 13 Dec 2023 14:03:07 +0100 Subject: [PATCH 2/3] additional tests to increase test coverage --- mix.exs | 6 ++++-- mix.lock | 1 + test/readability/queries_test.exs | 28 +++++++++++++++++++++++++--- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/mix.exs b/mix.exs index 905bedc..f9ed1a8 100644 --- a/mix.exs +++ b/mix.exs @@ -16,7 +16,8 @@ defmodule Readability.Mixfile do coveralls: :test, "coveralls.detail": :test, "coveralls.post": :test, - "coveralls.html": :test + "coveralls.html": :test, + "test.watch": :test ], package: package(), deps: deps(), @@ -36,7 +37,8 @@ defmodule Readability.Mixfile do {:credo, "~> 1.6", only: [:dev, :test]}, {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false}, {:mock, "~> 0.3", only: :test}, - {:excoveralls, "~> 0.18", only: :test} + {:excoveralls, "~> 0.18", only: :test}, + {:mix_test_watch, "~> 1.0", only: [:dev, :test]} ] end diff --git a/mix.lock b/mix.lock index 179d220..c5fc5f6 100644 --- a/mix.lock +++ b/mix.lock @@ -19,6 +19,7 @@ "meck": {:hex, :meck, "0.9.2", "85ccbab053f1db86c7ca240e9fc718170ee5bda03810a6292b5306bf31bae5f5", [:rebar3], [], "hexpm", "81344f561357dc40a8344afa53767c32669153355b626ea9fcbc8da6b3045826"}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, + "mix_test_watch": {:hex, :mix_test_watch, "1.1.1", "eee6fc570d77ad6851c7bc08de420a47fd1e449ef5ccfa6a77ef68b72e7e51ad", [:mix], [{:file_system, "~> 0.2.1 or ~> 0.3", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "f82262b54dee533467021723892e15c3267349849f1f737526523ecba4e6baae"}, "mock": {:hex, :mock, "0.3.8", "7046a306b71db2488ef54395eeb74df0a7f335a7caca4a3d3875d1fc81c884dd", [:mix], [{:meck, "~> 0.9.2", [hex: :meck, repo: "hexpm", optional: false]}], "hexpm", "7fa82364c97617d79bb7d15571193fc0c4fe5afd0c932cef09426b3ee6fe2022"}, "nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"}, "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"}, diff --git a/test/readability/queries_test.exs b/test/readability/queries_test.exs index 16749aa..439925e 100644 --- a/test/readability/queries_test.exs +++ b/test/readability/queries_test.exs @@ -7,14 +7,15 @@ defmodule Readability.QueriesTest do

   <p>
-    <font>a</font>
+    a
   </p>
   <p>
     abc
   </p>
   <div>
-    b
+    This is some text, lalala!
     <img alt="alt" />
   </div>

    @@ -24,6 +25,27 @@ defmodule Readability.QueriesTest do @html_tree Floki.parse_fragment!(@sample) test "inner text length" do - assert Queries.text_length(@html_tree) == 5 + assert Queries.text_length(@html_tree) == 30 + assert Floki.text(@html_tree) |> String.length() == 30 + end + + test "inner count characters" do + assert Queries.count_character(@html_tree, ",") == 1 + assert Floki.text(@html_tree) |> String.split(",") |> length() == 1 + 1 + + assert Queries.count_character(@html_tree, "a") == 5 + assert Floki.text(@html_tree) |> String.split("a") |> length() == 5 + 1 + end + + test "inner cached count characters" do + html_tree = Queries.cache_stats_in_attributes(@html_tree) + + assert Queries.count_character(html_tree, ",") == 1 + assert Floki.text(html_tree) |> String.split(",") |> length() == 1 + 1 + + assert Queries.count_character(html_tree, "a") == 5 + assert Floki.text(html_tree) |> String.split("a") |> length() == 5 + 1 + + Queries.clear_stats_from_attributes(html_tree) end end From 1bf2621bdbc93e4edf0117b31f146ae0a3f50b80 Mon Sep 17 00:00:00 2001 From: Valian Date: Wed, 13 Dec 2023 14:29:58 +0100 Subject: [PATCH 3/3] fixed test.watch failing on older Elixir versions --- mix.exs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mix.exs b/mix.exs index f9ed1a8..e584d29 100644 --- a/mix.exs +++ b/mix.exs @@ -30,6 +30,9 @@ defmodule Readability.Mixfile do end defp deps do + # https://github.com/lpil/mix-test.watch/pull/140#issuecomment-1853912030 + test_watch_runtime = match?(["test.watch" | _], System.argv()) + [ {:floki, "~> 0.24"}, {:httpoison, "~> 1.8 or ~> 2.0"}, @@ -38,7 +41,7 @@ defmodule Readability.Mixfile do {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false}, {:mock, "~> 0.3", only: :test}, {:excoveralls, "~> 0.18", only: :test}, - {:mix_test_watch, "~> 1.0", only: [:dev, :test]} + {:mix_test_watch, "~> 1.0", only: [:dev, :test], runtime: test_watch_runtime} ] end
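
A minimal usage sketch of the Readability.Queries module introduced in PATCH 1/3 (illustrative only, not part of the patches; the HTML snippet and variable names below are made up):

    html_tree = Floki.parse_fragment!(~s(<div><p>Hello, world</p><a href="#">link</a></div>))

    # Optionally annotate every element with :text_length and :commas so that
    # repeated queries during scoring and sanitizing become cheap lookups.
    cached_tree = Readability.Queries.cache_stats_in_attributes(html_tree)

    Readability.Queries.text_length(cached_tree)           # => 16 ("Hello, world" <> "link")
    Readability.Queries.count_character(cached_tree, ",")  # => 1
    Readability.Queries.find_tag(cached_tree, "a")         # => list containing the single <a> node

    # Strip the cached attributes before handing the tree back to callers,
    # mirroring ArticleBuilder.build/2, which calls clear_stats_from_attributes/1 after sanitizing.
    Readability.Queries.clear_stats_from_attributes(cached_tree)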