Improve performance x100 for bigger pages (#53)

* improved performance by x100 for bigger pages by caching queries * additional tests to increase test coverage * fixed test.watch failing on older Elixir versions
keepcosmos · Dec 13, 2023 · b1a6a0e · b1a6a0e
1 parent f5fbbc0
commit b1a6a0e
Show file tree

Hide file tree

Showing 10 changed files with 188 additions and 59 deletions.
diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex
@@ -8,6 +8,7 @@ defmodule Readability.ArticleBuilder do
   alias Readability.Candidate.Scoring
   alias Readability.CandidateFinder
   alias Readability.Helper
+  alias Readability.Queries
   alias Readability.Sanitizer
 
   @type html_tree :: tuple | list
@@ -35,19 +36,23 @@ defmodule Readability.ArticleBuilder do
 
     html_tree = Cleaner.transform_misused_div_to_p(html_tree)
 
-    candidates = CandidateFinder.find(html_tree, opts)
+    candidates =
+      html_tree
+      |> Queries.cache_stats_in_attributes()
+      |> CandidateFinder.find(opts)
+
     article = find_article(candidates, html_tree)
 
     html_tree = Sanitizer.sanitize(article, candidates, opts)
 
-    if Helper.text_length(html_tree) < opts[:retry_length] do
+    if Queries.text_length(html_tree) < opts[:retry_length] do
       if opts = next_try_opts(opts) do
         build(origin_tree, opts)
       else
-        html_tree
+        Queries.clear_stats_from_attributes(html_tree)
       end
     else
-      html_tree
+      Queries.clear_stats_from_attributes(html_tree)
     end
   end
 
@@ -75,7 +80,7 @@ defmodule Readability.ArticleBuilder do
         find_article_trees(best_candidate, candidates)
       else
         fallback_candidate =
-          case html_tree |> Floki.find("body") do
+          case html_tree |> Queries.find_tag("body") do
             [tree | _] -> %Candidate{html_tree: tree}
             _ -> %Candidate{html_tree: {}}
           end
@@ -99,11 +104,10 @@ defmodule Readability.ArticleBuilder do
 
   defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
     link_density = Scoring.calc_link_density(html_tree)
-    inner_text = html_tree |> Floki.text()
-    inner_length = inner_text |> String.length()
+    inner_length = Queries.text_length(html_tree)
 
     (inner_length > 80 && link_density < 0.25) ||
-      (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
+      (inner_length < 80 && link_density == 0 && Floki.text(html_tree) =~ ~r/\.( |$)/)
   end
 
   defp append?(_), do: false

diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex
@@ -2,7 +2,7 @@ defmodule Readability.Candidate.Scoring do
   @moduledoc """
   Score HTML tree.
   """
-  alias Readability.Helper
+  alias Readability.Queries
 
   @element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5}
 
@@ -27,9 +27,8 @@ defmodule Readability.Candidate.Scoring do
 
   defp calc_content_score(html_tree) do
     score = 1
-    inner_text = html_tree |> Floki.text()
-    split_score = inner_text |> String.split(",") |> length
-    length_score = [String.length(inner_text) / 100, 3] |> Enum.min()
+    split_score = Queries.count_character(html_tree, ",") + 1
+    length_score = min(Queries.text_length(html_tree) / 100, 3)
     score + split_score + length_score
   end
 
@@ -58,27 +57,23 @@ defmodule Readability.Candidate.Scoring do
   end
 
   def calc_link_density(html_tree) do
-    link_length =
-      html_tree
-      |> Floki.find("a")
-      |> Floki.text()
-      |> String.length()
-
-    text_length =
-      html_tree
-      |> Floki.text()
-      |> String.length()
+    text_length = Queries.text_length(html_tree)
 
     if text_length == 0 do
       0
     else
+      link_length =
+        html_tree
+        |> Queries.find_tag("a")
+        |> Queries.text_length()
+
       link_length / text_length
     end
   end
 
   defp calc_children_content_score({_, _, children_tree}) do
     children_tree
-    |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
+    |> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1)))
     |> calc_content_score
   end
 
@@ -88,7 +83,7 @@ defmodule Readability.Candidate.Scoring do
       |> Enum.filter(&is_tuple(&1))
       |> Enum.map(&elem(&1, 2))
       |> List.flatten()
-      |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
+      |> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1)))
       |> calc_content_score
 
     score / 2

diff --git a/lib/readability/candidate_finder.ex b/lib/readability/candidate_finder.ex
@@ -7,7 +7,7 @@ defmodule Readability.CandidateFinder do
 
   alias Readability.Candidate
   alias Readability.Candidate.Scoring
-  alias Readability.Helper
+  alias Readability.Queries
 
   @type html_tree :: tuple | list
   @type options :: list
@@ -53,14 +53,22 @@ defmodule Readability.CandidateFinder do
     |> Enum.max_by(fn candidate -> candidate.score end)
   end
 
+  @doc """
+  Check `html_tree` can be candidate or not.
+  """
+  @spec candidate_tag?(html_tree) :: boolean
+  def candidate_tag?({tag, _, _} = html_tree) do
+    (tag == "p" || tag == "td") && Queries.text_length(html_tree) >= 25
+  end
+
   defp candidate?(_, depth \\ 0)
   defp candidate?(_, depth) when depth > 2, do: false
   defp candidate?([h | t], depth), do: candidate?(h, depth) || candidate?(t, depth)
   defp candidate?([], _), do: false
   defp candidate?(text, _) when is_binary(text), do: false
 
   defp candidate?({_, _, inner_tree} = html_tree, depth) do
-    if Helper.candidate_tag?(html_tree) do
+    if candidate_tag?(html_tree) do
       true
     else
       candidate?(inner_tree, depth + 1)

diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex
@@ -80,25 +80,6 @@ defmodule Readability.Helper do
     end
   end
 
-  @doc """
-  Count only text length.
-  """
-  @spec text_length(html_tree) :: number
-  def text_length(html_tree) do
-    html_tree |> Floki.text() |> String.trim() |> String.length()
-  end
-
-  @doc """
-  Check `html_tree` can be candidate or not.
-  """
-  @spec candidate_tag?(html_tree) :: boolean
-  def candidate_tag?({tag, _, _} = html_tree) do
-    Enum.any?(["p", "td"], fn candidate_tag ->
-      tag == candidate_tag &&
-        text_length(html_tree) >= Readability.default_options()[:min_text_length]
-    end)
-  end
-
   @doc """
   Normalizes and parses to HTML tree (tuple or list)) from binary HTML.
   """

diff --git a/lib/readability/queries.ex b/lib/readability/queries.ex
@@ -0,0 +1,87 @@
+defmodule Readability.Queries do
+  @moduledoc """
+  Highly-optimized utilities for quick answers about HTML tree
+  """
+
+  @type html_tree :: tuple | list
+  @type options :: list
+
+  def cache_stats_in_attributes(html_tree) do
+    Floki.traverse_and_update(html_tree, fn
+      {tag, attrs, nodes} ->
+        attrs =
+          Keyword.put_new_lazy(attrs, :text_length, fn -> text_length({tag, attrs, nodes}) end)
+
+        attrs =
+          Keyword.put_new_lazy(attrs, :commas, fn -> count_character({tag, attrs, nodes}, ",") end)
+
+        {tag, attrs, nodes}
+
+      other ->
+        other
+    end)
+  end
+
+  def clear_stats_from_attributes(html_tree) do
+    Floki.traverse_and_update(html_tree, fn
+      {tag, attrs, nodes} ->
+        {tag, Keyword.drop(attrs, [:text_length, :commas]), nodes}
+
+      other ->
+        other
+    end)
+  end
+
+  @doc """
+  Count only text length.
+  """
+  @spec text_length(html_tree) :: number
+  def text_length(html_tree)
+  def text_length(text) when is_binary(text), do: String.length(text)
+  def text_length(nodes) when is_list(nodes), do: Enum.reduce(nodes, 0, &(&2 + text_length(&1)))
+  def text_length({:comment, _}), do: 0
+  def text_length({"br", _, _}), do: 1
+
+  def text_length({_tag, attrs, nodes}) do
+    # we precompute that value
+    Keyword.get_lazy(attrs, :text_length, fn -> text_length(nodes) end)
+  end
+
+  @doc """
+  Finds number of occurences of a given character, much faster than converting to text
+  """
+  @spec count_character(html_tree, binary) :: number
+  def count_character(<<v::utf8, rest::binary>>, <<v::utf8>> = char) do
+    1 + count_character(rest, char)
+  end
+
+  def count_character(<<_::utf8, rest::binary>>, char) do
+    count_character(rest, char)
+  end
+
+  def count_character(nodes, char) when is_list(nodes) do
+    Enum.reduce(nodes, 0, &(&2 + count_character(&1, char)))
+  end
+
+  def count_character({_tag, attrs, nodes}, ",") do
+    Keyword.get_lazy(attrs, :commas, fn -> count_character(nodes, ",") end)
+  end
+
+  def count_character({_tag, _attrs, nodes}, char), do: count_character(nodes, char)
+  def count_character(_node, _char), do: 0
+
+  @doc """
+  Finds given tags in HTML tree, much faster than using generic selector
+  """
+  @spec find_tag(html_tree, binary) :: list
+  def find_tag(html_tree, tag), do: html_tree |> find_tag_internal(tag) |> List.flatten()
+
+  def find_tag_internal(nodes, tag) when is_list(nodes),
+    do: Enum.map(nodes, &find_tag_internal(&1, tag))
+
+  def find_tag_internal({tag, _, children} = node, tag),
+    do: [node | find_tag_internal(children, tag)]
+
+  def find_tag_internal({_, _, children}, tag), do: find_tag_internal(children, tag)
+  def find_tag_internal(_, _), do: []
+end
diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex
@@ -9,6 +9,7 @@ defmodule Readability.Sanitizer do
   alias Readability.Candidate
   alias Readability.Candidate.Scoring
   alias Readability.Helper
+  alias Readability.Queries
 
   @type html_tree :: tuple | list
 
@@ -45,23 +46,23 @@ defmodule Readability.Sanitizer do
           weight + same_tree.score < 0 ->
             true
 
-          length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
+          Queries.count_character(tree, ",") < 10 ->
             # If there are not very many commas, and the number of
             # non-paragraph elements is more than paragraphs or other
             # ominous signs, remove the element.
-            p_len = tree |> Floki.find("p") |> length
-            img_len = tree |> Floki.find("img") |> length
-            li_len = tree |> Floki.find("li") |> length
-            input_len = tree |> Floki.find("input") |> length
+            p_len = tree |> Queries.find_tag("p") |> length
+            img_len = tree |> Queries.find_tag("img") |> length
+            li_len = tree |> Queries.find_tag("li") |> length
+            input_len = tree |> Queries.find_tag("input") |> length
 
             embed_len =
               tree
-              |> Floki.find("embed")
+              |> Queries.find_tag("embed")
               |> Enum.reject(&(&1 =~ Readability.regexes(:video)))
               |> length
 
             link_density = Scoring.calc_link_density(tree)
-            conent_len = Helper.text_length(tree)
+            conent_len = Queries.text_length(tree)
 
             # too many image
             # more <li>s than <p>s
@@ -93,6 +94,6 @@ defmodule Readability.Sanitizer do
   end
 
   defp clean_empty_p?({tag, _, _} = html_tree) do
-    tag == "p" && Helper.text_length(html_tree) == 0
+    tag == "p" && Queries.text_length(html_tree) == 0
   end
 end
diff --git a/mix.exs b/mix.exs
@@ -16,7 +16,8 @@ defmodule Readability.Mixfile do
         coveralls: :test,
         "coveralls.detail": :test,
         "coveralls.post": :test,
-        "coveralls.html": :test
+        "coveralls.html": :test,
+        "test.watch": :test
       ],
       package: package(),
       deps: deps(),
@@ -29,14 +30,18 @@ defmodule Readability.Mixfile do
   end
 
   defp deps do
+    # https://github.com/lpil/mix-test.watch/pull/140#issuecomment-1853912030
+    test_watch_runtime = match?(["test.watch" | _], System.argv())
+
     [
       {:floki, "~> 0.24"},
       {:httpoison, "~> 1.8 or ~> 2.0"},
       {:ex_doc, "~> 0.29", only: :dev},
       {:credo, "~> 1.6", only: [:dev, :test]},
       {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false},
       {:mock, "~> 0.3", only: :test},
-      {:excoveralls, "~> 0.18", only: :test}
+      {:excoveralls, "~> 0.18", only: :test},
+      {:mix_test_watch, "~> 1.0", only: [:dev, :test], runtime: test_watch_runtime}
     ]
   end
 

diff --git a/mix.lock b/mix.lock
@@ -19,6 +19,7 @@
   "meck": {:hex, :meck, "0.9.2", "85ccbab053f1db86c7ca240e9fc718170ee5bda03810a6292b5306bf31bae5f5", [:rebar3], [], "hexpm", "81344f561357dc40a8344afa53767c32669153355b626ea9fcbc8da6b3045826"},
   "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
   "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
+  "mix_test_watch": {:hex, :mix_test_watch, "1.1.1", "eee6fc570d77ad6851c7bc08de420a47fd1e449ef5ccfa6a77ef68b72e7e51ad", [:mix], [{:file_system, "~> 0.2.1 or ~> 0.3", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "f82262b54dee533467021723892e15c3267349849f1f737526523ecba4e6baae"},
   "mock": {:hex, :mock, "0.3.8", "7046a306b71db2488ef54395eeb74df0a7f335a7caca4a3d3875d1fc81c884dd", [:mix], [{:meck, "~> 0.9.2", [hex: :meck, repo: "hexpm", optional: false]}], "hexpm", "7fa82364c97617d79bb7d15571193fc0c4fe5afd0c932cef09426b3ee6fe2022"},
   "nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"},
   "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},

diff --git a/test/readability/helper_test.exs b/test/readability/helper_test.exs
@@ -43,10 +43,6 @@ defmodule Readability.HelperTest do
     assert result == expected
   end
 
-  test "inner text length" do
-    assert Helper.text_length(@html_tree) == 5
-  end
-
   test "strips out special case tags" do
     html =
       "<html><body><p>Hello <? echo esc_html( wired_get_the_byline_name( $related_video ) ); ?></p></body></html>"