From 46e0e552e59b93177a55d299d9075c7a07d85f7f Mon Sep 17 00:00:00 2001 From: Valian Date: Tue, 5 Dec 2023 17:05:08 +0100 Subject: [PATCH 1/3] improved performance by x100 for bigger pages by caching queries --- lib/readability/article_builder.ex | 20 ++++--- lib/readability/candidate/scoring.ex | 27 ++++----- lib/readability/candidate_finder.ex | 12 +++- lib/readability/helper.ex | 19 ------ lib/readability/queries.ex | 87 ++++++++++++++++++++++++++++ lib/readability/sanitizer.ex | 17 +++--- test/readability/helper_test.exs | 4 -- test/readability/queries_test.exs | 29 ++++++++++ 8 files changed, 158 insertions(+), 57 deletions(-) create mode 100644 lib/readability/queries.ex create mode 100644 test/readability/queries_test.exs diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex index 56d2817..948f76d 100644 --- a/lib/readability/article_builder.ex +++ b/lib/readability/article_builder.ex @@ -8,6 +8,7 @@ defmodule Readability.ArticleBuilder do alias Readability.Candidate.Scoring alias Readability.CandidateFinder alias Readability.Helper + alias Readability.Queries alias Readability.Sanitizer @type html_tree :: tuple | list @@ -35,19 +36,23 @@ defmodule Readability.ArticleBuilder do html_tree = Cleaner.transform_misused_div_to_p(html_tree) - candidates = CandidateFinder.find(html_tree, opts) + candidates = + html_tree + |> Queries.cache_stats_in_attributes() + |> CandidateFinder.find(opts) + article = find_article(candidates, html_tree) html_tree = Sanitizer.sanitize(article, candidates, opts) - if Helper.text_length(html_tree) < opts[:retry_length] do + if Queries.text_length(html_tree) < opts[:retry_length] do if opts = next_try_opts(opts) do build(origin_tree, opts) else - html_tree + Queries.clear_stats_from_attributes(html_tree) end else - html_tree + Queries.clear_stats_from_attributes(html_tree) end end @@ -75,7 +80,7 @@ defmodule Readability.ArticleBuilder do find_article_trees(best_candidate, candidates) else fallback_candidate = - case html_tree |> Floki.find("body") do + case html_tree |> Queries.find_tag("body") do [tree | _] -> %Candidate{html_tree: tree} _ -> %Candidate{html_tree: {}} end @@ -99,11 +104,10 @@ defmodule Readability.ArticleBuilder do defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do link_density = Scoring.calc_link_density(html_tree) - inner_text = html_tree |> Floki.text() - inner_length = inner_text |> String.length() + inner_length = Queries.text_length(html_tree) (inner_length > 80 && link_density < 0.25) || - (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/) + (inner_length < 80 && link_density == 0 && Floki.text(html_tree) =~ ~r/\.( |$)/) end defp append?(_), do: false diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex index 8e4e06a..ad55ab9 100644 --- a/lib/readability/candidate/scoring.ex +++ b/lib/readability/candidate/scoring.ex @@ -2,7 +2,7 @@ defmodule Readability.Candidate.Scoring do @moduledoc """ Score HTML tree. 
""" - alias Readability.Helper + alias Readability.Queries @element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5} @@ -27,9 +27,8 @@ defmodule Readability.Candidate.Scoring do defp calc_content_score(html_tree) do score = 1 - inner_text = html_tree |> Floki.text() - split_score = inner_text |> String.split(",") |> length - length_score = [String.length(inner_text) / 100, 3] |> Enum.min() + split_score = Queries.count_character(html_tree, ",") + 1 + length_score = min(Queries.text_length(html_tree) / 100, 3) score + split_score + length_score end @@ -58,27 +57,23 @@ defmodule Readability.Candidate.Scoring do end def calc_link_density(html_tree) do - link_length = - html_tree - |> Floki.find("a") - |> Floki.text() - |> String.length() - - text_length = - html_tree - |> Floki.text() - |> String.length() + text_length = Queries.text_length(html_tree) if text_length == 0 do 0 else + link_length = + html_tree + |> Queries.find_tag("a") + |> Queries.text_length() + link_length / text_length end end defp calc_children_content_score({_, _, children_tree}) do children_tree - |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1))) + |> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1))) |> calc_content_score end @@ -88,7 +83,7 @@ defmodule Readability.Candidate.Scoring do |> Enum.filter(&is_tuple(&1)) |> Enum.map(&elem(&1, 2)) |> List.flatten() - |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1))) + |> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1))) |> calc_content_score score / 2 diff --git a/lib/readability/candidate_finder.ex b/lib/readability/candidate_finder.ex index 3099892..8d10653 100644 --- a/lib/readability/candidate_finder.ex +++ b/lib/readability/candidate_finder.ex @@ -7,7 +7,7 @@ defmodule Readability.CandidateFinder do alias Readability.Candidate alias Readability.Candidate.Scoring - alias Readability.Helper + alias Readability.Queries @type html_tree :: tuple | list @type options :: list @@ -53,6 +53,14 @@ defmodule Readability.CandidateFinder do |> Enum.max_by(fn candidate -> candidate.score end) end + @doc """ + Check `html_tree` can be candidate or not. + """ + @spec candidate_tag?(html_tree) :: boolean + def candidate_tag?({tag, _, _} = html_tree) do + (tag == "p" || tag == "td") && Queries.text_length(html_tree) >= 25 + end + defp candidate?(_, depth \\ 0) defp candidate?(_, depth) when depth > 2, do: false defp candidate?([h | t], depth), do: candidate?(h, depth) || candidate?(t, depth) @@ -60,7 +68,7 @@ defmodule Readability.CandidateFinder do defp candidate?(text, _) when is_binary(text), do: false defp candidate?({_, _, inner_tree} = html_tree, depth) do - if Helper.candidate_tag?(html_tree) do + if candidate_tag?(html_tree) do true else candidate?(inner_tree, depth + 1) diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex index 315de37..1f594c3 100644 --- a/lib/readability/helper.ex +++ b/lib/readability/helper.ex @@ -80,25 +80,6 @@ defmodule Readability.Helper do end end - @doc """ - Count only text length. - """ - @spec text_length(html_tree) :: number - def text_length(html_tree) do - html_tree |> Floki.text() |> String.trim() |> String.length() - end - - @doc """ - Check `html_tree` can be candidate or not. 
- """ - @spec candidate_tag?(html_tree) :: boolean - def candidate_tag?({tag, _, _} = html_tree) do - Enum.any?(["p", "td"], fn candidate_tag -> - tag == candidate_tag && - text_length(html_tree) >= Readability.default_options()[:min_text_length] - end) - end - @doc """ Normalizes and parses to HTML tree (tuple or list)) from binary HTML. """ diff --git a/lib/readability/queries.ex b/lib/readability/queries.ex new file mode 100644 index 0000000..45d4eee --- /dev/null +++ b/lib/readability/queries.ex @@ -0,0 +1,87 @@ +defmodule Readability.Queries do + @moduledoc """ + Highly-optimized utilities for quick answers about HTML tree + """ + + @type html_tree :: tuple | list + @type options :: list + + def cache_stats_in_attributes(html_tree) do + Floki.traverse_and_update(html_tree, fn + {tag, attrs, nodes} -> + attrs = + Keyword.put_new_lazy(attrs, :text_length, fn -> text_length({tag, attrs, nodes}) end) + + attrs = + Keyword.put_new_lazy(attrs, :commas, fn -> count_character({tag, attrs, nodes}, ",") end) + + {tag, attrs, nodes} + + other -> + other + end) + end + + def clear_stats_from_attributes(html_tree) do + Floki.traverse_and_update(html_tree, fn + {tag, attrs, nodes} -> + {tag, Keyword.drop(attrs, [:text_length, :commas]), nodes} + + other -> + other + end) + end + + @doc """ + Count only text length. + """ + @spec text_length(html_tree) :: number + def text_length(html_tree) + def text_length(text) when is_binary(text), do: String.length(text) + def text_length(nodes) when is_list(nodes), do: Enum.reduce(nodes, 0, &(&2 + text_length(&1))) + def text_length({:comment, _}), do: 0 + def text_length({"br", _, _}), do: 1 + + def text_length({_tag, attrs, nodes}) do + # we precompute that value + Keyword.get_lazy(attrs, :text_length, fn -> text_length(nodes) end) + end + + @doc """ + Finds number of occurences of a given character, much faster than converting to text + """ + @spec count_character(html_tree, binary) :: number + def count_character(<>, <> = char) do + 1 + count_character(rest, char) + end + + def count_character(<<_::utf8, rest::binary>>, char) do + count_character(rest, char) + end + + def count_character(nodes, char) when is_list(nodes) do + Enum.reduce(nodes, 0, &(&2 + count_character(&1, char))) + end + + def count_character({_tag, attrs, nodes}, ",") do + Keyword.get_lazy(attrs, :commas, fn -> count_character(nodes, ",") end) + end + + def count_character({_tag, _attrs, nodes}, char), do: count_character(nodes, char) + def count_character(_node, _char), do: 0 + + @doc """ + Finds given tags in HTML tree, much faster than using generic selector + """ + @spec find_tag(html_tree, binary) :: list + def find_tag(html_tree, tag), do: html_tree |> find_tag_internal(tag) |> List.flatten() + + def find_tag_internal(nodes, tag) when is_list(nodes), + do: Enum.map(nodes, &find_tag_internal(&1, tag)) + + def find_tag_internal({tag, _, children} = node, tag), + do: [node | find_tag_internal(children, tag)] + + def find_tag_internal({_, _, children}, tag), do: find_tag_internal(children, tag) + def find_tag_internal(_, _), do: [] +end diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex index b68fda6..26f134b 100644 --- a/lib/readability/sanitizer.ex +++ b/lib/readability/sanitizer.ex @@ -9,6 +9,7 @@ defmodule Readability.Sanitizer do alias Readability.Candidate alias Readability.Candidate.Scoring alias Readability.Helper + alias Readability.Queries @type html_tree :: tuple | list @@ -45,23 +46,23 @@ defmodule Readability.Sanitizer do weight + same_tree.score < 0 
-> true - length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 -> + Queries.count_character(tree, ",") < 10 -> # If there are not very many commas, and the number of # non-paragraph elements is more than paragraphs or other # ominous signs, remove the element. - p_len = tree |> Floki.find("p") |> length - img_len = tree |> Floki.find("img") |> length - li_len = tree |> Floki.find("li") |> length - input_len = tree |> Floki.find("input") |> length + p_len = tree |> Queries.find_tag("p") |> length + img_len = tree |> Queries.find_tag("img") |> length + li_len = tree |> Queries.find_tag("li") |> length + input_len = tree |> Queries.find_tag("input") |> length embed_len = tree - |> Floki.find("embed") + |> Queries.find_tag("embed") |> Enum.reject(&(&1 =~ Readability.regexes(:video))) |> length link_density = Scoring.calc_link_density(tree) - conent_len = Helper.text_length(tree) + conent_len = Queries.text_length(tree) # too many image # more
<li>s than <p>

    s @@ -93,6 +94,6 @@ defmodule Readability.Sanitizer do end defp clean_empty_p?({tag, _, _} = html_tree) do - tag == "p" && Helper.text_length(html_tree) == 0 + tag == "p" && Queries.text_length(html_tree) == 0 end end diff --git a/test/readability/helper_test.exs b/test/readability/helper_test.exs index abd919a..0602310 100644 --- a/test/readability/helper_test.exs +++ b/test/readability/helper_test.exs @@ -43,10 +43,6 @@ defmodule Readability.HelperTest do assert result == expected end - test "inner text length" do - assert Helper.text_length(@html_tree) == 5 - end - test "strips out special case tags" do html = "

    Hello

    " diff --git a/test/readability/queries_test.exs b/test/readability/queries_test.exs new file mode 100644 index 0000000..16749aa --- /dev/null +++ b/test/readability/queries_test.exs @@ -0,0 +1,29 @@ +defmodule Readability.QueriesTest do + use ExUnit.Case, async: true + + alias Readability.Queries + + @sample """ + + +

+  <p>
+    <font>a</font>
+  </p>
+  <p>
+    abc
+  </p>
+  <div>
+    b
+    <img alt="alt" />
+  </div>

    + + + """ + + @html_tree Floki.parse_fragment!(@sample) + + test "inner text length" do + assert Queries.text_length(@html_tree) == 5 + end +end From 88b29edbc30b4ede2756640c30a020a66b87ca3e Mon Sep 17 00:00:00 2001 From: Valian Date: Wed, 13 Dec 2023 14:03:07 +0100 Subject: [PATCH 2/3] additional tests to increase test coverage --- mix.exs | 6 ++++-- mix.lock | 1 + test/readability/queries_test.exs | 28 +++++++++++++++++++++++++--- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/mix.exs b/mix.exs index 905bedc..f9ed1a8 100644 --- a/mix.exs +++ b/mix.exs @@ -16,7 +16,8 @@ defmodule Readability.Mixfile do coveralls: :test, "coveralls.detail": :test, "coveralls.post": :test, - "coveralls.html": :test + "coveralls.html": :test, + "test.watch": :test ], package: package(), deps: deps(), @@ -36,7 +37,8 @@ defmodule Readability.Mixfile do {:credo, "~> 1.6", only: [:dev, :test]}, {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false}, {:mock, "~> 0.3", only: :test}, - {:excoveralls, "~> 0.18", only: :test} + {:excoveralls, "~> 0.18", only: :test}, + {:mix_test_watch, "~> 1.0", only: [:dev, :test]} ] end diff --git a/mix.lock b/mix.lock index 179d220..c5fc5f6 100644 --- a/mix.lock +++ b/mix.lock @@ -19,6 +19,7 @@ "meck": {:hex, :meck, "0.9.2", "85ccbab053f1db86c7ca240e9fc718170ee5bda03810a6292b5306bf31bae5f5", [:rebar3], [], "hexpm", "81344f561357dc40a8344afa53767c32669153355b626ea9fcbc8da6b3045826"}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, + "mix_test_watch": {:hex, :mix_test_watch, "1.1.1", "eee6fc570d77ad6851c7bc08de420a47fd1e449ef5ccfa6a77ef68b72e7e51ad", [:mix], [{:file_system, "~> 0.2.1 or ~> 0.3", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "f82262b54dee533467021723892e15c3267349849f1f737526523ecba4e6baae"}, "mock": {:hex, :mock, "0.3.8", "7046a306b71db2488ef54395eeb74df0a7f335a7caca4a3d3875d1fc81c884dd", [:mix], [{:meck, "~> 0.9.2", [hex: :meck, repo: "hexpm", optional: false]}], "hexpm", "7fa82364c97617d79bb7d15571193fc0c4fe5afd0c932cef09426b3ee6fe2022"}, "nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"}, "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"}, diff --git a/test/readability/queries_test.exs b/test/readability/queries_test.exs index 16749aa..439925e 100644 --- a/test/readability/queries_test.exs +++ b/test/readability/queries_test.exs @@ -7,14 +7,15 @@ defmodule Readability.QueriesTest do

   <p>
-    <font>a</font>
+    a
   </p>
   <p>
     abc
   </p>
   <div>
-    b
+    This is some text, lalala!
     <img alt="alt" />
   </div>

    @@ -24,6 +25,27 @@ defmodule Readability.QueriesTest do @html_tree Floki.parse_fragment!(@sample) test "inner text length" do - assert Queries.text_length(@html_tree) == 5 + assert Queries.text_length(@html_tree) == 30 + assert Floki.text(@html_tree) |> String.length() == 30 + end + + test "inner count characters" do + assert Queries.count_character(@html_tree, ",") == 1 + assert Floki.text(@html_tree) |> String.split(",") |> length() == 1 + 1 + + assert Queries.count_character(@html_tree, "a") == 5 + assert Floki.text(@html_tree) |> String.split("a") |> length() == 5 + 1 + end + + test "inner cached count characters" do + html_tree = Queries.cache_stats_in_attributes(@html_tree) + + assert Queries.count_character(html_tree, ",") == 1 + assert Floki.text(html_tree) |> String.split(",") |> length() == 1 + 1 + + assert Queries.count_character(html_tree, "a") == 5 + assert Floki.text(html_tree) |> String.split("a") |> length() == 5 + 1 + + Queries.clear_stats_from_attributes(html_tree) end end From 1bf2621bdbc93e4edf0117b31f146ae0a3f50b80 Mon Sep 17 00:00:00 2001 From: Valian Date: Wed, 13 Dec 2023 14:29:58 +0100 Subject: [PATCH 3/3] fixed test.watch failing on older Elixir versions --- mix.exs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mix.exs b/mix.exs index f9ed1a8..e584d29 100644 --- a/mix.exs +++ b/mix.exs @@ -30,6 +30,9 @@ defmodule Readability.Mixfile do end defp deps do + # https://github.com/lpil/mix-test.watch/pull/140#issuecomment-1853912030 + test_watch_runtime = match?(["test.watch" | _], System.argv()) + [ {:floki, "~> 0.24"}, {:httpoison, "~> 1.8 or ~> 2.0"}, @@ -38,7 +41,7 @@ defmodule Readability.Mixfile do {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false}, {:mock, "~> 0.3", only: :test}, {:excoveralls, "~> 0.18", only: :test}, - {:mix_test_watch, "~> 1.0", only: [:dev, :test]} + {:mix_test_watch, "~> 1.0", only: [:dev, :test], runtime: test_watch_runtime} ] end
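
A minimal usage sketch of the Readability.Queries module introduced in PATCH 1/3 (illustrative only, not part of the patches; the HTML snippet and variable names below are made up):

    html_tree = Floki.parse_fragment!(~s(<div><p>Hello, world</p><a href="#">link</a></div>))

    # Optionally annotate every element with :text_length and :commas so that
    # repeated queries during scoring and sanitizing become cheap lookups.
    cached_tree = Readability.Queries.cache_stats_in_attributes(html_tree)

    Readability.Queries.text_length(cached_tree)           # => 16 ("Hello, world" <> "link")
    Readability.Queries.count_character(cached_tree, ",")  # => 1
    Readability.Queries.find_tag(cached_tree, "a")         # => list containing the single <a> node

    # Strip the cached attributes before handing the tree back to callers,
    # mirroring ArticleBuilder.build/2, which calls clear_stats_from_attributes/1 after sanitizing.
    Readability.Queries.clear_stats_from_attributes(cached_tree)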