Skip to content

Commit

Permalink
Improve performance x100 for bigger pages (#53)
Browse files Browse the repository at this point in the history
* improved performance by x100 for bigger pages by caching queries

* additional tests to increase test coverage

* fixed test.watch failing on older Elixir versions
  • Loading branch information
Valian authored Dec 13, 2023
1 parent f5fbbc0 commit b1a6a0e
Show file tree
Hide file tree
Showing 10 changed files with 188 additions and 59 deletions.
20 changes: 12 additions & 8 deletions lib/readability/article_builder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ defmodule Readability.ArticleBuilder do
alias Readability.Candidate.Scoring
alias Readability.CandidateFinder
alias Readability.Helper
alias Readability.Queries
alias Readability.Sanitizer

@type html_tree :: tuple | list
Expand Down Expand Up @@ -35,19 +36,23 @@ defmodule Readability.ArticleBuilder do

html_tree = Cleaner.transform_misused_div_to_p(html_tree)

candidates = CandidateFinder.find(html_tree, opts)
candidates =
html_tree
|> Queries.cache_stats_in_attributes()
|> CandidateFinder.find(opts)

article = find_article(candidates, html_tree)

html_tree = Sanitizer.sanitize(article, candidates, opts)

if Helper.text_length(html_tree) < opts[:retry_length] do
if Queries.text_length(html_tree) < opts[:retry_length] do
if opts = next_try_opts(opts) do
build(origin_tree, opts)
else
html_tree
Queries.clear_stats_from_attributes(html_tree)
end
else
html_tree
Queries.clear_stats_from_attributes(html_tree)
end
end

Expand Down Expand Up @@ -75,7 +80,7 @@ defmodule Readability.ArticleBuilder do
find_article_trees(best_candidate, candidates)
else
fallback_candidate =
case html_tree |> Floki.find("body") do
case html_tree |> Queries.find_tag("body") do
[tree | _] -> %Candidate{html_tree: tree}
_ -> %Candidate{html_tree: {}}
end
Expand All @@ -99,11 +104,10 @@ defmodule Readability.ArticleBuilder do

defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
link_density = Scoring.calc_link_density(html_tree)
inner_text = html_tree |> Floki.text()
inner_length = inner_text |> String.length()
inner_length = Queries.text_length(html_tree)

(inner_length > 80 && link_density < 0.25) ||
(inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
(inner_length < 80 && link_density == 0 && Floki.text(html_tree) =~ ~r/\.( |$)/)
end

defp append?(_), do: false
Expand Down
27 changes: 11 additions & 16 deletions lib/readability/candidate/scoring.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule Readability.Candidate.Scoring do
@moduledoc """
Score HTML tree.
"""
alias Readability.Helper
alias Readability.Queries

@element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5}

Expand All @@ -27,9 +27,8 @@ defmodule Readability.Candidate.Scoring do

defp calc_content_score(html_tree) do
score = 1
inner_text = html_tree |> Floki.text()
split_score = inner_text |> String.split(",") |> length
length_score = [String.length(inner_text) / 100, 3] |> Enum.min()
split_score = Queries.count_character(html_tree, ",") + 1
length_score = min(Queries.text_length(html_tree) / 100, 3)
score + split_score + length_score
end

Expand Down Expand Up @@ -58,27 +57,23 @@ defmodule Readability.Candidate.Scoring do
end

def calc_link_density(html_tree) do
link_length =
html_tree
|> Floki.find("a")
|> Floki.text()
|> String.length()

text_length =
html_tree
|> Floki.text()
|> String.length()
text_length = Queries.text_length(html_tree)

if text_length == 0 do
0
else
link_length =
html_tree
|> Queries.find_tag("a")
|> Queries.text_length()

link_length / text_length
end
end

defp calc_children_content_score({_, _, children_tree}) do
children_tree
|> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
|> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1)))
|> calc_content_score
end

Expand All @@ -88,7 +83,7 @@ defmodule Readability.Candidate.Scoring do
|> Enum.filter(&is_tuple(&1))
|> Enum.map(&elem(&1, 2))
|> List.flatten()
|> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
|> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1)))
|> calc_content_score

score / 2
Expand Down
12 changes: 10 additions & 2 deletions lib/readability/candidate_finder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ defmodule Readability.CandidateFinder do

alias Readability.Candidate
alias Readability.Candidate.Scoring
alias Readability.Helper
alias Readability.Queries

@type html_tree :: tuple | list
@type options :: list
Expand Down Expand Up @@ -53,14 +53,22 @@ defmodule Readability.CandidateFinder do
|> Enum.max_by(fn candidate -> candidate.score end)
end

@doc """
Checks whether `html_tree` can be a candidate or not.

Returns `true` only for `p` and `td` elements whose text length, as reported
by `Readability.Queries.text_length/1`, is at least 25. Any other node,
including non-element nodes such as `{:comment, text}`, yields `false`.
"""
@spec candidate_tag?(html_tree) :: boolean
def candidate_tag?({tag, _, _} = html_tree) when tag in ["p", "td"] do
  # NOTE(review): 25 is presumably the default `:min_text_length` option that
  # this check used to read at runtime - confirm the two stay in sync.
  Queries.text_length(html_tree) >= 25
end

# Callers only guard with `is_tuple/1`, so nodes that are not 3-tuples can
# reach this predicate; treat them as non-candidates instead of raising.
def candidate_tag?(_), do: false

defp candidate?(_, depth \\ 0)
defp candidate?(_, depth) when depth > 2, do: false
defp candidate?([h | t], depth), do: candidate?(h, depth) || candidate?(t, depth)
defp candidate?([], _), do: false
defp candidate?(text, _) when is_binary(text), do: false

defp candidate?({_, _, inner_tree} = html_tree, depth) do
if Helper.candidate_tag?(html_tree) do
if candidate_tag?(html_tree) do
true
else
candidate?(inner_tree, depth + 1)
Expand Down
19 changes: 0 additions & 19 deletions lib/readability/helper.ex
Original file line number Diff line number Diff line change
Expand Up @@ -80,25 +80,6 @@ defmodule Readability.Helper do
end
end

@doc """
Count only text length.
"""
@spec text_length(html_tree) :: number
def text_length(html_tree) do
html_tree |> Floki.text() |> String.trim() |> String.length()
end

@doc """
Check `html_tree` can be candidate or not.
"""
@spec candidate_tag?(html_tree) :: boolean
def candidate_tag?({tag, _, _} = html_tree) do
Enum.any?(["p", "td"], fn candidate_tag ->
tag == candidate_tag &&
text_length(html_tree) >= Readability.default_options()[:min_text_length]
end)
end

@doc """
Normalizes and parses to HTML tree (tuple or list)) from binary HTML.
"""
Expand Down
87 changes: 87 additions & 0 deletions lib/readability/queries.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
defmodule Readability.Queries do
  @moduledoc """
  Highly-optimized utilities for quick answers about HTML tree.

  Text length and comma count can be precomputed once with
  `cache_stats_in_attributes/1`, which stores them on every element under the
  `:text_length` and `:commas` attribute keys. `text_length/1` and
  `count_character/2` read those keys when present and fall back to walking
  the children otherwise. `clear_stats_from_attributes/1` removes the keys
  again.
  """

  @type html_tree :: tuple | list
  @type options :: list

  # Attribute keys under which per-element statistics are cached.
  @stat_keys [:text_length, :commas]

  @doc """
  Stores the text length and the comma count of every element in its attributes.

  Values already present under `:text_length` / `:commas` are kept as they are.
  Nodes that are not elements (text, comments, processing instructions,
  doctypes) are returned unchanged.
  """
  @spec cache_stats_in_attributes(html_tree) :: html_tree
  def cache_stats_in_attributes(html_tree) do
    # NOTE(review): cheap only if Floki visits children before their parent,
    # so each element can reuse its children's cached values - confirm against
    # the Floki.traverse_and_update/2 documentation.
    Floki.traverse_and_update(html_tree, fn
      # Only real elements have a binary tag; `{:pi, name, attrs}` is also a
      # 3-tuple but carries a binary where the attribute list would be.
      {tag, attrs, nodes} when is_binary(tag) ->
        attrs =
          Keyword.put_new_lazy(attrs, :text_length, fn -> text_length({tag, attrs, nodes}) end)

        attrs =
          Keyword.put_new_lazy(attrs, :commas, fn -> count_character({tag, attrs, nodes}, ",") end)

        {tag, attrs, nodes}

      other ->
        other
    end)
  end

  @doc """
  Removes the statistics added by `cache_stats_in_attributes/1` from every element.
  """
  @spec clear_stats_from_attributes(html_tree) :: html_tree
  def clear_stats_from_attributes(html_tree) do
    Floki.traverse_and_update(html_tree, fn
      {tag, attrs, nodes} when is_binary(tag) ->
        {tag, Keyword.drop(attrs, @stat_keys), nodes}

      other ->
        other
    end)
  end

  @doc """
  Count only text length.

  Accepts a text node, a single node or a list of nodes. The text is not
  trimmed. A `br` element counts as one character; comments, processing
  instructions and doctypes count as zero. For any other element the value
  cached under `:text_length` is returned when present, otherwise the lengths
  of the children are summed.
  """
  @spec text_length(html_tree | binary) :: number
  def text_length(html_tree)
  def text_length(text) when is_binary(text), do: String.length(text)
  def text_length(nodes) when is_list(nodes), do: Enum.reduce(nodes, 0, &(&2 + text_length(&1)))
  def text_length({:comment, _}), do: 0
  def text_length({:pi, _, _}), do: 0
  def text_length({:doctype, _, _, _}), do: 0
  def text_length({"br", _, _}), do: 1

  def text_length({_tag, attrs, nodes}) do
    # we precompute that value
    Keyword.get_lazy(attrs, :text_length, fn -> text_length(nodes) end)
  end

  @doc """
  Finds number of occurrences of a given character, much faster than converting to text.

  `char` is a binary holding a single UTF-8 code point. When counting `","`
  on an element, the value cached under `:commas` is returned when present.
  Nodes that are neither text, lists nor elements contribute zero.
  """
  @spec count_character(html_tree | binary, binary) :: number
  def count_character(text, char) when is_binary(text), do: count_in_binary(text, char, 0)

  def count_character(nodes, char) when is_list(nodes) do
    Enum.reduce(nodes, 0, &(&2 + count_character(&1, char)))
  end

  def count_character({tag, attrs, nodes}, ",") when is_binary(tag) do
    Keyword.get_lazy(attrs, :commas, fn -> count_character(nodes, ",") end)
  end

  def count_character({tag, _attrs, nodes}, char) when is_binary(tag),
    do: count_character(nodes, char)

  def count_character(_node, _char), do: 0

  # Tail-recursive scan of a binary, one UTF-8 code point at a time. Counting
  # stops at the end of the binary or at the first byte sequence that is not
  # valid UTF-8, returning what was counted so far.
  defp count_in_binary(<<v::utf8, rest::binary>>, <<v::utf8>> = char, acc),
    do: count_in_binary(rest, char, acc + 1)

  defp count_in_binary(<<_::utf8, rest::binary>>, char, acc),
    do: count_in_binary(rest, char, acc)

  defp count_in_binary(_rest, _char, acc), do: acc

  @doc """
  Finds given tags in HTML tree, much faster than using generic selector.

  Returns a flat list of every element whose tag equals `tag`, including
  matches nested inside other matches, in the order they are encountered.
  """
  @spec find_tag(html_tree, binary) :: list
  def find_tag(html_tree, tag), do: html_tree |> find_tag_internal(tag) |> List.flatten()

  # Implementation detail of `find_tag/2`; returns nested lists that the
  # caller flattens. Kept public for backward compatibility.
  @doc false
  def find_tag_internal(nodes, tag) when is_list(nodes),
    do: Enum.map(nodes, &find_tag_internal(&1, tag))

  def find_tag_internal({tag, _, children} = node, tag),
    do: [node | find_tag_internal(children, tag)]

  def find_tag_internal({_, _, children}, tag), do: find_tag_internal(children, tag)
  def find_tag_internal(_, _), do: []
end
17 changes: 9 additions & 8 deletions lib/readability/sanitizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ defmodule Readability.Sanitizer do
alias Readability.Candidate
alias Readability.Candidate.Scoring
alias Readability.Helper
alias Readability.Queries

@type html_tree :: tuple | list

Expand Down Expand Up @@ -45,23 +46,23 @@ defmodule Readability.Sanitizer do
weight + same_tree.score < 0 ->
true

length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
Queries.count_character(tree, ",") < 10 ->
# If there are not very many commas, and the number of
# non-paragraph elements is more than paragraphs or other
# ominous signs, remove the element.
p_len = tree |> Floki.find("p") |> length
img_len = tree |> Floki.find("img") |> length
li_len = tree |> Floki.find("li") |> length
input_len = tree |> Floki.find("input") |> length
p_len = tree |> Queries.find_tag("p") |> length
img_len = tree |> Queries.find_tag("img") |> length
li_len = tree |> Queries.find_tag("li") |> length
input_len = tree |> Queries.find_tag("input") |> length

embed_len =
tree
|> Floki.find("embed")
|> Queries.find_tag("embed")
|> Enum.reject(&(&1 =~ Readability.regexes(:video)))
|> length

link_density = Scoring.calc_link_density(tree)
conent_len = Helper.text_length(tree)
conent_len = Queries.text_length(tree)

# too many image
# more <li>s than <p>s
Expand Down Expand Up @@ -93,6 +94,6 @@ defmodule Readability.Sanitizer do
end

defp clean_empty_p?({tag, _, _} = html_tree) do
tag == "p" && Helper.text_length(html_tree) == 0
tag == "p" && Queries.text_length(html_tree) == 0
end
end
9 changes: 7 additions & 2 deletions mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ defmodule Readability.Mixfile do
coveralls: :test,
"coveralls.detail": :test,
"coveralls.post": :test,
"coveralls.html": :test
"coveralls.html": :test,
"test.watch": :test
],
package: package(),
deps: deps(),
Expand All @@ -29,14 +30,18 @@ defmodule Readability.Mixfile do
end

defp deps do
# https://github.com/lpil/mix-test.watch/pull/140#issuecomment-1853912030
test_watch_runtime = match?(["test.watch" | _], System.argv())

[
{:floki, "~> 0.24"},
{:httpoison, "~> 1.8 or ~> 2.0"},
{:ex_doc, "~> 0.29", only: :dev},
{:credo, "~> 1.6", only: [:dev, :test]},
{:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false},
{:mock, "~> 0.3", only: :test},
{:excoveralls, "~> 0.18", only: :test}
{:excoveralls, "~> 0.18", only: :test},
{:mix_test_watch, "~> 1.0", only: [:dev, :test], runtime: test_watch_runtime}
]
end

Expand Down
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"meck": {:hex, :meck, "0.9.2", "85ccbab053f1db86c7ca240e9fc718170ee5bda03810a6292b5306bf31bae5f5", [:rebar3], [], "hexpm", "81344f561357dc40a8344afa53767c32669153355b626ea9fcbc8da6b3045826"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
"mix_test_watch": {:hex, :mix_test_watch, "1.1.1", "eee6fc570d77ad6851c7bc08de420a47fd1e449ef5ccfa6a77ef68b72e7e51ad", [:mix], [{:file_system, "~> 0.2.1 or ~> 0.3", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "f82262b54dee533467021723892e15c3267349849f1f737526523ecba4e6baae"},
"mock": {:hex, :mock, "0.3.8", "7046a306b71db2488ef54395eeb74df0a7f335a7caca4a3d3875d1fc81c884dd", [:mix], [{:meck, "~> 0.9.2", [hex: :meck, repo: "hexpm", optional: false]}], "hexpm", "7fa82364c97617d79bb7d15571193fc0c4fe5afd0c932cef09426b3ee6fe2022"},
"nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"},
"parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
Expand Down
4 changes: 0 additions & 4 deletions test/readability/helper_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@ defmodule Readability.HelperTest do
assert result == expected
end

test "inner text length" do
assert Helper.text_length(@html_tree) == 5
end

test "strips out special case tags" do
html =
"<html><body><p>Hello <? echo esc_html( wired_get_the_byline_name( $related_video ) ); ?></p></body></html>"
Expand Down
Loading

0 comments on commit b1a6a0e

Please sign in to comment.