Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance x100 for bigger pages #53

Merged
merged 3 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions lib/readability/article_builder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ defmodule Readability.ArticleBuilder do
alias Readability.Candidate.Scoring
alias Readability.CandidateFinder
alias Readability.Helper
alias Readability.Queries
alias Readability.Sanitizer

@type html_tree :: tuple | list
Expand Down Expand Up @@ -35,19 +36,23 @@ defmodule Readability.ArticleBuilder do

html_tree = Cleaner.transform_misused_div_to_p(html_tree)

candidates = CandidateFinder.find(html_tree, opts)
candidates =
html_tree
|> Queries.cache_stats_in_attributes()
|> CandidateFinder.find(opts)

article = find_article(candidates, html_tree)

html_tree = Sanitizer.sanitize(article, candidates, opts)

if Helper.text_length(html_tree) < opts[:retry_length] do
if Queries.text_length(html_tree) < opts[:retry_length] do
if opts = next_try_opts(opts) do
build(origin_tree, opts)
else
html_tree
Queries.clear_stats_from_attributes(html_tree)
end
else
html_tree
Queries.clear_stats_from_attributes(html_tree)
end
end

Expand Down Expand Up @@ -75,7 +80,7 @@ defmodule Readability.ArticleBuilder do
find_article_trees(best_candidate, candidates)
else
fallback_candidate =
case html_tree |> Floki.find("body") do
case html_tree |> Queries.find_tag("body") do
[tree | _] -> %Candidate{html_tree: tree}
_ -> %Candidate{html_tree: {}}
end
Expand All @@ -99,11 +104,10 @@ defmodule Readability.ArticleBuilder do

defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
link_density = Scoring.calc_link_density(html_tree)
inner_text = html_tree |> Floki.text()
inner_length = inner_text |> String.length()
inner_length = Queries.text_length(html_tree)

(inner_length > 80 && link_density < 0.25) ||
(inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
(inner_length < 80 && link_density == 0 && Floki.text(html_tree) =~ ~r/\.( |$)/)
end

defp append?(_), do: false
Expand Down
27 changes: 11 additions & 16 deletions lib/readability/candidate/scoring.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule Readability.Candidate.Scoring do
@moduledoc """
Score HTML tree.
"""
alias Readability.Helper
alias Readability.Queries

@element_scores %{"div" => 5, "blockquote" => 3, "form" => -3, "th" => -5}

Expand All @@ -27,9 +27,8 @@ defmodule Readability.Candidate.Scoring do

defp calc_content_score(html_tree) do
score = 1
inner_text = html_tree |> Floki.text()
split_score = inner_text |> String.split(",") |> length
length_score = [String.length(inner_text) / 100, 3] |> Enum.min()
split_score = Queries.count_character(html_tree, ",") + 1
length_score = min(Queries.text_length(html_tree) / 100, 3)
score + split_score + length_score
end

Expand Down Expand Up @@ -58,27 +57,23 @@ defmodule Readability.Candidate.Scoring do
end

def calc_link_density(html_tree) do
link_length =
html_tree
|> Floki.find("a")
|> Floki.text()
|> String.length()

text_length =
html_tree
|> Floki.text()
|> String.length()
text_length = Queries.text_length(html_tree)

if text_length == 0 do
0
else
link_length =
html_tree
|> Queries.find_tag("a")
|> Queries.text_length()

link_length / text_length
end
end

defp calc_children_content_score({_, _, children_tree}) do
children_tree
|> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
|> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1)))
|> calc_content_score
end

Expand All @@ -88,7 +83,7 @@ defmodule Readability.Candidate.Scoring do
|> Enum.filter(&is_tuple(&1))
|> Enum.map(&elem(&1, 2))
|> List.flatten()
|> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
|> Enum.filter(&(is_tuple(&1) && Readability.CandidateFinder.candidate_tag?(&1)))
|> calc_content_score

score / 2
Expand Down
12 changes: 10 additions & 2 deletions lib/readability/candidate_finder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ defmodule Readability.CandidateFinder do

alias Readability.Candidate
alias Readability.Candidate.Scoring
alias Readability.Helper
alias Readability.Queries

@type html_tree :: tuple | list
@type options :: list
Expand Down Expand Up @@ -53,14 +53,22 @@ defmodule Readability.CandidateFinder do
|> Enum.max_by(fn candidate -> candidate.score end)
end

@doc """
Check `html_tree` can be candidate or not.
"""
@spec candidate_tag?(html_tree) :: boolean
def candidate_tag?({tag, _, _} = html_tree) do
(tag == "p" || tag == "td") && Queries.text_length(html_tree) >= 25
end

defp candidate?(_, depth \\ 0)
defp candidate?(_, depth) when depth > 2, do: false
defp candidate?([h | t], depth), do: candidate?(h, depth) || candidate?(t, depth)
defp candidate?([], _), do: false
defp candidate?(text, _) when is_binary(text), do: false

defp candidate?({_, _, inner_tree} = html_tree, depth) do
if Helper.candidate_tag?(html_tree) do
if candidate_tag?(html_tree) do
true
else
candidate?(inner_tree, depth + 1)
Expand Down
19 changes: 0 additions & 19 deletions lib/readability/helper.ex
Original file line number Diff line number Diff line change
Expand Up @@ -80,25 +80,6 @@ defmodule Readability.Helper do
end
end

@doc """
Count only text length.
"""
@spec text_length(html_tree) :: number
def text_length(html_tree) do
html_tree |> Floki.text() |> String.trim() |> String.length()
end

@doc """
Check `html_tree` can be candidate or not.
"""
@spec candidate_tag?(html_tree) :: boolean
def candidate_tag?({tag, _, _} = html_tree) do
Enum.any?(["p", "td"], fn candidate_tag ->
tag == candidate_tag &&
text_length(html_tree) >= Readability.default_options()[:min_text_length]
end)
end

@doc """
Normalizes and parses to HTML tree (tuple or list)) from binary HTML.
"""
Expand Down
87 changes: 87 additions & 0 deletions lib/readability/queries.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
defmodule Readability.Queries do
@moduledoc """
Highly-optimized utilities for quick answers about HTML tree
"""

@type html_tree :: tuple | list
@type options :: list

def cache_stats_in_attributes(html_tree) do
Floki.traverse_and_update(html_tree, fn
{tag, attrs, nodes} ->
attrs =
Keyword.put_new_lazy(attrs, :text_length, fn -> text_length({tag, attrs, nodes}) end)

attrs =
Keyword.put_new_lazy(attrs, :commas, fn -> count_character({tag, attrs, nodes}, ",") end)

{tag, attrs, nodes}

other ->
other
end)
end

def clear_stats_from_attributes(html_tree) do
Floki.traverse_and_update(html_tree, fn
{tag, attrs, nodes} ->
{tag, Keyword.drop(attrs, [:text_length, :commas]), nodes}

other ->
other
end)
end

@doc """
Count only text length.
"""
@spec text_length(html_tree) :: number
def text_length(html_tree)
def text_length(text) when is_binary(text), do: String.length(text)
def text_length(nodes) when is_list(nodes), do: Enum.reduce(nodes, 0, &(&2 + text_length(&1)))
def text_length({:comment, _}), do: 0
def text_length({"br", _, _}), do: 1

def text_length({_tag, attrs, nodes}) do
# we precompute that value
Keyword.get_lazy(attrs, :text_length, fn -> text_length(nodes) end)
end

@doc """
Finds number of occurences of a given character, much faster than converting to text
"""
@spec count_character(html_tree, binary) :: number
def count_character(<<v::utf8, rest::binary>>, <<v::utf8>> = char) do
1 + count_character(rest, char)
end

def count_character(<<_::utf8, rest::binary>>, char) do
count_character(rest, char)
end

def count_character(nodes, char) when is_list(nodes) do
Enum.reduce(nodes, 0, &(&2 + count_character(&1, char)))
end

def count_character({_tag, attrs, nodes}, ",") do
Keyword.get_lazy(attrs, :commas, fn -> count_character(nodes, ",") end)
end

def count_character({_tag, _attrs, nodes}, char), do: count_character(nodes, char)
def count_character(_node, _char), do: 0

@doc """
Finds given tags in HTML tree, much faster than using generic selector
"""
@spec find_tag(html_tree, binary) :: list
def find_tag(html_tree, tag), do: html_tree |> find_tag_internal(tag) |> List.flatten()

def find_tag_internal(nodes, tag) when is_list(nodes),
do: Enum.map(nodes, &find_tag_internal(&1, tag))

def find_tag_internal({tag, _, children} = node, tag),
do: [node | find_tag_internal(children, tag)]

def find_tag_internal({_, _, children}, tag), do: find_tag_internal(children, tag)
def find_tag_internal(_, _), do: []
end
17 changes: 9 additions & 8 deletions lib/readability/sanitizer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ defmodule Readability.Sanitizer do
alias Readability.Candidate
alias Readability.Candidate.Scoring
alias Readability.Helper
alias Readability.Queries

@type html_tree :: tuple | list

Expand Down Expand Up @@ -45,23 +46,23 @@ defmodule Readability.Sanitizer do
weight + same_tree.score < 0 ->
true

length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
Queries.count_character(tree, ",") < 10 ->
# If there are not very many commas, and the number of
# non-paragraph elements is more than paragraphs or other
# ominous signs, remove the element.
p_len = tree |> Floki.find("p") |> length
img_len = tree |> Floki.find("img") |> length
li_len = tree |> Floki.find("li") |> length
input_len = tree |> Floki.find("input") |> length
p_len = tree |> Queries.find_tag("p") |> length
img_len = tree |> Queries.find_tag("img") |> length
li_len = tree |> Queries.find_tag("li") |> length
input_len = tree |> Queries.find_tag("input") |> length

embed_len =
tree
|> Floki.find("embed")
|> Queries.find_tag("embed")
|> Enum.reject(&(&1 =~ Readability.regexes(:video)))
|> length

link_density = Scoring.calc_link_density(tree)
conent_len = Helper.text_length(tree)
conent_len = Queries.text_length(tree)

# too many image
# more <li>s than <p>s
Expand Down Expand Up @@ -93,6 +94,6 @@ defmodule Readability.Sanitizer do
end

defp clean_empty_p?({tag, _, _} = html_tree) do
tag == "p" && Helper.text_length(html_tree) == 0
tag == "p" && Queries.text_length(html_tree) == 0
end
end
9 changes: 7 additions & 2 deletions mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ defmodule Readability.Mixfile do
coveralls: :test,
"coveralls.detail": :test,
"coveralls.post": :test,
"coveralls.html": :test
"coveralls.html": :test,
"test.watch": :test
],
package: package(),
deps: deps(),
Expand All @@ -29,14 +30,18 @@ defmodule Readability.Mixfile do
end

defp deps do
# https://github.com/lpil/mix-test.watch/pull/140#issuecomment-1853912030
test_watch_runtime = match?(["test.watch" | _], System.argv())

[
{:floki, "~> 0.24"},
{:httpoison, "~> 1.8 or ~> 2.0"},
{:ex_doc, "~> 0.29", only: :dev},
{:credo, "~> 1.6", only: [:dev, :test]},
{:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false},
{:mock, "~> 0.3", only: :test},
{:excoveralls, "~> 0.18", only: :test}
{:excoveralls, "~> 0.18", only: :test},
{:mix_test_watch, "~> 1.0", only: [:dev, :test], runtime: test_watch_runtime}
]
end

Expand Down
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"meck": {:hex, :meck, "0.9.2", "85ccbab053f1db86c7ca240e9fc718170ee5bda03810a6292b5306bf31bae5f5", [:rebar3], [], "hexpm", "81344f561357dc40a8344afa53767c32669153355b626ea9fcbc8da6b3045826"},
"metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
"mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
"mix_test_watch": {:hex, :mix_test_watch, "1.1.1", "eee6fc570d77ad6851c7bc08de420a47fd1e449ef5ccfa6a77ef68b72e7e51ad", [:mix], [{:file_system, "~> 0.2.1 or ~> 0.3", [hex: :file_system, repo: "hexpm", optional: false]}], "hexpm", "f82262b54dee533467021723892e15c3267349849f1f737526523ecba4e6baae"},
"mock": {:hex, :mock, "0.3.8", "7046a306b71db2488ef54395eeb74df0a7f335a7caca4a3d3875d1fc81c884dd", [:mix], [{:meck, "~> 0.9.2", [hex: :meck, repo: "hexpm", optional: false]}], "hexpm", "7fa82364c97617d79bb7d15571193fc0c4fe5afd0c932cef09426b3ee6fe2022"},
"nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"},
"parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
Expand Down
4 changes: 0 additions & 4 deletions test/readability/helper_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,6 @@ defmodule Readability.HelperTest do
assert result == expected
end

test "inner text length" do
assert Helper.text_length(@html_tree) == 5
end

test "strips out special case tags" do
html =
"<html><body><p>Hello <? echo esc_html( wired_get_the_byline_name( $related_video ) ); ?></p></body></html>"
Expand Down
Loading