-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add published_at to the extracted attributes. (#58)
- Loading branch information
1 parent
654a94a
commit e9a80fc
Showing
5 changed files
with
143 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
defmodule Readability.PublishedAtFinder do
  @moduledoc """
  Extracts the publication date of an article from a parsed HTML tree.

  Candidate values are collected, in order, from:

    * the `content` attribute of `article:published_time` and
      `article:published` meta tags,
    * the `datetime` attribute of `time` elements,
    * the `data-datetime` attribute of any element carrying it.

  The first candidate that parses as an ISO 8601 datetime or date is returned.
  """

  @type html_tree :: tuple | list

  # `{css_selector, attribute_name}` pairs, in lookup priority order.
  @sources [
    {"meta[property='article:published_time'], meta[property='article:published']", "content"},
    {"time", "datetime"},
    {"[data-datetime]", "data-datetime"}
  ]

  @doc """
  Returns the first parseable publication date found in `html_tree`.

  The result is a `DateTime` when the candidate is accepted by
  `DateTime.from_iso8601/1`, a `Date` when it is accepted by
  `Date.from_iso8601/1`, and `nil` when no candidate can be parsed.
  """
  @spec find(html_tree) :: DateTime.t() | Date.t() | nil
  def find(html_tree) do
    # Parsing happens inside the search: a present-but-unparseable value
    # (for example an empty `content` attribute, which is truthy) must not
    # stop later candidates or later sources from being tried.
    Enum.find_value(@sources, fn {selector, attribute} ->
      html_tree
      |> Floki.attribute(selector, attribute)
      |> Enum.find_value(&parse/1)
    end)
  end

  # Trims the raw attribute value, then tries datetime first and date second.
  defp parse(value) do
    trimmed = String.trim(value)

    parse(:datetime, trimmed) || parse(:date, trimmed)
  end

  defp parse(:datetime, value) do
    case DateTime.from_iso8601(value) do
      {:ok, datetime, _offset} -> datetime
      _ -> nil
    end
  end

  defp parse(:date, value) do
    case Date.from_iso8601(value) do
      {:ok, date} -> date
      _ -> nil
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
defmodule Readability.Summary do
  @moduledoc false

  # Every field defaults to `nil`, except `authors`, which defaults to an
  # empty list.
  defstruct title: nil, authors: [], article_html: nil, article_text: nil, published_at: nil
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
defmodule Readability.PublishedAtFinderTest do
  use ExUnit.Case, async: true

  alias Readability.PublishedAtFinder

  # The first three fixtures are expected to yield no published date.
  test "extracting bbc format published at" do
    assert published_at("bbc.html") == nil
  end

  test "extracting buzzfeed format published at" do
    assert published_at("buzzfeed.html") == nil
  end

  test "extracting elixir format published at" do
    assert published_at("elixir.html") == nil
  end

  test "extracting medium format published at" do
    assert published_at("medium.html") == ~U[2015-01-31 22:58:05.645Z]
  end

  test "extracting nytimes format published at" do
    assert published_at("nytimes.html") == ~D[2016-03-16]
  end

  # Loads and parses the named fixture, then runs the finder on it.
  defp published_at(fixture_name) do
    fixture_name
    |> TestHelper.read_parse_fixture()
    |> PublishedAtFinder.find()
  end
end