Skip to content

Commit

Permalink
Add published_at to the extracted attributes. (#58)
Browse files Browse the repository at this point in the history
  • Loading branch information
philipbrown authored Nov 5, 2024
1 parent 654a94a commit e9a80fc
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 14 deletions.
32 changes: 19 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ summary = Readability.summarize(url)
summary.title
#=> "Why I’m betting on Elixir"

summary.published_at
#=> ~U[2015-02-23 16:53:27.006Z]

summary.authors
#=> ["Ken Mazaika"]

Expand All @@ -62,6 +65,9 @@ summary.article_text
### Extract the title.
Readability.title(html)

### Extract the published date.
Readability.published_at(html)

### Extract authors.
Readability.authors(html)

Expand Down Expand Up @@ -93,11 +99,11 @@ url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
summary = Readability.summarize(url, [clean_conditionally: false])
```

* `:min_text_length` \\\\ 25
* `:remove_unlikely_candidates` \\\\ true
* `:weight_classes` \\\\ true
* `:clean_conditionally` \\\\ true
* `:retry_length` \\\\ 250
- `:min_text_length` \\\\ 25
- `:remove_unlikely_candidates` \\\\ true
- `:weight_classes` \\\\ true
- `:clean_conditionally` \\\\ true
- `:retry_length` \\\\ 250

**You can find other algorithm and regex options in `readability.ex`**

Expand All @@ -109,16 +115,17 @@ To run the test suite:

## Todo

* [x] Extract authors
* [x] More configurable
* [x] Summarize function
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
- [x] Extract authors
- [x] More configurable
- [x] Summarize function
- [ ] Convert relative paths into absolute paths of `img#src` and `a#href`

## Contributions are welcome!

Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones) and features of related projects below

**Contributing**

1. **Fork** the repo on GitHub
2. **Clone** the project to your own machine
3. **Commit** changes to your own branch
Expand All @@ -127,12 +134,11 @@ Check out [the main features milestone](https://github.com/keepcosmos/readabilit

NOTE: Be sure to merge the latest from "upstream" before making a pull request!


## Related and Inspired Projects

* [readability.js](https://github.com/mozilla/readability) is a standalone version of the readability library used for Firefox Reader View.
* [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
* [ruby-readability](https://github.com/cantino/ruby-readability) is a tool for extracting the primary readable content of a webpage.
- [readability.js](https://github.com/mozilla/readability) is a standalone version of the readability library used for Firefox Reader View.
- [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
- [ruby-readability](https://github.com/cantino/ruby-readability) is a tool for extracting the primary readable content of a webpage.

## Copyright and License

Expand Down
23 changes: 23 additions & 0 deletions lib/readability.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ defmodule Readability do
# Extract title
Readability.title(html)
# Extract the published date
Readability.published_at(html)
# Extract authors.
Readability.authors(html)
Expand All @@ -31,6 +34,7 @@ defmodule Readability do
alias Readability.ArticleBuilder
alias Readability.AuthorFinder
alias Readability.Helper
alias Readability.PublishedAtFinder
alias Readability.Summary
alias Readability.TitleFinder

Expand Down Expand Up @@ -91,6 +95,7 @@ defmodule Readability do
%Summary{
title: title(html_tree),
authors: authors(html_tree),
published_at: published_at(html_tree),
article_html: readable_html(article_tree),
article_text: readable_text(article_tree)
}
Expand Down Expand Up @@ -166,6 +171,24 @@ defmodule Readability do
def authors(html) when is_binary(html), do: html |> Floki.parse_document!() |> authors
def authors(html_tree), do: AuthorFinder.find(html_tree)

@doc """
Extract the published date of the page.

Accepts either a raw HTML string or an already parsed HTML tree. A raw string
is parsed with `Floki.parse_document!/1` first, which raises when the document
cannot be parsed.

Returns a `DateTime` or a `Date` when one can be extracted, otherwise `nil`.

## Example

    iex> datetime = Readability.published_at(html_str)
    %DateTime{}

"""
@spec published_at(binary | html_tree) :: %DateTime{} | %Date{} | nil
def published_at(raw_html) when is_binary(raw_html) do
  # Use the bang variant: `Floki.parse_document/1` returns `{:ok, tree}`, and
  # handing that tagged tuple to the finder would not be a valid HTML tree.
  raw_html
  |> Floki.parse_document!()
  |> published_at()
end

def published_at(html_tree), do: PublishedAtFinder.find(html_tree)

@doc """
Using a variety of metrics (content score, classname, element types), find the content that is
most likely to be the stuff a user wants to read.
Expand Down
67 changes: 67 additions & 0 deletions lib/readability/published_at_finder.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
defmodule Readability.PublishedAtFinder do
  @moduledoc """
  Extract the published date of an article from its parsed HTML.

  Candidate values are looked up, in order, in:

    * the `content` attribute of `meta` tags whose `property` is
      `article:published_time` or `article:published`
    * the `datetime` attribute of `time` elements
    * `data-datetime` attributes

  A candidate is accepted when it parses as an ISO 8601 datetime or date.
  """

  @type html_tree :: tuple | list

  # Lookup strategies, tried in this order.
  @strategies [:meta_tag, :time_element, :data_attribute]

  @doc """
  Extract the published date.

  Each strategy contributes its first matching attribute value (trimmed). The
  first strategy whose value parses wins, so a blank or non-ISO 8601 value found
  by an earlier strategy does not stop later strategies from being tried.

  Returns a `DateTime` when the value is a full ISO 8601 datetime with an offset,
  a `Date` when it is an ISO 8601 date, and `nil` when nothing parseable is found.
  """
  @spec find(html_tree) :: DateTime.t() | Date.t() | nil
  def find(html_tree) do
    # Parse inside `find_value`: an unparseable candidate yields `nil`, which
    # lets the search continue with the next strategy instead of giving up.
    Enum.find_value(@strategies, fn strategy ->
      strategy
      |> strategy(html_tree)
      |> parse()
    end)
  end

  # Open Graph style meta tags.
  defp strategy(:meta_tag, html_tree) do
    selector = "meta[property='article:published_time'], meta[property='article:published']"

    first_attribute(html_tree, selector, "content")
  end

  # `<time datetime="...">` elements.
  defp strategy(:time_element, html_tree) do
    first_attribute(html_tree, "time", "datetime")
  end

  # Any element carrying a `data-datetime` attribute.
  defp strategy(:data_attribute, html_tree) do
    first_attribute(html_tree, "[data-datetime]", "data-datetime")
  end

  # Returns the trimmed value of `attribute` on the first element matching
  # `selector` that has it, or `nil` when there is none.
  defp first_attribute(html_tree, selector, attribute) do
    case Floki.attribute(html_tree, selector, attribute) do
      [value | _] -> String.trim(value)
      [] -> nil
    end
  end

  # Tries the more specific datetime format first, then falls back to a date.
  defp parse(nil), do: nil

  defp parse(value) when is_binary(value) do
    parse(:datetime, value) || parse(:date, value)
  end

  defp parse(:datetime, value) do
    case DateTime.from_iso8601(value) do
      {:ok, datetime, _offset} -> datetime
      {:error, _reason} -> nil
    end
  end

  defp parse(:date, value) do
    case Date.from_iso8601(value) do
      {:ok, date} -> date
      {:error, _reason} -> nil
    end
  end
end
2 changes: 1 addition & 1 deletion lib/readability/summary.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
defmodule Readability.Summary do
@moduledoc false
# Plain data struct for an extraction result. Every field defaults to nil
# except `authors`, which defaults to an empty list.
defstruct title: nil, authors: [], article_html: nil, article_text: nil
defstruct title: nil, authors: [], article_html: nil, article_text: nil, published_at: nil
end
33 changes: 33 additions & 0 deletions test/readability/published_at_finder_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
defmodule Readability.PublishedAtFinderTest do
  use ExUnit.Case, async: true

  alias Readability.PublishedAtFinder

  # Loads and parses the named fixture, then runs the finder on the result.
  defp published_at_for(fixture_name) do
    fixture_name
    |> TestHelper.read_parse_fixture()
    |> PublishedAtFinder.find()
  end

  test "extracting bbc format published at" do
    result = published_at_for("bbc.html")

    assert result == nil
  end

  test "extracting buzzfeed format published at" do
    result = published_at_for("buzzfeed.html")

    assert result == nil
  end

  test "extracting elixir format published at" do
    result = published_at_for("elixir.html")

    assert result == nil
  end

  test "extracting medium format published at" do
    result = published_at_for("medium.html")

    assert result == ~U[2015-01-31 22:58:05.645Z]
  end

  test "extracting nytimes format published at" do
    result = published_at_for("nytimes.html")

    assert result == ~D[2016-03-16]
  end
end

0 comments on commit e9a80fc

Please sign in to comment.