From 9ece7c327c435e4f377cb8a2e17d28cdff211cd8 Mon Sep 17 00:00:00 2001 From: liamwhite Date: Mon, 4 Mar 2024 10:57:37 -0500 Subject: [PATCH] switch to maintained twitter scraper implementation (#187) --- lib/philomena/scrapers/twitter.ex | 75 +++++-------------------------- 1 file changed, 12 insertions(+), 63 deletions(-) diff --git a/lib/philomena/scrapers/twitter.ex b/lib/philomena/scrapers/twitter.ex index a3585d150..10b17c133 100644 --- a/lib/philomena/scrapers/twitter.ex +++ b/lib/philomena/scrapers/twitter.ex @@ -1,8 +1,5 @@ defmodule Philomena.Scrapers.Twitter do @url_regex ~r|\Ahttps?://(?:mobile\.)?twitter.com/([A-Za-z\d_]+)/status/([\d]+)/?| - @script_regex ~r|="(https://abs.twimg.com/responsive-web/client-web(?:-legacy)?/main\.[\da-z]+\.js)"| - @bearer_regex ~r|"(AAAAAAAAAAAAA[^"]*)"| - @activate_url "https://api.twitter.com/1.1/guest/activate.json" @spec can_handle?(URI.t(), String.t()) :: true | false def can_handle?(_uri, url) do @@ -10,69 +7,21 @@ defmodule Philomena.Scrapers.Twitter do end def scrape(_uri, url) do - api_response!(url) - |> extract_data() - end - - defp extract_data(tweet) do - images = - tweet["entities"]["media"] - |> Enum.map( - &%{ - url: &1["media_url_https"] <> "?format=jpg&name=4096x4096", - camo_url: Camo.Image.image_url(&1["media_url_https"]) - } - ) - - %{ - source_url: tweet["url"], - author_name: tweet["user"], - description: tweet["text"] || tweet["full_text"], - images: images - } - end - - # We'd like to use the API anonymously. In order to do this, we need to - # extract the anonymous bearer token. Fortunately, this is pretty easy - # to identify in the minified mobile script source. - def api_response!(url) do [user, status_id] = Regex.run(@url_regex, url, capture: :all_but_first) - page_url = "https://twitter.com/#{user}/status/#{status_id}" - - api_url = - "https://api.twitter.com/2/timeline/conversation/#{status_id}.json?tweet_mode=extended" - - url = "https://twitter.com/#{user}/status/#{status_id}" + image_url = "https://d.fxtwitter.com/#{user}/status/#{status_id}.jpg" - {gt, bearer} = - Philomena.Http.get(page_url) - |> extract_guest_token_and_bearer() + {:ok, %Tesla.Env{status: 200}} = Philomena.Http.head(image_url) - {:ok, api_resp} = - Philomena.Http.get(api_url, [{"Authorization", "Bearer #{bearer}"}, {"x-guest-token", gt}]) - - api_resp - |> Map.get(:body) - |> Jason.decode!() - |> Map.get("globalObjects") - |> Map.get("tweets") - |> Map.get(status_id) - |> Map.put("user", user) - |> Map.put("url", url) - end - - defp extract_guest_token_and_bearer({:ok, %Tesla.Env{body: page}}) do - [script | _] = Regex.run(@script_regex, page, capture: :all_but_first) - {:ok, %{body: body}} = Philomena.Http.get(script) - - [bearer] = Regex.run(@bearer_regex, body, capture: :all_but_first) - - {:ok, %{body: body}} = - Philomena.Http.post(@activate_url, nil, [{"Authorization", "Bearer #{bearer}"}]) - - gt = Map.fetch!(Jason.decode!(body), "guest_token") - - {gt, bearer} + %{ + source_url: "https://twitter.com/#{user}/status/#{status_id}", + author_name: user, + images: [ + %{ + url: image_url, + camo_url: Camo.Image.image_url(image_url), + }, + ] + } end end