Merge pull request #248 from dpguthrie/add-consent-handling

Add consent handling
dpguthrie · Dec 16, 2023 · 57a73fc · 57a73fc
2 parents ff568c8 + 399284b
commit 57a73fc
Show file tree

Hide file tree

Showing 8 changed files with 127 additions and 62 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,23 @@
 Change Log
 ==========
 
+2.3.7
+-----
+## Add
+- Logic for setting up a session when a consent screen is encountered.  This is primarily seen in European countries
+  and should allow for the continued use of this package.
+- Keyword argument, `setup_url`, to the base `_YahooFinance` class that allows a user to override the url used in setting up the session.  By default,
+  the Yahoo Finance home page is used (https://finance.yahoo.com).  You can also create an environment variable, `YF_SETUP_URL`, that will be used if set.
+  Example usage:
+  ```python
+  import yahooquery as yq
+
+  t = yq.Ticker('aapl', setup_url='https://finance.yahoo.com/quote/AAPL')
+  ```
+
+## Remove
+- Webdriver manager is no longer used internally.  Selenium Manager is now fully included with selenium `4.10.0`, so the `webdriver-manager` package is no longer needed.
+
 2.3.6
 -----
 ## Fix
@@ -110,7 +127,7 @@ Change Log
   to adjust the timezone (:code:`adj_timezone`) to the ticker's timezone. It defaults
   to :code:`True`.
 - Further documentation of acceptable keyword arguments to the :code:`Ticker` class.
-- :code:`Ticker.news` is now a method.  It accepts two arguments:  :code:`count` - 
+- :code:`Ticker.news` is now a method.  It accepts two arguments:  :code:`count` -
   number of items to return; :code:`start` - start date to begin retrieving news items from
 - Bug fixes:  :code:`Ticker.history` method no longer returns extra rows when retrieving
   intraday data.
@@ -131,12 +148,12 @@ Change Log
   :code:`p_valuation_measures` and supply either :code:`a`, :code:`q`, or
   :code:`m` (annual, quarterly, monthly).  The data returned with these can
   be seen in the `Statistics` tab through the Yahoo Finance front-end.
-  
+
 .. image:: demo/valuation_measures.PNG
 
 2.2.2
 -----
-- Fix bug in retrieving cash flow / income statement data.  Most recent month was 
+- Fix bug in retrieving cash flow / income statement data.  Most recent month was
   combining with TTM. A new column was created in the dataframe called 'periodType'.
   Annual data will be shown as '12M', quarterly data will be shown as '3M', and
   trailing 12 month data will be shown as 'TTM'.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "yahooquery"
-version = "2.3.6"
+version = "2.3.7"
 description = "Python wrapper for an unofficial Yahoo Finance API"
 authors = ["Doug Guthrie <douglas.p.guthrie@gmail.com>"]
 documentation = "https://yahooquery.dpguthrie.com"
@@ -16,7 +16,7 @@ requests-futures = "^1.0.1"
 tqdm = "^4.65.0"
 lxml = "^4.9.3"
 selenium = {version = "^4.10.0", optional = true}
-webdriver-manager = {version = "^3.8.6", optional = true}
+beautifulsoup4 = "^4.12.2"
 
 [tool.poetry.dev-dependencies]
 pytest = "^7.4.0"
@@ -34,4 +34,4 @@ requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.extras]
-premium = ["selenium", "webdriver-manager"]
+premium = ["selenium"]
diff --git a/yahooquery/__init__.py b/yahooquery/__init__.py
@@ -1,15 +1,15 @@
 """Python interface to unofficial Yahoo Finance API endpoints"""
 
 name = "yahooquery"
-__version__ = "2.3.6"
+__version__ = "2.3.7"
 
-from .research import Research  # noqa
-from .ticker import Ticker  # noqa
-from .screener import Screener  # noqa
 from .misc import (  # noqa
     get_currencies,
     get_exchanges,
     get_market_summary,
     get_trending,
     search,
 )
+from .research import Research  # noqa
+from .screener import Screener  # noqa
+from .ticker import Ticker  # noqa
diff --git a/yahooquery/base.py b/yahooquery/base.py
@@ -1,4 +1,5 @@
 # stdlib
+import logging
 import os
 import time
 from concurrent.futures import as_completed
@@ -26,6 +27,9 @@
     import urlparse as parse
 
 
+logger = logging.getLogger(__name__)
+
+
 class _YahooFinance(object):
     CHUNK = 1500
 
@@ -938,11 +942,12 @@ def __init__(self, **kwargs):
         self.progress = kwargs.pop("progress", False)
         self.username = kwargs.pop("username", os.getenv("YF_USERNAME", None))
         self.password = kwargs.pop("password", os.getenv("YF_PASSWORD", None))
+        self._setup_url = kwargs.pop("setup_url", os.getenv("YF_SETUP_URL", None))
         self.session = initialize_session(kwargs.pop("session", None), **kwargs)
         if self.username and self.password:
             self.login()
         else:
-            self.session = setup_session(self.session)
+            self.session = setup_session(self.session, self._setup_url)
         self.crumb = get_crumb(self.session)
 
     @property
@@ -991,13 +996,27 @@ def default_query_params(self):
             params["crumb"] = self.crumb
         return params
 
-    def login(self):
+    def login(self) -> None:
         if _has_selenium:
             instance = YahooFinanceHeadless(self.username, self.password)
             instance.login()
-            self.session.cookies = instance.cookies
+            if instance.cookies:
+                self.session.cookies = instance.cookies
+                return
 
-        return []
+            else:
+                logger.warning(
+                    "Unable to login and/or retrieve the appropriate cookies.  This is "
+                    "most likely due to Yahoo Finance instituting recaptcha, which "
+                    "this package does not support."
+                )
+
+        else:
+            logger.warning(
+                "You do not have the required libraries to use this feature.  Install "
+                "with the following: `pip install yahooquery[premium]`"
+            )
+        self.session = setup_session(self.session, self._setup_url)
 
     def _chunk_symbols(self, key, params={}, chunk=None, **kwargs):
         current_symbols = self.symbols

diff --git a/yahooquery/headless.py b/yahooquery/headless.py
@@ -7,13 +7,11 @@
 try:
     # third party
     from selenium import webdriver
-    from selenium.common.exceptions import NoSuchElementException, TimeoutException
-    from selenium.webdriver.chrome.options import Options
-    from selenium.webdriver.chrome.service import Service as ChromeService
+    from selenium.common.exceptions import TimeoutException
+    from selenium.webdriver.chrome.service import Service
     from selenium.webdriver.common.by import By
     from selenium.webdriver.support import expected_conditions as EC
     from selenium.webdriver.support.ui import WebDriverWait
-    from webdriver_manager.chrome import ChromeDriverManager
 except ImportError:
     # Selenium was not installed
     _has_selenium = False
@@ -28,16 +26,14 @@ def __init__(self, username: str, password: str):
         self.username = username
         self.password = password
         self.cookies = RequestsCookieJar()
-        chrome_options = Options()
+        chrome_options = webdriver.ChromeOptions()
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--log-level=3")
         chrome_options.add_argument("--ignore-certificate-errors")
         chrome_options.add_argument("--ignore-ssl-errors")
-        self.driver = webdriver.Chrome(
-            service=ChromeService(ChromeDriverManager().install()),
-            options=chrome_options,
-        )
+        service = Service()
+        self.driver = webdriver.Chrome(service=service, options=chrome_options)
 
     def login(self):
         try:

diff --git a/yahooquery/misc.py b/yahooquery/misc.py
@@ -1,3 +1,6 @@
+# stdlib
+import os
+
 # third party
 import pandas as pd
 
@@ -20,8 +23,9 @@ def _make_request(
                     country, ", ".join(sorted(COUNTRIES.keys()))
                 )
             )
+    setup_url = kwargs.pop("setup_url", os.getenv("YF_SETUP_URL", None))
     session = initialize_session(**kwargs)
-    session = setup_session(session)
+    session = setup_session(session, setup_url)
     crumb = get_crumb(session)
     if crumb is not None:
         params["crumb"] = crumb

diff --git a/yahooquery/utils/__init__.py b/yahooquery/utils/__init__.py
@@ -7,6 +7,7 @@
 # third party
 import pandas as pd
 import requests
+from bs4 import BeautifulSoup
 from requests.adapters import HTTPAdapter
 from requests.exceptions import ConnectionError, RetryError, SSLError
 from requests.packages.urllib3.util.retry import Retry
@@ -17,7 +18,7 @@
 
 
 DEFAULT_TIMEOUT = 5
-
+DEFAULT_SESSION_URL = "https://finance.yahoo.com"
 CRUMB_FAILURE = (
     "Failed to obtain crumb.  Ability to retrieve data will be significantly limited."
 )
@@ -1366,8 +1367,8 @@ def initialize_session(session=None, **kwargs):
     return session
 
 
-def setup_session(session: requests.Session):
-    url = "https://finance.yahoo.com"
+def setup_session(session: requests.Session, url: str = None):
+    url = url or DEFAULT_SESSION_URL
     try:
         response = session.get(url, allow_redirects=True)
     except SSLError:
@@ -1380,10 +1381,39 @@ def setup_session(session: requests.Session):
             except SSLError:
                 counter += 1
 
-    if not isinstance(session, FuturesSession):
-        return session
+    if isinstance(session, FuturesSession):
+        response = response.result()
+
+    # check for and handle consent page
+    if response.url.find("consent") >= 0:
+        logger.debug(f'Redirected to consent page: "{response.url}"')
+
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        params = {}
+        for param in ["csrfToken", "sessionId"]:
+            try:
+                params[param] = soup.find("input", attrs={"name": param})["value"]
+            except Exception as exc:
+                logger.critical(
+                    f'Failed to find or extract "{param}" from response. Exception={exc}'
+                )
+                return session
+
+        logger.debug(f"params: {params}")
+
+        response = session.post(
+            "https://consent.yahoo.com/v2/collectConsent",
+            data={
+                "agree": ["agree", "agree"],
+                "consentUUID": "default",
+                "sessionId": params["sessionId"],
+                "csrfToken": params["csrfToken"],
+                "originalDoneUrl": url,
+                "namespace": "yahoo",
+            },
+        )
 
-    _ = response.result()
     return session