-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchromium.py
43 lines (38 loc) · 1.62 KB
/
chromium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import ChromeType
# Set WDM_LOG_LEVEL to '0' in this process's environment at import time.
# NOTE(review): presumably this silences webdriver_manager's log output --
# confirm against the installed webdriver_manager version.
os.environ['WDM_LOG_LEVEL'] = '0'
def get_soup(siteaddress, *, timeout_in_seconds=1200, class_name='row'):
    """Load a page in headless Chromium and return its text content.

    The page is opened with Selenium, the function waits until an element
    with the given class name is present, and the resulting page source is
    parsed with BeautifulSoup's ``html.parser`` and reduced to plain text
    via ``get_text()``.

    Args:
        siteaddress: URL passed to ``browser.get``.
        timeout_in_seconds: Maximum time handed to ``WebDriverWait`` while
            waiting for the element to appear. Defaults to 1200.
        class_name: Class name whose presence signals that the page is
            ready to be parsed. Defaults to ``'row'``.

    Returns:
        The page text as a string on success. On failure one of two error
        strings is returned instead of raising:
        ``"Error: driver not found"`` when no usable chromedriver is
        available, or ``"Error: Did not find class_name '<class_name>'...
        could not parse webpage"`` when a ``TimeoutException`` occurs.

    Only ``TimeoutException`` is handled while the page is loaded; any
    other exception raised there propagates to the caller after the
    browser has been shut down.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    try:
        service = Service(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())
    except ValueError:
        print('Error downloading latest driver, trying the local driver now...')
        local_driver = '/usr/lib/chromium-browser/chromedriver'
        # The sibling FileNotFoundError handler does not cover code running
        # inside this handler, so verify the fallback path explicitly.
        # NOTE(review): Service() is assumed not to validate the path itself.
        if not os.path.isfile(local_driver):
            return "Error: driver not found"
        service = Service(local_driver)
    except FileNotFoundError:
        return "Error: driver not found"

    browser = webdriver.Chrome(options=options, service=service)
    try:
        browser.get(siteaddress)
        WebDriverWait(browser, timeout_in_seconds).until(
            ec.presence_of_element_located((By.CLASS_NAME, class_name))
        )
        # Parse the rendered HTML and keep only its text.
        page_text = BeautifulSoup(browser.page_source, 'html.parser').get_text()
    except TimeoutException:
        page_text = f"Error: Did not find class_name '{class_name}'...could not parse webpage"
    finally:
        # Always shut the browser down, even when loading or parsing fails.
        browser.quit()
    return page_text