scrape_step_2_extract.py
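
"""Step 2 of the scraper: visit each URL collected earlier and extract listing
details into an Excel sheet.

The selectors below target what appear to be Google Maps place pages (name,
rating, address, booking link, website, phone number). A semaphore caps how
many Playwright pages are open at once.
"""
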
import asyncio
import openpyxl
from bs4 import BeautifulSoup
from playwright.async_api import (
    async_playwright,
    TimeoutError as PlaywrightTimeoutError,
)


async def fetch_page_content(semaphore, browser, url, worksheet_write):
    async with semaphore:
        page = await browser.new_page()
        try:
            # Navigate to the URL and wait for network idle to ensure full load
            await page.goto(url, wait_until="networkidle")
            await asyncio.sleep(5)  # Optional: Adjust based on loading speed
            # Get page content and parse with BeautifulSoup
            content = await page.content()
            soup = BeautifulSoup(content, "html.parser")
            # Initialize a list to hold extracted data
            string_list = []
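            # Assumption: the class names and data-tooltip strings below match
            # Google Maps' current markup ("DUwDvf lfPIob" appears to be the
            # place title, "F7nice" the rating block); they are brittle and
            # will need updating whenever the site's frontend changes.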
            # Define selectors for elements to extract
            selectors = [
                ("DUwDvf lfPIob", "class"),
                ("F7nice", "class"),
                ("Copy address", "data-tooltip"),
                ("Open booking link", "data-tooltip"),
                ("Open website", "data-tooltip"),
                ("Copy phone number", "data-tooltip"),
            ]
            # Extract data based on selectors
            for selector, selector_type in selectors:
                element = (
                    soup.find(attrs={selector_type: selector})
                    if selector_type != "class"
                    else soup.find(class_=selector)
                )
                if element:
                    if selector_type == "data-tooltip" and element.has_attr("href"):
                        href = element["href"]
                        string_list.append(href)
                        print(f"Extracted href: {href}")
                    else:
                        text = element.get_text(strip=True)
                        string_list.append(text)
                        print(f"Extracted text: {text}")
                else:
                    string_list.append("Not Available")
                    print(f"Element with {selector_type} = '{selector}' not found.")
            # Write the row to Excel; openpyxl is synchronous and appends run
            # on the single event-loop thread, so rows from concurrent tasks
            # cannot interleave
            worksheet_write.append(string_list)
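            # Note: tasks finish in arbitrary order, so output rows will not
            # match the input URL order; appending the URL to string_list first
            # is one way to keep rows identifiable.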
        except PlaywrightTimeoutError:
            print(f"Timeout error while loading {url}")
            worksheet_write.append(["Timeout error"])
        except Exception as e:
            print(f"Error loading {url}: {e}")
            worksheet_write.append([f"Error: {e}"])
        finally:
            await page.close()


async def main(
    input_file_path=r"fx.xlsx",
    output_file_path=r"fxout.xlsx",
):
    # Load the workbook and worksheet for reading URLs
    workbook_read = openpyxl.load_workbook(input_file_path)
    worksheet_read = workbook_read.active
    # Extract website URLs from the first column of the Excel file
    urls = [
        row[0]
        for row in worksheet_read.iter_rows(min_row=1, max_col=1, values_only=True)
        if row[0] and isinstance(row[0], str) and row[0].startswith("http")
    ]
    workbook_read.close()
    # Load the workbook and worksheet for saving data
    workbook_write = openpyxl.load_workbook(output_file_path)
    worksheet_write = workbook_write.active
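    # Assumption: the output workbook already exists on disk (load_workbook
    # raises FileNotFoundError otherwise). To start from a blank sheet instead,
    # a minimal alternative is:
    #   workbook_write = openpyxl.Workbook()
    #   worksheet_write = workbook_write.active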
    # Use the `async_playwright` context manager to manage Playwright resources
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        # Control concurrency level: at most 3 pages are open at once
        semaphore = asyncio.Semaphore(3)
        tasks = [
            fetch_page_content(semaphore, browser, url, worksheet_write) for url in urls
        ]
        await asyncio.gather(*tasks)
        await browser.close()
    # Save the workbook after all tasks are complete
    try:
        workbook_write.save(output_file_path)
        print("Data saved successfully to Excel.")
    except Exception as e:
        print(f"Error saving workbook: {e}")
    workbook_write.close()


if __name__ == "__main__":
    asyncio.run(main())
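    # Example invocation with explicit (hypothetical) paths:
    #   asyncio.run(main(input_file_path="urls.xlsx", output_file_path="details.xlsx"))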