-
Notifications
You must be signed in to change notification settings - Fork 0
/
category_page_scraper.py
109 lines (94 loc) · 3.47 KB
/
category_page_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import datetime
import time
from amazon_product import amazon_product
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from db_connector import mongoatlas_connector
# Module-level Firefox WebDriver, created as soon as this module is imported.
# It is shared by main() and get_product_data(), which both navigate with it.
# NOTE(review): the geckodriver path is an absolute path on one developer's
# machine; recent Selenium releases appear to have replaced `executable_path`
# with a Service object -- confirm against the installed Selenium version.
driver = webdriver.Firefox(
executable_path="/home/gio/PycharmProjects/pythonProject/Scraper/geckodriver/geckodriver")
# MongoDB Atlas connection string handed to the connector in main().
# NOTE(review): the user name and password are embedded in source control here;
# they should be rotated and loaded from the environment or a secrets store.
# NOTE(review): "ATALS" looks like a typo for "ATLAS"; the name is kept because
# main() references it.
URL_MONGO_ATALS = "mongodb+srv://singh:K7nhMHqTfUSt7IDN@cluster0.bgnl4.mongodb.net/sconti?retryWrites=true&w=majority"
# Second and third arguments passed to AtlasMongoDb.get_initialize() in main();
# presumably the database and collection names -- verify against db_connector.
DOCUMENT_NAME = "sconti"
COLLECTION_NAME = "amazon_deals_pi"
def get_url(url, page):
    """Build the listing URL for one page.

    Every ``{}`` placeholder in ``url`` is replaced by ``page``. The resulting
    URL is printed as a progress message and returned.
    """
    page_url = url.replace('{}', str(page))
    print(f" Scraping: {page_url}")
    return page_url
def extract_singleCellItem(item, category):
    """Turn one deal tile from a listing page into a product record.

    The product link is read from the tile's ``dealTile`` container, the deal
    expiry is derived from the tile's countdown timer (when present), and the
    product page itself is scraped through ``get_product_data``.

    Args:
        item: Parsed listing element (a BeautifulSoup tag) for a single deal.
        category: Category label forwarded unchanged to ``get_product_data``.

    Returns:
        Whatever ``get_product_data`` returns for the linked product page.

    Raises:
        AttributeError: If the tile has no usable product link, the link is
            longer than 2048 characters, or the product page cannot be
            loaded. Callers use this to skip the item.
    """
    tile = item.find('div', {'class': 'dealTile'})
    link = tile.find('a', 'a-link-normal') if tile is not None else None
    url = link.get('href') if link is not None else None

    # A missing container, anchor or href must surface as AttributeError (the
    # only exception callers handle); len(None) would raise TypeError instead.
    if not url:
        print("URL NOT FOUND")
        raise AttributeError("deal tile has no product link")
    if len(url) > 2048:
        print("URL MAGGIORE DI 2048")
        raise AttributeError("deal URL is longer than 2048 characters")

    now = datetime.datetime.now()
    try:
        timer_text = tile.find('span', {'role': 'timer'}).text
        # get_sec() returns fractional hours, hence timedelta(hours=...).
        expires_at = now + datetime.timedelta(hours=get_sec(timer_text))
    except (AttributeError, ValueError):
        # No timer on the tile, or its text is not in H:M[:S] form: fall back
        # to the current time instead of aborting the whole scrape.
        expires_at = now

    try:
        return get_product_data(url, expires_at, category)
    except AttributeError:
        print("Error LOADING ITEM")
        raise
def main(urlInit, category, pages=range(1, 20)):
    """Scrape deal listing pages and store every product that can be parsed.

    For each page number the listing URL is built with ``get_url``, loaded in
    the shared Selenium ``driver`` and parsed with BeautifulSoup. Each listing
    element is converted by ``extract_singleCellItem``; elements that raise
    ``AttributeError`` are skipped. The records of a page are written through
    the Mongo connector before the next page is requested.

    Args:
        urlInit: Listing URL containing a ``{}`` placeholder for the page
            number.
        category: Category label forwarded unchanged to every record.
        pages: Iterable of page numbers to visit. Defaults to pages 1-19.

    Returns:
        A list with the records collected from all visited pages.
    """
    all_records = []
    for page in pages:
        # NOTE(review): the connector is requested once per page; whether
        # get_initialize() can safely be hoisted out of the loop depends on
        # db_connector, which is not visible here.
        mongodb = mongoatlas_connector.AtlasMongoDb.get_initialize(
            URL_MONGO_ATALS, DOCUMENT_NAME, COLLECTION_NAME)

        driver.get(get_url(urlInit, page))
        driver.maximize_window()
        time.sleep(0.75)  # fixed pause after navigation before reading the page
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        results = soup.find_all('div', {'class': 'singleCell'})
        print(len(results))
        if not results:
            # No deal cells on the page: fall back to search-result markup.
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            print(results)

        page_records = []
        for item in results:
            try:
                record = extract_singleCellItem(item, category)
            except AttributeError:
                # Item could not be parsed or loaded; skip it.
                continue
            print(record)
            page_records.append(record)

        if page_records:
            mongodb.fill_records_formatted_dict(page_records)
            # Accumulate across pages so the return value covers the whole
            # run rather than only the final page.
            all_records.extend(page_records)
        else:
            print("0 ELEMENT FOUND")
        time.sleep(3)
    return all_records
def get_product_data(specificUrl, date, category):
    """Load a product page in the shared driver and build its record.

    The page at ``specificUrl`` is opened, its HTML is parsed with
    BeautifulSoup and handed to ``AmazonProduct`` together with the URL,
    ``date`` and ``category``. The result of ``AmazonProduct.initialize()``
    is returned.

    Raises:
        AttributeError: If ``initialize()`` raises ``AttributeError``; a
            message is printed before the error is raised again.
    """
    driver.get(specificUrl)
    driver.maximize_window()
    time.sleep(0.5)
    page = BeautifulSoup(driver.page_source, 'html.parser')
    parser = amazon_product.AmazonProduct(page, specificUrl, date, category)
    try:
        return parser.initialize()
    except AttributeError:
        print("Error during Initialization")
        raise AttributeError
def runScraping(url_to_scrape, category):
    """Run ``main`` for the given listing URL and category and return its result."""
    return main(url_to_scrape, category)
def get_sec(time_str):
    """Convert an ``H:M`` or ``H:M:S`` string into fractional hours.

    Despite the function name, the value returned is expressed in hours:
    hours + minutes / 60 + seconds / 3600. When the seconds field is absent
    it is treated as zero.

    Raises:
        ValueError: If the string does not split into two or three
            colon-separated integer fields.
    """
    fields = time_str.split(':')
    if len(fields) == 3:
        hours, minutes, seconds = fields
    else:
        hours, minutes = fields
        seconds = "00"
    return int(hours) + int(minutes) / 60 + int(seconds) / 60 / 60