-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
87 lines (67 loc) · 3.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# HTTP headers passed as ``headers=`` to the ``requests.get`` calls below:
# a desktop Firefox User-Agent plus an English Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) "
        "Gecko/20100101 Firefox/109.0"
    ),
    "Accept-Language": "en-US, en;q=0.5",
}
# Root used to turn the site-relative hrefs found in search results into
# absolute URLs that can be fetched later.
_SEARCH_BASE_URL = "https://www.amazon.in"


def _clean_text(element):
    """Return the stripped text of a parsed element, or "" when it is None."""
    return element.text.strip() if element is not None else ""


def _parse_search_result(product_element):
    """Build the url/name/price/rating/reviews dict for one result element."""
    link = product_element.find("a", class_="a-link-normal")
    # ``.get`` instead of ``link["href"]``: an anchor without an href must not
    # abort the whole scrape with a KeyError.
    href = (link.get("href") or "").strip() if link is not None else ""
    return {
        # An already-absolute href passes through urljoin unchanged.
        "url": urljoin(_SEARCH_BASE_URL, href) if href else "",
        "name": _clean_text(product_element.find(
            "span", class_="a-size-medium a-color-base a-text-normal")),
        "price": _clean_text(product_element.find("span", class_="a-offscreen")),
        "rating": _clean_text(product_element.find("span", class_="a-icon-alt")),
        "reviews": _clean_text(product_element.find(
            "div", class_="a-section a-text-normal")),
    }


def scrape_amazon_products(pages=20, timeout=10):
    """Scrape the "bags" search results and return the products found.

    Args:
        pages: Number of result pages to request, starting at page 1.
            Zero or a negative number requests nothing.
        timeout: Seconds to wait for each HTTP response before giving up
            on that page.

    Returns:
        A list of dicts with the string keys ``url``, ``name``, ``price``,
        ``rating`` and ``reviews``. Only results with a non-empty url, name
        and price are kept. ``url`` is absolute.

    A page that cannot be fetched (network error, timeout, HTTP error status)
    is reported on stdout and skipped; products from other pages are kept.
    """
    products = []
    for page in range(1, pages + 1):
        search_url = (
            f"https://www.amazon.in/s?k=bags&page={page}"
            "&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1"
        )
        try:
            response = requests.get(search_url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Best effort: one failing page should not discard the others.
            print(f"Error: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        for product_element in soup.find_all("div", class_="s-result-item"):
            product = _parse_search_result(product_element)
            if product["url"] and product["name"] and product["price"]:
                products.append(product)
    return products
def scrape_product_details(product_url, timeout=10):
    """Fetch a product page and extract extra fields from it.

    Args:
        product_url: Absolute or site-relative URL of the product page.
            Relative URLs are resolved against ``https://www.amazon.in``.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with the keys ``description``, ``asin`` and ``manufacturer``;
        each value is the stripped text of the first matching element, or ""
        when the page has no such element. Returns ``{}`` when
        ``product_url`` is empty or the request fails (the error is printed).
    """
    if not product_url:
        # urljoin would resolve an empty URL to the site root; fetch nothing.
        return {}

    # An absolute URL passes through urljoin unchanged; a relative one would
    # otherwise make requests raise MissingSchema.
    absolute_url = urljoin("https://www.amazon.in", product_url)
    try:
        response = requests.get(absolute_url, headers=HEADERS, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # RequestException covers MissingSchema as well as timeouts and
        # connection errors, so one bad product cannot abort the caller's loop.
        print(f"Error: {e}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    # field name -> (tag, attribute filter) of the element holding its text.
    # NOTE(review): whether the "asin" selector really yields the ASIN cannot
    # be confirmed from this file - verify against a live product page.
    selectors = {
        "description": ("div", {"id": "productDescription"}),
        "asin": ("td", {"class": "a-size-medium a-color-secondary"}),
        "manufacturer": ("a", {"id": "bylineInfo"}),
    }
    details = {}
    for field, (tag, attrs) in selectors.items():
        element = soup.find(tag, attrs)
        details[field] = element.text.strip() if element is not None else ""
    return details
def save_to_csv(products, filename="products.csv"):
    """Enrich each product with its detail-page fields and write all to CSV.

    Args:
        products: Iterable of product dicts that each have a ``url`` key.
            Each dict is updated in place with whatever
            ``scrape_product_details`` returns for its URL.
        filename: Path of the CSV file to create or overwrite.

    The file starts with a header row. Fields a product lacks are written
    as empty cells.
    """
    fieldnames = [
        "url", "name", "price", "rating", "reviews",
        "description", "asin", "manufacturer",
    ]
    # Explicit UTF-8: scraped text may contain non-ASCII characters (currency
    # symbols, for instance) that the platform default encoding cannot write.
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        for product in products:
            product.update(scrape_product_details(product["url"]))
            writer.writerow(product)
# Script body: collect products from 20 search-result pages, then hand them
# to save_to_csv for enrichment and export.
# NOTE: there is no ``__main__`` guard, so this also runs when the module is
# imported.
products = scrape_amazon_products(pages=20)
save_to_csv(products)