
Commit

parallel concurrency scraping
sushil-rgb committed May 1, 2023
1 parent 69af028 commit e4e69ec
Showing 5 changed files with 163 additions and 130 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -21,8 +21,9 @@ the bot in a direct message, and it will respond with product information.


# AmazonMe
Welcome to the AmazonMe scraper, which scrapes the Amazon product database and saves it into an Excel database. This repository contains the code for a web scraper that can extract information from the Amazon website. The scraper uses the Python Playwright library to automate the process of browsing and extracting data from the website.
Welcome to the AmazonMe scraper, which scrapes the Amazon product database and saves it into an Excel database. This repository contains the code for a web scraper that can extract information from the Amazon website. The scraper uses the Requests and BeautifulSoup libraries to automate the scraping process and asyncio concurrency to extract thousands of records from the website.
To get started, you will need to have Python and the necessary requirements installed on your machine.
**The bot currently scrapes 20,000 records in under 2 minutes.**
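
As a rough illustration of that approach, here is a minimal, self-contained sketch (not this repository's code) of Requests and BeautifulSoup calls fanned out with asyncio; it assumes `asyncio.to_thread` for the blocking HTTP calls and uses placeholder URLs:

```python
import asyncio
import requests
from bs4 import BeautifulSoup

async def fetch_title(url: str) -> str:
    # Requests is blocking, so run each call in a worker thread.
    response = await asyncio.to_thread(requests.get, url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.title.string if soup.title else ""

async def main(urls: list[str]) -> list[str]:
    # Fan all requests out concurrently and gather the results.
    return await asyncio.gather(*(fetch_title(u) for u in urls))

if __name__ == "__main__":
    pages = ["https://example.com", "https://example.org"]  # placeholders
    print(asyncio.run(main(pages)))
```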

## Install virtual environment:
It's always a good practice to install a virtual environment before installing necessary requirements:
55 changes: 34 additions & 21 deletions functionalities/tools.py
@@ -1,26 +1,10 @@
import pandas as pd
import random
import yaml
import re
import os


def randomMe(my_lists, seed=None):
"""
Returns a random item from a list. The function ensures that the random values it generates are truly
random and not predictable:
Args:
-my_lists: A list of items to select a random item from
-seed: (Optional) An integer that can be used to seed the random number generator. Default is None.
Returns:
-A random item from the input list.
"""
random.seed(seed, version=2)
random.shuffle(my_lists)
return random.choice(my_lists)


async def verify_amazon(url):
"""
Verifies if the input URL is a valid Amazon URL.
@@ -38,6 +22,35 @@ async def verify_amazon(url):
pass


async def export_to_sheet(dicts, name):
"""
Exports a list of dictionaries to an Excel file with the specified name and saves it to a directory called 'Amazon database':
Args:
-dicts (List[Dict]): A list of dictionaries to export to an Excel file.
-name (str): The name to use for the Excel file (without the file extension).
Returns:
-None
"""
directory_name = 'Amazon database'
await create_path(directory_name)

df = pd.DataFrame(dicts)
df.to_excel(f"""{os.getcwd()}//{directory_name}//{name}-Amazon database.xlsx""", index = False)
print(f"{name} saved.")


def random_values(d_lists):
"""
Returns a random value from a list.
Args:
-d_lists: A list of items to select a random value from.
Returns:
-A random item from the input list.
"""
idx = random.randint(0, len(d_lists) - 1)
return d_lists[idx]


async def create_path(dir_name):
"""
Creates a directory with the specified name if it doesn't already exist.
@@ -55,7 +68,7 @@ async def create_path(dir_name):
os.mkdir(path_dir)


def randomTime(val):
async def randomTime(val):
"""
Generates a random time interval between requests to avoid overloading the server. Scrape responsibly.
@@ -65,8 +78,8 @@ def randomTime(val):
Returns:
-A random integer between 2 and the input value. So, the default time interval is 2 seconds.
"""
ranges = [i for i in range(2, val+1)]
return randomMe(ranges)
ranges = [i for i in range(3, val+1)]
return random_values(ranges)
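
# Hypothetical usage sketch (not part of this commit): randomTime is now a
# coroutine, so the caller awaits it and then sleeps for the returned value, e.g.
#
#   delay = await randomTime(10)   # random integer between 3 and 10
#   await asyncio.sleep(delay)     # requires an asyncio import
#
# which spaces requests out so the scraper does not hammer the server.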


def userAgents():
@@ -81,7 +94,7 @@ def userAgents():
"""
with open('functionalities//user-agents.txt') as f:
agents = f.read().split("\n")
return randomMe(agents)
return random_values(agents)


def yaml_load(selectors):
47 changes: 30 additions & 17 deletions main.py
@@ -1,30 +1,43 @@
from functionalities.tools import randomTime, verify_amazon, export_to_sheet
from scrapers.scraper import Amazon
import pandas as pd
import asyncio
import time


if __name__ == '__main__':


# Define an async main function that runs the web scraper:
if __name__ == '__main__':
# Start the timer to measure how long the web scraping process takes
start_time = time.time()


async def main():
# Instantiate an Amazon object:
# You can decrease the time interval; however, I discourage doing so, as it may overload the server and Amazon may block your IP address
sleep = 20
base_url = "https://www.amazon.com/s?i=specialty-aps&bbn=16225019011&rh=n%3A7141123011%2Cn%3A16225019011%2Cn%3A1040658&ref=nav_em__nav_desktop_sa_intl_clothing_0_2_13_2"
amazon = Amazon()

# Define the URL to scrape:
userInput = "https://www.amazon.com/s?k=health+and+beauty&i=beauty-intl-ship&bbn=16225006011&rh=n%3A11062741&dc&ds=v1%3AaTUGn90NLjQvoihGF3%2FqZ1jr%2FIFcsvhBnS3xK%2FaJ3u0&crid=2036DM6EKNYNA&pd_rd_r=fa4603d4-0acc-4de5-a94e-3f047374ec2e&pd_rd_w=LUiIR&pd_rd_wg=yiJls&pf_rd_p=c9097eb6-837b-4ba7-94d7-51428f6e8d2a&pf_rd_r=6W2WTX74X54Y6G5DMXQQ&qid=1682875043&rnid=16225006011&sprefix=health+and+beauty%2Cbeauty-intl-ship%2C173&ref=sr_nr_n_6"
if await verify_amazon(base_url):
return "Invalid link. Please try proper amazon link product category of your choice."

# Split the pagination into URLs
split_links = await amazon.split_url(userInput)
# Pull the number of pages of the category
number_pages = await amazon.num_of_pages(base_url)
print(f"Total pages || {number_pages}.")

await asyncio.sleep(sleep)

searches = await amazon.search_results(base_url)
print(f"Scraping category || {searches}.")

# Split the pagination and convert it into a list of URLs
url_lists = await amazon.split_url(base_url)

print(f"Initiating the Extraction.")
coroutines = [amazon.scrape_and_save(sleep, url) for url in url_lists]
dfs = await asyncio.gather(*coroutines)
results = pd.concat(dfs)

await export_to_sheet(results, searches)

# Define the time interval between scraping requests
time_interval = 3
datas = await amazon.amazonMe(time_interval, split_links)
return datas


# Start the timer to measure how long the web scraping process takes
start_time = time.time()

# Run the async main function to start the scraper:
print(asyncio.run(main()))
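
scrapers/scraper.py and the Amazon class are not included in the hunks above, so the following is only a guess at the shape main() expects from `scrape_and_save`: given the sleep ceiling and one paginated URL, wait a random interval, fetch and parse the page, and return a DataFrame for `pd.concat`. The selector and field names are hypothetical.

```python
# Hedged sketch only; the real implementation lives in scrapers/scraper.py.
import asyncio
import pandas as pd
import requests
from bs4 import BeautifulSoup

from functionalities.tools import randomTime, userAgents

async def scrape_and_save(sleep: int, url: str) -> pd.DataFrame:
    # Wait a random 3..sleep seconds so pages are not requested in a burst.
    await asyncio.sleep(await randomTime(sleep))
    # Move the blocking request to a thread and rotate the User-Agent header.
    response = await asyncio.to_thread(
        requests.get, url, headers={"User-Agent": userAgents()}, timeout=15
    )
    soup = BeautifulSoup(response.text, "html.parser")
    # A guessed selector: product titles usually sit inside <h2> tags.
    titles = [h2.get_text(strip=True) for h2 in soup.select("h2")]
    return pd.DataFrame({"Product": titles})
```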

