
Commit

parallel concurrency scraping
sushil-rgb committed May 1, 2023
1 parent 69af028 commit e4e69ec
Showing 5 changed files with 163 additions and 130 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -21,8 +21,9 @@ the bot in a direct message, and it will respond with product information.


# AmazonMe
Welcome to the AmazonMe scraper, which scrapes the Amazon product database and saves it into an Excel database. This repository contains the code for a web scraper that can extract information from the Amazon website. The scraper uses the Python Playwright library to automate the process of browsing and extracting data from the website.
Welcome to the AmazonMe scraper, which scrapes the Amazon product database and saves it into an Excel database. This repository contains the code for a web scraper that can extract information from the Amazon website. The scraper uses the Requests and BeautifulSoup libraries to automate the scraping process and asyncio concurrency to extract thousands of records from the website.
To get started, you will need to have Python and the necessary requirements installed on your machine.
**The bot currently scrapes 20,000 records in under 2 minutes.**
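
As a rough illustration of that approach, here is a minimal, self-contained sketch (not this repository's code) of Requests and BeautifulSoup calls fanned out with asyncio; it assumes `asyncio.to_thread` for the blocking HTTP calls and uses placeholder URLs:

```python
import asyncio
import requests
from bs4 import BeautifulSoup

async def fetch_title(url: str) -> str:
    # Requests is blocking, so run each call in a worker thread.
    response = await asyncio.to_thread(requests.get, url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.title.string if soup.title else ""

async def main(urls: list[str]) -> list[str]:
    # Fan all requests out concurrently and gather the results.
    return await asyncio.gather(*(fetch_title(u) for u in urls))

if __name__ == "__main__":
    pages = ["https://example.com", "https://example.org"]  # placeholders
    print(asyncio.run(main(pages)))
```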

## Install virtual environment:
It's always a good practice to install a virtual environment before installing necessary requirements:
55 changes: 34 additions & 21 deletions functionalities/tools.py
@@ -1,26 +1,10 @@
import pandas as pd
import random
import yaml
import re
import os


def randomMe(my_lists, seed=None):
"""
Returns a random item from a list. The function ensures that the random values it generates are truly
random and not predictable:
Args:
-my_lists: A list of items to select a random item from
-seed: (Optional) An integer that can be used to seed the random number generator. Default is None.
Returns:
-A random item from the input list.
"""
random.seed(seed, version=2)
random.shuffle(my_lists)
return random.choice(my_lists)


async def verify_amazon(url):
"""
Verifies if the input URL is a valid Amazon URL.
@@ -38,6 +22,35 @@ async def verify_amazon(url):
pass


async def export_to_sheet(dicts, name):
"""
Exports a list of dictionaries to an Excel file with the specified name and saves it to a directory called 'Amazon database':
Args:
-dicts (List[Dict]): A list of dictionaries to export to an Excel file.
-name (str): The name to use for the Excel file (without the file extension).
Returns:
-None
"""
directory_name = 'Amazon database'
await create_path(directory_name)

df = pd.DataFrame(dicts)
df.to_excel(f"""{os.getcwd()}//{directory_name}//{name}-Amazon database.xlsx""", index = False)
print(f"{name} saved.")


def random_values(d_lists):
"""
Returns a random value from a list.
Args:
-d_lists: A list of items to select a random value from.
Returns:
-A random item from the input list.
"""
idx = random.randint(0, len(d_lists) - 1)
return d_lists[idx]


async def create_path(dir_name):
"""
Creates a directory with the specified name if it doesn't already exist.
@@ -55,7 +68,7 @@ async def create_path(dir_name):
os.mkdir(path_dir)


def randomTime(val):
async def randomTime(val):
"""
Generates a random time interval between requests to avoid overloading the server. Scrape responsibly.
@@ -65,8 +78,8 @@ def randomTime(val):
Returns:
-A random integer between 2 and the input value. So, the default time interval is 2 seconds.
"""
ranges = [i for i in range(2, val+1)]
return randomMe(ranges)
ranges = [i for i in range(3, val+1)]
return random_values(ranges)
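
# Hypothetical usage sketch (not part of this commit): randomTime is now a
# coroutine, so the caller awaits it and then sleeps for the returned value, e.g.
#
#   delay = await randomTime(10)   # random integer between 3 and 10
#   await asyncio.sleep(delay)     # requires an asyncio import
#
# which spaces requests out so the scraper does not hammer the server.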


def userAgents():
@@ -81,7 +94,7 @@ def userAgents():
"""
with open('functionalities//user-agents.txt') as f:
agents = f.read().split("\n")
return randomMe(agents)
return random_values(agents)


def yaml_load(selectors):
47 changes: 30 additions & 17 deletions main.py
@@ -1,30 +1,43 @@
from functionalities.tools import randomTime, verify_amazon, export_to_sheet
from scrapers.scraper import Amazon
import pandas as pd
import asyncio
import time


if __name__ == '__main__':


# Define an async main function that runs the web scraper:
if __name__ == '__main__':
# Start the timer to measure how long the web scraping process takes
start_time = time.time()


async def main():
# Instantiate an Amazon object:
# You can decrease the time interval; however, I discourage doing so, as it may overload the server and Amazon may block your IP address
sleep = 20
base_url = "https://www.amazon.com/s?i=specialty-aps&bbn=16225019011&rh=n%3A7141123011%2Cn%3A16225019011%2Cn%3A1040658&ref=nav_em__nav_desktop_sa_intl_clothing_0_2_13_2"
amazon = Amazon()

# Define the URL to scrape:
userInput = "https://www.amazon.com/s?k=health+and+beauty&i=beauty-intl-ship&bbn=16225006011&rh=n%3A11062741&dc&ds=v1%3AaTUGn90NLjQvoihGF3%2FqZ1jr%2FIFcsvhBnS3xK%2FaJ3u0&crid=2036DM6EKNYNA&pd_rd_r=fa4603d4-0acc-4de5-a94e-3f047374ec2e&pd_rd_w=LUiIR&pd_rd_wg=yiJls&pf_rd_p=c9097eb6-837b-4ba7-94d7-51428f6e8d2a&pf_rd_r=6W2WTX74X54Y6G5DMXQQ&qid=1682875043&rnid=16225006011&sprefix=health+and+beauty%2Cbeauty-intl-ship%2C173&ref=sr_nr_n_6"
if await verify_amazon(base_url):
return "Invalid link. Please try proper amazon link product category of your choice."

# Split the pagination into URLs
split_links = await amazon.split_url(userInput)
# Pull the number of pages of the category
number_pages = await amazon.num_of_pages(base_url)
print(f"Total pages || {number_pages}.")

await asyncio.sleep(sleep)

searches = await amazon.search_results(base_url)
print(f"Scraping category || {searches}.")

# Split the pagination and convert it into a list of URLs
url_lists = await amazon.split_url(base_url)

print(f"Initiating the Extraction.")
coroutines = [amazon.scrape_and_save(sleep, url) for url in url_lists]
dfs = await asyncio.gather(*coroutines)
results = pd.concat(dfs)

await export_to_sheet(results, searches)

# Define the time interval between scraping requests
time_interval = 3
datas = await amazon.amazonMe(time_interval, split_links)
return datas


# Start the timer to measure how long the web scraping process takes
start_time = time.time()

# Run the async main function to start the scraper:
print(asyncio.run(main()))
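
scrapers/scraper.py and the Amazon class are not included in the hunks above, so the following is only a guess at the shape main() expects from `scrape_and_save`: given the sleep ceiling and one paginated URL, wait a random interval, fetch and parse the page, and return a DataFrame for `pd.concat`. The selector and field names are hypothetical.

```python
# Hedged sketch only; the real implementation lives in scrapers/scraper.py.
import asyncio
import pandas as pd
import requests
from bs4 import BeautifulSoup

from functionalities.tools import randomTime, userAgents

async def scrape_and_save(sleep: int, url: str) -> pd.DataFrame:
    # Wait a random 3..sleep seconds so pages are not requested in a burst.
    await asyncio.sleep(await randomTime(sleep))
    # Move the blocking request to a thread and rotate the User-Agent header.
    response = await asyncio.to_thread(
        requests.get, url, headers={"User-Agent": userAgents()}, timeout=15
    )
    soup = BeautifulSoup(response.text, "html.parser")
    # A guessed selector: product titles usually sit inside <h2> tags.
    titles = [h2.get_text(strip=True) for h2 in soup.select("h2")]
    return pd.DataFrame({"Product": titles})
```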

