diff --git a/README.md b/README.md
index 43b50e0..7179fcf 100644
--- a/README.md
+++ b/README.md
@@ -21,8 +21,9 @@ the bot in a direct message, and it will repsond with product information.
 
 # AmazonMe
 
-Welcome to the AmazonMe scraper that scrape Amazon product database and save it into excel database. This repository contains the code for a web scraper that can extract information from the Amazon website. The scraper uses the Python Playwright library to automate the process of browsing and extracting data from the website.
+Welcome to the AmazonMe scraper, which scrapes the Amazon product database and saves it into an Excel file. This repository contains the code for a web scraper that can extract information from the Amazon website. The scraper uses the Requests and BeautifulSoup libraries to automate the scraping process and uses asyncio concurrency to extract thousands of records from the website. To get started, you will need to have Python and the necessary requirements installed on your machine.
+**The bot currently scrapes 20,000 records in under 2 minutes.**
 
 ## Install virtual environment:
 It's always a good practice to install a virtual environment before installing necessary requirements:
 
diff --git a/functionalities/tools.py b/functionalities/tools.py
index 49835e9..71a5cf4 100644
--- a/functionalities/tools.py
+++ b/functionalities/tools.py
@@ -1,26 +1,10 @@
+import pandas as pd
 import random
 import yaml
 import re
 import os
 
 
-def randomMe(my_lists, seed=None):
-    """
-    Returns a random item from a list. The function is to ensure the random values generated by function are truly
-    random and not predictable:
-
-    Args:
-        -my_lists: A list of items to select a random item from
-        -seed: (Optional) An interger than be used to seed the random number geneartor. Default is None.
-
-    Retunrs:
-        -A random item from the input list.
-    """
-    random.seed(seed, version=2)
-    random.shuffle(my_lists)
-    return random.choice(my_lists)
-
-
 async def verify_amazon(url):
     """
     Verifies if the input URL is a vaild Amazon URL.
@@ -38,6 +22,35 @@ async def verify_amazon(url):
         pass
 
 
+async def export_to_sheet(dicts, name):
+    """
+    Exports a list of dictionaries to an Excel file with the specified name and saves it to a directory called 'Amazon database'.
+
+    Args:
+        -dicts (List[Dict]): A list of dictionaries to export to an Excel file.
+        -name (str): The name to use for the Excel file (without the file extension).
+
+    Returns:
+        -None
+    """
+    directory_name = 'Amazon database'
+    await create_path(directory_name)
+
+    df = pd.DataFrame(dicts)
+    df.to_excel(f"""{os.getcwd()}//{directory_name}//{name}-Amazon database.xlsx""", index = False)
+    print(f"{name} saved.")
+
+
+def random_values(d_lists):
+    """
+    Returns a random value from a list.
+
+    Args:
+        -d_lists (list): A list of items to select a random value from.
+    """
+    idx = random.randint(0, len(d_lists) - 1)
+    return d_lists[idx]
+
+
 async def create_path(dir_name):
     """
     Creates a directory with the specified name if i doesn't already exist.
@@ -55,7 +68,7 @@ async def create_path(dir_name):
         os.mkdir(path_dir)
 
 
-def randomTime(val):
+async def randomTime(val):
     """
     Generates a random time interval between requests to avaoid overloading the server. Scrape resonponsibly.
 
@@ -65,8 +78,8 @@ def randomTime(val):
     Returns:
        -A random interger between 2 and the input value. So, the default time interval is 2 seconds.
""" - ranges = [i for i in range(2, val+1)] - return randomMe(ranges) + ranges = [i for i in range(3, val+1)] + return random_values(ranges) def userAgents(): @@ -81,7 +94,7 @@ def userAgents(): """ with open('functionalities//user-agents.txt') as f: agents = f.read().split("\n") - return randomMe(agents) + return random_values(agents) def yaml_load(selectors): diff --git a/main.py b/main.py index 9b032b0..a6dbec6 100644 --- a/main.py +++ b/main.py @@ -1,30 +1,43 @@ +from functionalities.tools import randomTime, verify_amazon, export_to_sheet from scrapers.scraper import Amazon +import pandas as pd import asyncio import time -if __name__ == '__main__': - - - # Define an async main functio that runs the web scraper: +if __name__ == '__main__': + # Start the timer to measure how long the wb scraping process takes + start_time = time.time() + + async def main(): - # Instantiate an Amazon object: + # You can decrease the time-interval, however I discourage you to do say as the action may overload the server and Amazon may block your IP address + sleep = 20 + base_url = "https://www.amazon.com/s?i=specialty-aps&bbn=16225019011&rh=n%3A7141123011%2Cn%3A16225019011%2Cn%3A1040658&ref=nav_em__nav_desktop_sa_intl_clothing_0_2_13_2" amazon = Amazon() - # Define the URL to scrape: - userInput = "https://www.amazon.com/s?k=health+and+beauty&i=beauty-intl-ship&bbn=16225006011&rh=n%3A11062741&dc&ds=v1%3AaTUGn90NLjQvoihGF3%2FqZ1jr%2FIFcsvhBnS3xK%2FaJ3u0&crid=2036DM6EKNYNA&pd_rd_r=fa4603d4-0acc-4de5-a94e-3f047374ec2e&pd_rd_w=LUiIR&pd_rd_wg=yiJls&pf_rd_p=c9097eb6-837b-4ba7-94d7-51428f6e8d2a&pf_rd_r=6W2WTX74X54Y6G5DMXQQ&qid=1682875043&rnid=16225006011&sprefix=health+and+beauty%2Cbeauty-intl-ship%2C173&ref=sr_nr_n_6" + if await verify_amazon(base_url): + return "Invalid link. Please try proper amazon link product category of your choice." - # Split the pagination into URLs - split_links = await amazon.split_url(userInput) + # Pull the number of pages of the category + number_pages = await amazon.num_of_pages(base_url) + print(f"Total pages || {number_pages}.") + + await asyncio.sleep(sleep) + + searches = await amazon.search_results(base_url) + print(f"Scraping category || {searches}.") + + # Split the pagination and convert it list of urls + url_lists = await amazon.split_url(base_url) + + print(f"Initiating the Extraction.") + coroutines = [amazon.scrape_and_save(sleep, url) for url in url_lists] + dfs = await asyncio.gather(*coroutines) + results = pd.concat(dfs) + + await export_to_sheet(results, searches) - # Define the time interval between scraping requests - time_interval = 3 - datas = await amazon.amazonMe(time_interval, split_links) - return datas - - - # Start the timer to measure how long the wb scraping process takes - start_time = time.time() # Run the async main function and run the scraper: print(asyncio.run(main())) diff --git a/scrapers/scraper.py b/scrapers/scraper.py index a3c4ba8..947d0c1 100644 --- a/scrapers/scraper.py +++ b/scrapers/scraper.py @@ -5,7 +5,6 @@ import asyncio import aiohttp import re -import os class Amazon: @@ -39,11 +38,14 @@ async def static_connection(self, url): Raises: aiohttp.ClientError: If an error occurs while making the request. 
-        """
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url, headers = self.headers) as resp:
-                content = await resp.read()
-                return content
+        """
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, headers = self.headers) as resp:
+                    content = await resp.read()
+                    return content
+        except Exception as e:
+            return f"Content loading error: URL |> {url} | Error |> {str(e)}."
 
 
     async def num_of_pages(self, url):
@@ -72,7 +74,7 @@ async def num_of_pages(self, url):
             return 2
 
 
-    async def split_url(self, url):
+    async def split_url(self, url):
         """
         Splits a given Amazon URL into multiple URLs, with each URL pointing to a different page of search results.
 
@@ -83,29 +85,26 @@ async def split_url(self, url):
            -list: A list of URLs, with each URL pointing to a different page of search results.
         """
-        # Create a list to store the split URLs, and add the original URL to it.
-        split_url = [url]
-
-        # Use the 'num_of_pages' method to get the total number of search result pages for the given URL.
-        total_pages = await self.num_of_pages(url)
+        # Create a list to store the split URLs, and add the original URL to it:
+        split_url = [url]
 
-        print(f"Total number of pages || {str(total_pages)}.")
-
-        # Use the 'static_connection' method to make a static connection to the given URL and get its HTMl content.
-        content = await self.static_connection(url)
+        # Use the 'num_of_pages' method to get the total number of search result pages for the given URL:
+        total_pages = await self.num_of_pages(url)
+        # Use the 'static_connection' method to make a static connection to the given URL and get its HTML content:
+        content = await self.static_connection(url)
+
         # Making a soup:
-        soup = BeautifulSoup(content, 'lxml')
+        soup = BeautifulSoup(content, 'lxml')
 
-        # Get the URL of the next button on the search result page and construct the URL of the next search result page.
-        next_link = f"""https://www.amazon.com{await self.catch.attributes(soup.select_one(self.scrape['next_button']), 'href')}"""
+        # Get the URL of the next button on the search result page and construct the URL of the next search result page:
+        next_link = f"""https://www.amazon.com{await self.catch.attributes(soup.select_one(self.scrape['next_button']), 'href')}"""
 
-        # Loop through all the search result pages and construct a URL for each page.
         for num in range(1, total_pages):
-            # Replace the page number in the URL with current page number increment by 1.
-            next_url = re.sub(r'page=\d+', f'page={num+1}' , next_link)
+            # Replace the 'page' number in the URL with the current page number incremented by 1:
+            next_url = re.sub(r'page=\d+', f'page={num+1}' , next_link)
 
-            # Replace the 'sr_pg_' parameter in the URL with the current page number.
+            # Replace the 'sr_pg_' parameter in the URL with the current page number:
             next_url = re.sub(r'sr_pg_\d+', f'sr_pg_{num}', next_url)
 
             split_url.append(next_url)
@@ -135,87 +134,94 @@ async def getASIN(self, url):
             return split_url
 
-
-    async def amazonMe(self, interval, urls):
+
+    async def search_results(self, url):
         """
-        Scrapes data from multiple pages of an Amazon search result for a given interval of time and saves the data into an Excel file.
+        Retrieves the name of the search results category from the given Amazon search page URL.
 
         Args:
-           -interval (int): The time interval between each page request in seconds.
-           -urls (list): Alist of URLs to scrape data from.
-
-        Returns:
-           -str: A message indicating the success of the scraping and saving operation.
+           -url (str): The Amazon search page URL from which to retrieve the category name.
 
         Raises:
-           -Exception: If there is an error loading the content from Amazon or extracting data from the HTML.
+           -AttributeError: If the search results cannot be retrieved from the URL.
         """
-        amazon_dicts = []
-
-        # Verify if the first URL is a valid Amazon link:
-        if await verify_amazon(urls[0]):
-            print("Invalid link. Please try proper amazon link product category of your choice.")
-            return
-
-        # Get base content and soup from first URL:
-        base_content = await self.static_connection(urls[0])
-        base_soup = BeautifulSoup(base_content, 'lxml')
+        content = await self.static_connection(url)
+        soup = BeautifulSoup(content, 'lxml')
 
-        # Get search results from first URL:
         try:
-            search_results = re.sub(r"""["]""", "", base_soup.select_one(self.scrape['searches']).text.strip()).title()
+            search_results = re.sub(r"""["]""", "", soup.select_one(self.scrape['searches']).text.strip()).title()
         except AttributeError:
-            search_results = base_soup.select_one('span.a-list-item').text.strip()
+            search_results = soup.select_one('span.a-list-item').text.strip()
+
+        return search_results.replace(":", "")
+
+
+    async def scrape_data(self, url):
+        """
+        Scrapes product data from the Amazon search results page for the given URL.
+
+        Args:
+           -url (str): The URL of the Amazon search results page to scrape.
+
+        Returns:
+           -list: A list of dictionaries, with each dictionary containing product data for a single product.
+
+        Raises:
+           -Exception: If there is an error while loading the content of the Amazon search results page.
+        """
+        amazon_dicts = []
+
+        # Use the 'static_connection' method to download the HTML content of the search results page
+        content = await self.static_connection(url)
+        soup = BeautifulSoup(content, 'lxml')
 
         # Check if main content element exists on page:
         try:
-            base_soup.select_one(self.scrape['main_content'])
+            soup.select_one(self.scrape['main_content'])
         except Exception as e:
-            return f"Content loading error. Please try again in few minutes. Error message: {e}"
-
-        # Loop through all the URLs and scrape data from each page:
-        for pages in range(len(urls)):
-            # print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------")
-            print(f"Scraping pages || {pages + 1}")
-            print("--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------")
-
-            # Get content and soup from current URL:
-            content = await self.static_connection(urls[pages])
-            soup = BeautifulSoup(content, 'lxml')
-
-            # Wait for random interval before making next request
-            await asyncio.sleep(randomTime(interval))
-
-            # Get product card contents from current page:
-            card_contents = soup.select(self.scrape['main_content'])
-
-            # Loop through all product cards and extract data:
-            for datas in card_contents:
-                prod_hyperlink = f"""https://www.amazon.com{await self.catch.attributes(datas.select_one(self.scrape['hyperlink']), 'href')}"""
-                prod_name = await self.catch.text(datas.select_one(self.scrape['hyperlink']))
-                print(prod_name)
-                data = {
-                    'Product': prod_name,
-                    'ASIN': await self.getASIN(prod_hyperlink),
-                    'Price': await self.catch.text(datas.select_one(self.scrape['price'])),
-                    'Original price': await self.catch.text(datas.select_one(self.scrape['old_price'])),
-                    'Review': await self.catch.text(datas.select_one(self.scrape['review'])),
-                    'Review count': await self.catch.text(datas.select_one(self.scrape['review_count'])),
-                    'Hyperlink': prod_hyperlink,
-                    'Image url': f"""{await self.catch.attributes(datas.select_one(self.scrape['image']), 'src')}""",
-                }
-                amazon_dicts.append(data)
-
-        # Create directory to save Excel file:
-        directory_name = 'Amazon database'
-        await create_path(directory_name)
-
-        # Save data to Excel file:
-        df = pd.DataFrame(amazon_dicts)
-        df.to_excel(f"{os.getcwd()}//Amazon database//{search_results}-Amazon database.xlsx", index=False)
-        print(f"{search_results} is saved.")
-
+            return f"Content loading error. Please try again in few minutes. Error message: {e}"
+
+        # Get product card contents from current page:
+        card_contents = soup.select(self.scrape['main_content'])
+
+        # Loop through all product cards and extract data:
+        for datas in card_contents:
+            prod_hyperlink = f"""https://www.amazon.com{await self.catch.attributes(datas.select_one(self.scrape['hyperlink']), 'href')}"""
+            prod_name = await self.catch.text(datas.select_one(self.scrape['hyperlink']))
+            print(prod_name)
+            data = {
+                'Product': prod_name,
+                'ASIN': await self.getASIN(prod_hyperlink),
+                'Price': await self.catch.text(datas.select_one(self.scrape['price'])),
+                'Original price': await self.catch.text(datas.select_one(self.scrape['old_price'])),
+                'Review': await self.catch.text(datas.select_one(self.scrape['review'])),
+                'Review count': await self.catch.text(datas.select_one(self.scrape['review_count'])),
+                'Hyperlink': prod_hyperlink,
+                'Image url': f"""{await self.catch.attributes(datas.select_one(self.scrape['image']), 'src')}""",
+            }
+            amazon_dicts.append(data)
+
+        return amazon_dicts
+
+
+    async def scrape_and_save(self, interval, url):
+        """
+        Scrapes data from a given URL, saves it to a file, and returns the scraped data as a Pandas DataFrame.
+
+        Args:
+           -interval (int): Time interval in seconds to sleep before scraping the data.
+           -url (str): The URL to scrape data from.
+
+        Returns:
+           -pd.DataFrame: A Pandas DataFrame containing the scraped data.
+
+        Raises:
+           -HTTPError: If the HTTP request to the URL returns an error status code.
+           -Exception: If there is an error while scraping the data.
+        """
+        random_sleep = await randomTime(interval)
+        await asyncio.sleep(random_sleep)
+        datas = await self.scrape_data(url)
+        return pd.DataFrame(datas)
+
+
     async def dataByAsin(self, asin):
         """
         Extracts product information from the Amazon product page by ASIN (Amazon Standard Identification Number).
diff --git a/scrapers/selector.yaml b/scrapers/selector.yaml
index 58f074f..085363f 100644
--- a/scrapers/selector.yaml
+++ b/scrapers/selector.yaml
@@ -1,6 +1,6 @@
 # CSS selectors:
 product_name: "div.a-section.a-spacing-none.a-spacing-top-small.s-title-instructions-style h2 a span"
-searches: "div.a-section.a-spacing-small.a-spacing-top-small span.a-color-state.a-text-bold"
+searches: "a.a-link-normal.s-navigation-item"
 pages: "span.s-pagination-strip span.s-pagination-item.s-pagination-disabled"
 next_button: "a.s-pagination-item.s-pagination-next.s-pagination-button.s-pagination-separator"