-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
103 lines (87 loc) · 2.96 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import requests
import urllib
import http.cookiejar as cookiejar
from bs4 import BeautifulSoup as Beaut
from fake_useragent import UserAgent
from Auction import Auction
from Item import Item
import time
import json
def deco_head():
print(''' ###########################################
###########################################
######## Scraping nHentai.net #########
###########################################
###########################################
V0.1''')
def new_agent():
ua = UserAgent()
return str(ua.random)
def load_html(url, with_cookies=False, headers={}):
"""Attempts to load an HTML page, returning a BeautifulSoup instance. Raises
any networking or parsing exceptions"""
if with_cookies:
cj = cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
else:
opener = urllib.request.build_opener()
request = urllib.request.Request(url, headers=headers)
x = True
count = 1
while x:
try:
response = opener.open(request)
x = False
break
except (urllib.error.HTTPError, ConnectionResetError):
print("404 in load html")
count += 1
if count > 5:
return None
time.sleep(count*count)
html = response.read().decode('utf-8', errors='replace')
the_soup = Beaut(html, 'html.parser')
return the_soup
def directory_maker(dir_name):
print("'")
cwd = os.getcwd()
new_directory = os.path.join(cwd, 'auctions')
if not os.path.exists(new_directory): # check if it exists already
os.mkdir(new_directory)
new_directory = os.path.join(new_directory, str(dir_name))
if not os.path.exists(new_directory):
os.mkdir(new_directory)
return new_directory
def find_auction_links(soup):
# grab the main table containing the auction list
try:
container = soup.find('table', id='auction-list')
except AttributeError:
return False
print("we found the auction list!")
try:
container = container.find_all("div", class_="panel-title text-nowrap")
except AttributeError:
return False
print("we've got the list of panel divs containing tags")
print("There are : " + str(len(container)) + " of them.")
return container
def main():
deco_head()
url = 'https://mckenzieauction.hibid.com/'
auctions_page = url + 'auctions'
# start the soup
headers = {'User-Agent': new_agent()}
soup = load_html(auctions_page, with_cookies=True, headers=headers)
auction_links = find_auction_links(soup)
auction_list = []
for taggroup in auction_links:
link = taggroup.find("a")
name = link.text
link = url+link['href']
current_auction = Auction(name, len(auction_list)+1, link)
auction_list.append(current_auction)
print(auction_list)
if __name__ == "__main__":
main()