# -*- coding: utf-8 -*-
import shutil
import os
import time
import requests
import bs4
import re
# requests emits "InsecureRequestWarning: Unverified HTTPS request is being
# made" when verify=False; adding certificate verification is strongly advised.
# urllib3.disable_warnings()  # TODO: verify the connection with certifi instead
def ptt_request(url):
    """GET a PTT page with the over18 cookie set, skipping the age gate.

    Old approach (from the 大數學堂 tutorial), which felt a bit verbose:
        res = requests.get(url)
        if "警告︰您即將進入之看板內容需滿十八歲方可瀏覽。" in res.text:
            payload = {"from": url[18:], "yes": "yes"}
            rs = requests.session()
            res = rs.post("https://www.ptt.cc/ask/over18", verify=False, data=payload)
            res = rs.get(url, verify=False)
    """
    return requests.get(url=url, cookies={'over18': '1'}, verify=True)
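# Usage sketch (the board name is only an example): with the over18 cookie a
# single GET reaches age-gated boards without the /ask/over18 round-trip above.
# res = ptt_request("https://www.ptt.cc/bbs/Gossiping/index.html")
# print(res.status_code)  # 200 when the request (and cookie) succeeded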
def webptt_title_crawler(board_name, title_keywords, search_depth=100):
    """Search article titles backwards from the newest index page.

    :param board_name: PTT board name, e.g. "Gossiping"
    :param title_keywords: tuple of keywords; a title matches only when it
        contains every keyword (AND semantics)
    :param search_depth: how many index pages to search backwards
    :return: list of matching article URLs, or None on a request error
    """
    idx = get_latest_page_index(board_name)
    current_depth = 0
    urls = []
    if not idx:
        print("no index Error")
        return None
    while current_depth < search_depth:
        try:
            url = "https://www.ptt.cc/bbs/{b_name}/index{index}.html".format(b_name=board_name, index=idx)
            res = ptt_request(url)
        except requests.RequestException:
            print('Request error')
            return None
        soup = bs4.BeautifulSoup(res.text, 'lxml')
        for container in soup.select(".r-ent"):
            for title in container.select(".title"):
                # AND match: every keyword must appear in the title text.
                if all(keyword in title.text for keyword in title_keywords):
                    for tag in title.find_all('a', href=True):
                        urls.append("https://www.ptt.cc" + tag["href"])
        current_depth += 1
        idx -= 1
    print("{} results in {} pages...".format(len(urls), current_depth))
    print(urls)
    return urls
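# Example (hypothetical keywords): collect links whose titles contain both
# "[交易]" and "iPhone" within the last 50 index pages of a board.
# deal_urls = webptt_title_crawler("MobileComm", ("[交易]", "iPhone"), search_depth=50)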
"""
for push_tag in container.select(".hl"):
try:
if push_tag.text == '爆':
push_count = 101
else:
push_count = int(push_tag.text)
except ValueError:
push_count = -1
if push_count >= push_l_bound and title_key_words:
for tag in container.find_all('a', href=True):
urls.append("https://www.ptt.cc/" + tag["href"])
while latest_page_index > 0 and len(urls) < eassy_bound:
latest_page_index -= 1
url = "https://www.ptt.cc/bbs/{b_name}/index{index}.html".format(b_name=board_name, index=latest_page_index)
res = ptt_request(url)
soup = bs4.BeautifulSoup(res.text, 'lxml')
for container in soup.select(".r-ent"):
push_count = 0
for push_tag in container.select(".hl"):
try:
if push_tag.text == '爆':
push_count = 101
else:
push_count = int(push_tag.text)
except ValueError: #XX判斷 待補
push_count = -1
if push_count >= push_l_bound:
for tag in container.find_all('a', href=True):
urls.append("https://www.ptt.cc/" + tag["href"])
print("#urls:{}".format(len(urls)))
return urls
"""
def price_extrater(urls, price_patten=r"""交易價格]:.*\d*[.,]*\d{3,}"""):
    """Scan each article for a trade-price line; return (price, title, url) tuples."""
    price_patten = re.compile(price_patten)
    digits_patten = re.compile(r'\d+')
    reports = []  # (price, title, url)
    for url in urls:
        try:
            res = ptt_request(url)
        except requests.RequestException:
            print("Invalid url!: {}".format(url))
            continue  # don't reuse the previous response after a failed request
        time.sleep(0.1)
        soup = bs4.BeautifulSoup(res.text, 'lxml')
        match = price_patten.search(soup.prettify())
        if match:
            digits = digits_patten.findall(match.group())
            reports.append(("".join(digits), soup.title.text, url))
    return reports
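# Example (assumes deal_urls from the title crawler sketch above): pull the
# listed prices from matched trade articles as (price, title, url) triples.
# reports = price_extrater(deal_urls)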
def webptt_push_crwaler(board_name, eassy_bound=1, push_l_bound=0):
    """Collect article URLs with at least push_l_bound pushes, walking
    backwards from the newest index page until eassy_bound URLs are found
    (or the board runs out of pages)."""
    latest_page_index = get_latest_page_index(board_name)
    if not latest_page_index:
        print("no index Error")
        return None
    try:
        url = "https://www.ptt.cc/bbs/{b_name}/index{index}.html".format(b_name=board_name, index=latest_page_index)
        res = ptt_request(url)
    except requests.RequestException:
        print('Request error')
        return None
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    urls = []
    for container in soup.select(".r-ent"):
        push_count = 0
        for push_tag in container.select(".hl"):
            try:
                if push_tag.text == '爆':
                    push_count = 101  # '爆' marks 100+ pushes; rank above any number
                else:
                    push_count = int(push_tag.text)
            except ValueError:
                push_count = -1  # 'X' boo marks; rank below any bound
        if push_count >= push_l_bound:
            for tag in container.find_all('a', href=True):
                urls.append("https://www.ptt.cc" + tag["href"])
    while latest_page_index > 0 and len(urls) < eassy_bound:
        latest_page_index -= 1
        url = "https://www.ptt.cc/bbs/{b_name}/index{index}.html".format(b_name=board_name, index=latest_page_index)
        res = ptt_request(url)
        soup = bs4.BeautifulSoup(res.text, 'lxml')
        for container in soup.select(".r-ent"):
            push_count = 0
            for push_tag in container.select(".hl"):
                try:
                    if push_tag.text == '爆':
                        push_count = 101
                    else:
                        push_count = int(push_tag.text)
                except ValueError:  # TODO: handle 'X' boo counts properly
                    push_count = -1
            if push_count >= push_l_bound:
                for tag in container.find_all('a', href=True):
                    urls.append("https://www.ptt.cc" + tag["href"])
    print("#urls:{}".format(len(urls)))
    return urls
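# Example (illustrative bounds): gather at least 30 article URLs with 20+
# pushes, walking back through older index pages as needed.
# hot_urls = webptt_push_crwaler("Beauty", eassy_bound=30, push_l_bound=20)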
def get_latest_page_index(board_name):
    """Read the '‹ 上頁' (previous page) link to derive the newest index number."""
    url = "https://www.ptt.cc/bbs/{}/index.html".format(board_name)
    res = ptt_request(url)
    if res:
        soup = bs4.BeautifulSoup(res.text, 'lxml')
        for i in soup.select(".btn-group"):
            for tag in i.find_all('a', href=True):
                if tag.text == "‹ 上頁":
                    # '.../index3899.html' -> 3899; the live page is one past it.
                    latest_page = 1 + int(tag['href'].split('/')[-1].split('.')[0][5:])
                    return latest_page
    else:
        print("index request for {} failed".format(board_name))
    return None
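# Example: if the newest index page's '‹ 上頁' button links to index3899.html,
# this returns 3900, the number of the live (unnumbered) index page.
# print(get_latest_page_index("Gossiping"))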
def photo_crawler(url, direction='photo'):
    """Download every image linked from a PTT article into direction/<title>/."""
    try:
        res = ptt_request(url)
    except requests.RequestException:
        print("Invalid url!")
        return None
    time.sleep(0.1)
    photo_prefixs = ("jpg", "png", "bmp", "gif")  # extensions treated as images
    invalid_chars = "*/\\[]:;|=,.?<> "  # characters stripped from folder names
    imgs = []
    soup = bs4.BeautifulSoup(res.text, 'lxml')
    for img in soup.find_all('a', href=True):
        if img['href'].split('.')[-1] in photo_prefixs:
            imgs.append(img['href'])
    if imgs:
        prefix = None
        for i in soup.select('.article-metaline'):
            if "標題" in i.text:
                prefix = i.text[2:]  # drop the '標題' label, keep the title text
                title = i.text
                print(i.text)
        if prefix is None:  # no title metaline found; fall back to the article id
            prefix = url.split('/')[-1]
            title = prefix
        for ch in invalid_chars:
            prefix = prefix.replace(ch, '')
        for index, img_url in enumerate(imgs):
            try:
                img_res = requests.get(img_url, stream=True)
            except requests.RequestException:
                print('request fail, skip this img. {}'.format(prefix))
                continue
            img_name = '{}/{}/{:03d}.{}'.format(direction, prefix, index, img_url.split('.')[-1])
            try:
                os.makedirs('{}/{}'.format(direction, prefix), exist_ok=True)
                with open(img_name, 'wb') as fw:
                    shutil.copyfileobj(img_res.raw, fw)
            except Exception as e:
                print('ERROR!!')
                print(img_name, url)
                print(str(e))
        if os.path.exists('{}/{}'.format(direction, prefix)):
            info_file = "{}/{}/info.txt".format(direction, prefix)
            try:
                with open(info_file, 'w', encoding='utf8') as fw:
                    fw.write(url + '\n')
                    fw.write(title)
            except OSError:
                print('WriteFile Error', info_file)
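# Example (hypothetical article URL shape): download every linked image from
# one article into downloads/<sanitized title>/, plus an info.txt record.
# photo_crawler("https://www.ptt.cc/bbs/Beauty/M.0000000000.A.000.html", direction="downloads")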
def report2csv(report, path='./report.csv'):
    """Write (price, title, url) tuples as CSV lines, cheapest price first."""
    with open(path, 'w', encoding='utf8') as fw:
        # Sort numerically; a plain string sort would put "9999" after "10000".
        for pair in sorted(report, key=lambda x: int(x[0])):
            line = ",".join(pair)  # note: commas inside titles add extra columns
            print(line)
            fw.write(line + '\n')
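# Example (assumes reports from the price_extrater sketch above): persist the
# extracted prices sorted from cheapest to priciest.
# report2csv(reports, path="./iphone_prices.csv")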
def crawler(board, push_bound=10, eassy_bound=10):
    """Download photos from a board's recent articles with enough pushes."""
    # Pass the bounds by keyword: webptt_push_crwaler takes (eassy_bound,
    # push_l_bound), so positional (push_bound, eassy_bound) would swap them.
    urls = webptt_push_crwaler(board, eassy_bound=eassy_bound, push_l_bound=push_bound)
    if not urls:
        return
    for url in urls:
        photo_crawler(url, direction='{}_{}_{}'.format(board, eassy_bound, push_bound))
# photo_crawler('https://www.ptt.cc/bbs/Gossiping/M.1478099361.A.AF8.html', direction='test')
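# Minimal end-to-end sketch, guarded so importing the module has no side
# effects; the board name and bounds below are placeholders, not fixed choices.
if __name__ == '__main__':
    crawler('Beauty', push_bound=10, eassy_bound=10)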