main.py
# Attempt 1: scrape product names, prices, and URLs from Amazon search result pages and store them in a CSV file
from lxml import etree
import requests
import csv
import random
import time
# The commented-out code below rotates proxies and user agents to evade anti-bot detection; it was not used because a simpler approach achieved a similar effect
# from itertools import cycle
# def get_proxies():
# url = 'https://free-proxy-list.net/'
# response = requests.get(url)
# parser = etree.HTML(response.text)
# proxies = set()
# for i in parser.xpath('//tbody/tr')[:100]:
# if i.xpath('.//td[7][contains(text(),"yes")]'):
# proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
# proxies.add(proxy)
# return proxies
# test_url = "https://www.amazon.com/Apple-Watch-GPS-44mm-Aluminum/dp/B07XR5T85R/ref=sr_1_5?keywords=apple&qid
# =1583891902&sr=8-5"
# user_agent_list = [
# #Chrome
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
# 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# #Firefox
# 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
# 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
# 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
# 'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
# 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
# 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
# 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
# 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
# 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
# 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
# ]
# proxy_pool = cycle(get_proxies())
# Everything above addresses Amazon's anti-bot measures; it was not actually used because the scraper was rarely detected
# Data source: Amazon US
BASE_URL = "https://www.amazon.com/s?k="
# Browser request headers, modeled on a real browser session
HEADER = {
"User-Agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 '
'Safari/537.36',
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
"application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
"cache-control": "max-age=0",
}
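# A minimal sanity-check sketch (not part of the scraping flow; the search term "apple" is only a placeholder):
#   resp = requests.get(BASE_URL + "apple", headers=HEADER)
#   print(resp.status_code, len(resp.text))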
# Given an item name (str) and a search page number (int), return the raw HTML text of the Amazon search results page
def get_search_page(item_name, pg):
target = BASE_URL + item_name + "&page=" + str(pg)
print(target)
for i in range(100):
try:
print("Request #%d" % i)
            time.sleep(random.randint(1, 3))  # slow down requests so anti-bot detection is less likely to trigger
            # proxy = next(proxy_pool)  # not used
response = requests.get(
target, headers=HEADER)
            if response.status_code != 200:
                # Request was not handled successfully
                print('Request was not handled successfully')
                continue
            elif response.status_code == 200:
                # Request was handled
html = etree.HTML(response.text)
items_with_all_data = './/div[@class="sg-col-inner"]/div[2]//span[' \
'@class="a-price-whole"]/../../../../../../../../../../div[@class="sg-row"][' \
'1]//span[@aria-label][1]/../../../../../../../../.. '
item_titles = html.xpath(
items_with_all_data + '//span[@class="a-size-medium a-color-base a-text-normal"]/node()')
no_result = html.xpath(
'//span[contains(string(),"No results")]'
)
if len(item_titles) == 0:
# Request is blocked by bot detection
print('Request is blocked by bot detection')
continue
elif len(no_result) > 0:
print("Invalid search page, returning")
return None
else:
print("HTML is normal, proceeding to next stage")
return response.text
return None
        except Exception:
            print("Exception, skipping")
            continue
return None
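# Example usage of get_search_page (the item name and page number below are placeholders):
#   search_html = get_search_page("apple+watch", 1)  # HTML text on success, None if blocked or no results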
# Given an HTML string, extract the needed fields with XPath and return a 2D list in which each row represents one product
def parse_and_select(res):
html = etree.HTML(res)
items_with_all_data = './/div[@class="sg-col-inner"]/div[2]//span[' \
'@class="a-price-whole"]/../../../../../../../../../../div[@class="sg-row"][1]//span[' \
'@aria-label][1]/../../../../../../../../.. '
item_titles = html.xpath(
items_with_all_data + '//span[@class="a-size-medium a-color-base a-text-normal"]/node()')
item_prices = html.xpath(
items_with_all_data + '//span[@class="a-price"]/span[@class="a-offscreen"]/node()')
item_url = html.xpath(
items_with_all_data + "//a[@class='a-link-normal a-text-normal']/@href")
item_review = html.xpath(
items_with_all_data + '//div[@class="sg-row"][1]//span[@aria-label!="Amazon\'s Choice"][1]/@aria-label'
)
item_review_count = html.xpath(
items_with_all_data + '//div[@class="sg-row"][1]//span[@aria-label][2]/@aria-label'
)
# Data cleaning
url_processed = []
for i in item_url:
url_processed.append("https://www.amazon.com" + i)
    review_processed = []
    for i in item_review:
        try:
            number = i[0:3]  # e.g. "4.5" from "4.5 out of 5 stars"
            review_processed.append(float(number))
        except ValueError:
            review_processed.append(None)  # keep the list aligned with item_titles
review_count_processed = []
for i in item_review_count:
review_count_processed.append(int(i.replace(',', '')))
# Data Summation
data = []
for i in range(len(item_titles)):
item = [item_titles[i], item_prices[i], url_processed[i], review_processed[i], review_count_processed[i]]
data.append(item)
return data
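# Shape of each row returned by parse_and_select: [title, price, url, review score, review count]
# e.g. (illustrative values, not real data):
#   ["Apple Watch Series 5 ...", "$399.00", "https://www.amazon.com/...", 4.7, 12345]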
# Given the URL of a specific product, return the raw HTML text of the product page
def get_item_page(url):
for i in range(100):
try:
print("Request #%d" % i)
time.sleep(random.randint(1, 3))
response = requests.get(
url, headers=HEADER)
            if response.status_code != 200:
                # Request was not handled successfully
                print('Request was not handled successfully')
                continue
            elif response.status_code == 200:
                # Request was handled
html = etree.HTML(response.text)
                item_titles = html.xpath(
                    '//div[@id="centerCol"]'
                )
if len(item_titles) == 0:
# Request is blocked by bot detection
print('Request is blocked by bot detection')
continue
print("HTML is normal, proceeding to next stage")
return response.text
return None
        except Exception:
            print("Exception, skipping")
            continue
return None
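# Example usage of get_item_page (the ASIN below comes from the commented test_url above and is only illustrative):
#   item_html = get_item_page("https://www.amazon.com/dp/B07XR5T85R")  # HTML text, or None if blocked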
# Similar to the function above: select the important fields from the product page HTML with XPath and return a single flat list of values
def parse_and_select_item(res):
html = etree.HTML(res)
item_seller = html.xpath(
'//div[@id = "centerCol"]/div[@id="bylineInfo_feature_div"]//a/node()'
)
item_specs = html.xpath(
'//div[@id="twister_feature_div"]//ul/li/@title'
)
item_description = html.xpath(
'//div[@id = "centerCol"]//div[@id="featurebullets_feature_div"]//ul/li[position()>1]/span/node()'
)
# Should be non-zero for prime supported shipping
item_shipping = html.xpath(
'//span[@id = "price-shipping-message"]/b/text()'
)
item_return = html.xpath(
'//div[@id="buybox"]//span[@id = "creturns-return-policy-content"]//a/text()'
)
item_stock_level = html.xpath(
'//div[@id="availability"]/span/node()'
)
item_protection_plan = html.xpath(
'//span[contains(string(),"Add a Protection Plan")]'
)
item_protection_plan_price4 = html.xpath(
'//span/a[contains(string(),"4-Year")]/../span/text()'
)
item_protection_plan_price2 = html.xpath(
'//span/a[contains(string(),"2-Year")]/../span/text()'
)
item_listing_price = html.xpath(
'//td[contains(string(),"List Price:")]/../td[position()=2]/span[contains(string(),"$")]/text()'
)
item_is_amazon_choice = html.xpath(
'//span[contains(string(),"Amazon\'s Choice")]'
)
NA = "N/A"
# Data cleaning
if len(item_seller) == 0:
item_seller = NA
else:
item_seller = item_seller[0]
if len(item_specs) == 0:
item_specs = NA
else:
item_specs_c = []
for i in item_specs:
item_specs_c.append(i[16:])
item_specs = " / ".join(item_specs_c)
if len(item_description) == 0:
item_description = NA
else:
item_description_c = []
for i in item_description:
item_description_c.append(i.replace("\n", ""))
item_description = " \n ".join(item_description_c)
if len(item_shipping) == 0:
item_shipping = NA
else:
item_shipping = item_shipping[0]
if len(item_return) == 0:
item_return = NA
else:
item_return = item_return[0].replace("\n", "")
if len(item_stock_level) == 0:
item_stock_level = NA
else:
item_stock_level = item_stock_level[0].replace("\n", "")
if len(item_protection_plan) > 0:
item_protection_plan = True
else:
item_protection_plan = False
if len(item_protection_plan_price2) == 0:
item_protection_plan_price2 = NA
else:
item_protection_plan_price2 = item_protection_plan_price2[0]
if len(item_protection_plan_price4) == 0:
item_protection_plan_price4 = NA
else:
item_protection_plan_price4 = item_protection_plan_price4[0]
if len(item_listing_price) == 0:
item_listing_price = NA
else:
item_listing_price = item_listing_price[0]
if len(item_is_amazon_choice) == 0:
item_is_amazon_choice = False
else:
item_is_amazon_choice = True
# Data summation
data = [item_is_amazon_choice, item_seller, item_listing_price, item_specs, item_description, item_shipping,
item_return, item_stock_level, item_protection_plan, item_protection_plan_price2,
item_protection_plan_price4]
return data
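# The returned list lines up with the last 11 columns written by to_csv below, i.e.
#   [is Amazon's Choice, seller, listing price, specs, description, shipping, return policy,
#    stock level, protection plan offered, 2-year plan price, 4-year plan price]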
# Write the 2D data list to a CSV file with the column headers defined below
def to_csv(final_data, search_item_name):
    headers = ["Name", "Price", "Link", "Customer Review", "Review Count", "Is Amazon's Choice (Recommendation)",
               "Seller", "Original Listing Price (If Applicable)", "Specs List (If Applicable)",
               "Item Description", "Shipping Option", "Return Option", "Item Stock Level", "Protection Plan Offered",
               "Protection Plan Price (2 year)", "Protection Plan Price (4 year)"]  # names of the output columns
    csv_file = open("./data/ListSearch_" + search_item_name + ".csv", "w", newline="", encoding="utf-8")  # output CSV path and settings
writer = csv.writer(csv_file)
writer.writerow(headers)
for i in final_data:
writer.writerow(i)
csv_file.close()
return
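# Usage sketch for to_csv: it expects one 16-field list per product (the 5 search fields followed by the
# 11 item-page fields) and writes into ./data/, which must already exist. Hypothetical example:
#   to_csv([["Example", "$9.99", "https://www.amazon.com/dp/EXAMPLE", 4.5, 100,
#            False, "ExampleSeller", "N/A", "N/A", "N/A", "N/A", "N/A", "In Stock", False, "N/A", "N/A"]],
#          "example")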
# Main routine: combine the search-page scrape with per-item scrapes and return all information for the given item on the given Amazon search page, as a 2D list
def main(item, page_count):
    html = get_search_page(item, page_count)
    if html is None:  # search page could not be fetched or had no results
        return []
    query_data = parse_and_select(html)
# Start individual Search
data_s = []
# j = 0
for i in query_data:
# if j > 1:
# break
# j += 1
url = i[2]
print(url)
        html_i = get_item_page(url)
        if html_i is None:  # product page could not be fetched, skip it
            continue
        data_i = parse_and_select_item(html_i)
        data_s.append(i + data_i)
    # print(data_s[0])
    print("finished one search page")
    return data_s
if __name__ == "__main__":
# url = "https://www.amazon.com/Gaming-GeForce-i7-9750H-Windows-G531GV-DB76/dp/B07S3L9LPT/ref=sr_1_2_sspa?crid
# =1O5WSBCSPMOBF&keywords=gaming+laptop&qid=1583961076&sprefix=gaming+lap%2Caps%2C159&sr=8-2-spons&psc=1&spLa
# =ZW5jcnlwdGVkUXVhbGlmaWVyPUExOFpTOTM5MDZHNlM4JmVuY3J5cHRlZElkPUEwOTYzOTcyMzFWTDE3S0VKQ0dLUiZlbmNyeXB0ZWRBZElkPUEw
# NTQ0OTExWEkyQ1Q4N0Y2NTJQJndpZGdldE5hbWU9c3BfYXRmJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=="
# html = get_item_page(url) parse_and_select_item(html)
    # User input parameters
    name = input("Please input the item you want to search: ")
    name = name.replace(" ", "+")
    page = input("Please indicate how many pages of data are needed: ")
    try:
        page_c = int(page)
        if page_c >= 20:
            print("Too large a page number was entered")
    except ValueError:
        print("Invalid page number input")
        raise SystemExit
    # Print a confirmation message before starting
    print("Your search subject is " + name)
    print("You want to search " + page + " pages")
input("Press enter to start the search")
print("Search started")
    # Iterate over the search pages one by one and report progress
i = 1
data = []
while page_c > 0:
print("Working on " + str(i) + "'s search, " + str(page_c) + " search left.")
data_si = main(name, i)
data = data + data_si
page_c -= 1
i += 1
    # Save the final results to a CSV file
to_csv(data, name)
    # Print the final confirmation message
print("All search finished and stored")