# findProds.py
# Python script that parses Alibaba category pages and compares
# them against Amazon prices in order to systematically reveal
# novel arbitrage opportunities.
# Outputs a CSV of Alibaba items that a human operator has
# manually approved as suitable for arbitrage.
#
# @author Transbit
# @email info@transbit.org
# @version 0.1
# @reqs Python v2.7
#
# Licensed under GPLv3
# Import all required modules
from BeautifulSoup import BeautifulSoup
import re, urllib, math, gui
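# Note: 'gui' is a companion module from this repository (not shown here);
# its gui.mainApp class, used at the end of main(), is assumed to present
# each parsed item to the operator for manual approval and to write the
# approved items out to the CSV mentioned above.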
# Constant storage class that allows for
# colored terminal output via ANSI codes
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    def disable(self):
        self.HEADER = ''
        self.OKBLUE = ''
        self.OKGREEN = ''
        self.WARNING = ''
        self.FAIL = ''
        self.ENDC = ''
class ParserMain:
    # Extract the item and supplier information
    # for all items in the page contained in soup
    def aliBabaPageParse(self, soup):
        if (self.currPage >= self.numPages):
            # Too many pages processed
            return None
        soup = BeautifulSoup(urllib.urlopen(self.base_url + "_" + str(self.currPage)).read())
        items = []
        allProducts = soup.findAll("div", { "class" : "attr" })
        allSellers = soup.findAll("div", { "class" : "supplier" })
        allPics = soup.findAll("div", { "class" : "pic" })
        for i in range(0, len(allProducts)):
            productsParse = allProducts[i]
            sellersParse = allSellers[i]
            picsParse = allPics[i]
            productsParse = [str(x) for x in productsParse]
            sellersParse = [str(x) for x in sellersParse]
            picsParse = [str(x) for x in picsParse]
            # Append a map of item attributes for allProducts[i]
            items.append(self.aliBabaItemParse(' '.join(productsParse), ' '.join(sellersParse), ' '.join(picsParse)))
        return items
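    # The parsing above assumes each result in an Alibaba category listing
    # of that era was laid out roughly as three sibling blocks (illustrative
    # only, not taken from Alibaba documentation):
    #   <div class="pic">      product image                </div>
    #   <div class="attr">     product attributes           </div>
    #   <div class="supplier"> seller name and trust icons  </div>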
    # Extract the item and supplier information from the product,
    # seller and picture fragments of a single listing and return
    # the relevant info as a dictionary
    # @todo: Clean up some of the product parsing
    def aliBabaItemParse(self, productsParse, sellersParse, picsParse):
        productsSoup = BeautifulSoup(productsParse)
        sellersSoup = BeautifulSoup(sellersParse)
        picsSoup = BeautifulSoup(picsParse)
        # Parse productsSoup, sellersSoup, extract relevant item attributes
        attrs = dict(self.parseAttrs(productsSoup).items() + self.parseAttrs(sellersSoup).items())
        # Parse picsSoup, extract relevant item attributes
        picTag = picsSoup.find("img")
        attrs["name"] = str(picTag["alt"])
        attrs["image"] = str(picTag["image-src"])
        attrs = dict(attrs.items() + self.parseIcons(sellersSoup).items())
        return attrs
    # Parse soup, extract relevant item attributes from icon span
    # as AliBaba stores them (in a way that makes me a sad panda)
    def parseIcons(self, soup):
        attrs = {}
        goldTag = soup.findAll("a", href="javascript:openGsIcon();")
        attrs["isGold"] = len(goldTag) > 0
        attrs["goldYears"] = 0
        if attrs["isGold"]:
            # Parse the leading digits of the "years as gold supplier"
            # count out of the icon's title text
            goldYears = goldTag[0]["title"].split()[2]
            goldYearsNum = 0
            for i in range(0, len(goldYears)):
                try:
                    goldYearsNum = goldYearsNum * 10 + int(goldYears[i])
                except ValueError:
                    break
            attrs["goldYears"] = goldYearsNum
        attrs["isEscrow"] = len(soup.findAll("a", attrs={"class":"escrowlogo icon-item"})) > 0
        attrs["isOnsiteCheck"] = len(soup.findAll("a", attrs={"class":"onsitelogo icon-item"})) > 0
        attrs["isAssessed"] = len(soup.findAll("a", attrs={"class":"cslogo icon-item"})) > 0
        return attrs
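    # Note: the goldYears parsing above assumes the gold-supplier icon link
    # carries a title along the lines of "Gold Supplier 7 YRS", so that the
    # third whitespace-separated token starts with the year count; the exact
    # wording on Alibaba's pages may differ.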
    # Parse soup, extract relevant item attributes as AliBaba stores them
    # @todo Clean this up
    def parseAttrs(self, soup):
        attrs = {}
        allItems = soup.findAll("p")
        for i in range(0, len(allItems)):
            if len(allItems[i].contents) < 1:
                continue
            for tagoffset in range(0, len(allItems[i].contents)):
                if ('attrName' in str(allItems[i].contents[tagoffset])):
                    break
            if tagoffset == (len(allItems[i].contents) - 1):
                continue
            attrName = ' '.join(str(allItems[i].contents[tagoffset].string).split()).split(":")[0]
            attrValue = ' '.join(str(allItems[i].contents[tagoffset+1]).split())
            attrs[attrName.strip()] = attrValue.strip()
        return attrs
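    # For illustration, parseAttrs expects each attribute paragraph to look
    # roughly like <p><span class="attrName">Min. Order:</span> 100 Pieces</p>,
    # in which case it would yield {"Min. Order": "100 Pieces"}; this markup
    # is an assumption about the listing pages, not taken from Alibaba
    # documentation.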
    # Serve items one at a time as dictionary
    def serveNextItem(self):
        if (len(self.cachedItems) == 0):
            self.currPage += 1
            soup = BeautifulSoup(urllib.urlopen(self.base_url + "_" + str(self.currPage)).read())
            self.cachedItems = self.aliBabaPageParse(soup)
        return self.cachedItems.pop(0)
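    # Note: once aliBabaPageParse starts returning None (all pages consumed),
    # the pop() above would fail; callers in the gui module are assumed to
    # stop requesting items before that point.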
    # Main function, execution entry-point
    def main(self):
        self.numPages = 0
        self.currPage = 1
        self.cachedItems = []
        # Accept URL input for category from user
        self.base_url = raw_input("Enter a category URL on Alibaba (from http://alibaba.com/): ").strip()
        self.require_price = True
        self.render_colors = True
        self.bcolors = bcolors()
        if not self.render_colors:
            self.bcolors.disable()
        # Load the URL if valid, otherwise print an error and quit the program
        try:
            pageToParse = urllib.urlopen(self.base_url).read()
            print "Base URL " + self.bcolors.OKGREEN + "OK." + self.bcolors.ENDC
        except:
            print "Base URL " + self.bcolors.FAIL + "ERROR." + self.bcolors.ENDC
            exit(0)
        soup = BeautifulSoup(pageToParse)
        # Extract the number of pages in the search using BeautifulSoup
        # (necessary to iterate over all pages and extract all items)
        self.numPages = soup.find('span', attrs={"class":"page-num"})
        self.numPages = int(self.numPages.contents[0].split("of")[1].split(":")[0].strip())
        self.cachedItems = self.aliBabaPageParse(soup)
        self.gui = gui.mainApp(self)
        self.gui.display()
# Program entry-point
if __name__ == "__main__":
    mainParser = ParserMain()
    mainParser.main()
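# Example session (assumed invocation; the category URL shown is illustrative):
#   $ python findProds.py
#   Enter a category URL on Alibaba (from http://alibaba.com/): http://www.alibaba.com/Consumer-Electronics_p44
#   Base URL OK.
# The gui module is then assumed to walk the operator through each parsed
# item and write the approved ones to CSV.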