# -*- coding:utf-8 -*-
from module.BingQuery import Bing
from module.GoogleQuery import Google
from module.FlickrQuery import Flickr
from module.FivepxQuery import Fivepx
from module.GettyQuery import Getty
from module.GettyApiQuery import GettyApi
from Crawler import Crawler
from Utils import Utils
import random as rnd
import os, codecs, sys
from multiprocessing import Pool

# Query a single keyword on every engine supplied, extending the shared
# seed list in place.
def queryk(engines, keyword, seeds):
    for engine in engines:
        seeds += engine.query_keyword(keyword)
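
# Each engine object is assumed to expose query_keyword(keyword) and to
# return a list of (filename, url) tuples; that is the shape deduplicate()
# below consumes, so a new engine module only needs to honour this contract.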

# Deduplicate the collected URLs: Flickr URLs are keyed by their photo id,
# all other URLs by the full URL string.
def deduplicate(urls):
    seeds = []
    fseen, useen = set(), set()
    for fn, url in urls:
        fid = Utils.extract_flickr_id(url)
        if fid:  ## Flickr-style URL: dedupe on photo id
            if fid not in fseen:
                fseen.add(fid)
                seeds.append((fn, url))
        else:  ## common URL: dedupe on the URL itself
            if url not in useen:
                useen.add(url)
                seeds.append((fn, url))
    return seeds
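
# Illustration of the intended behaviour (the URLs are made up; the example
# assumes Utils.extract_flickr_id returns a photo id for Flickr photo URLs
# and a falsy value otherwise):
#
#   urls = [('a.jpg', 'https://www.flickr.com/photos/u/123/'),
#           ('b.jpg', 'https://www.flickr.com/photos/u/123/sizes/l/'),
#           ('c.jpg', 'https://example.com/img.jpg'),
#           ('d.jpg', 'https://example.com/img.jpg')]
#   deduplicate(urls)  # -> [('a.jpg', ...), ('c.jpg', ...)]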

## One worker process per keyword: query the engines, dedupe and shuffle the
## collected URLs, dump them to urls.txt, then hand them to the crawler.
def run_proc(db_name, keyword, params, engines):
    print('Child process %s started.' % db_name)
    formats = list(
        set([x.strip().lower() for x in params['formats'].split(',')]))
    min_image_size = int(params.get('min_image_size', 0))
    max_image_size = int(params.get('max_image_size', 0))
    ## collect all image URLs
    seeds = []
    if isinstance(keyword, list):
        topic = keyword[0]
        if int(params.get('use_flickr', 0)) == 1 and int(
                params.get('flickr_query_tags_only', 0)) == 1:
            ## tags-only mode: the Flickr module handles the whole tag list
            flickr = Flickr(params)
            flickr.query_keyword(keyword[1:])
        else:
            for _keyword in keyword[1:]:
                queryk(engines, _keyword, seeds)
    elif isinstance(keyword, str):
        topic = keyword
        queryk(engines, keyword, seeds)
    seeds = deduplicate(seeds)
    rnd.shuffle(seeds)
    ## download all images by URL
    crawler = Crawler(
        topic,
        params['database'],
        formats,
        min_image_size=min_image_size,
        max_image_size=max_image_size,
        db_name=db_name)
    with codecs.open(
            os.path.join(params['database'], topic, 'urls.txt'), 'w',
            'utf-8') as writer:
        for fn_, url in seeds:
            writer.write('%s,%s\n' % (fn_, url.replace('\n', '')))
    crawler.start(seeds)
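
# For a quick single-process sanity check one can call run_proc directly,
# bypassing the pool (the argument values here are illustrative only):
#
#   run_proc('url', u'cat', params, engines)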

if __name__ == '__main__':
    if len(sys.argv) == 3:
        params = Utils.parse_params(sys.argv[1])
        keywords = Utils.parse_keywords(sys.argv[2])
        engines = []
        if int(params.get('use_bing', 0)) == 1:
            engines.append(Bing())
        if int(params.get('use_google', 0)) == 1:
            engines.append(Google())
        if int(params.get('use_flickr', 0)) == 1:
            engines.append(Flickr(params))
        if int(params.get('use_500px', 0)) == 1:
            engines.append(Fivepx(params))
        if int(params.get('use_getty', 0)) == 1:
            engines.append(Getty(params))
        if int(params.get('use_getty_api', 0)) == 1:
            engines.append(GettyApi(params))
        ## map known Chinese keywords to English table names
        ## (眼镜男 = "man with glasses", 眼镜女 = "woman with glasses")
        namedict = {u'眼镜男': 'glass_man', u'眼镜女': 'glass_woman'}
        p = Pool(4)
        for keyword in keywords:
            if keyword in namedict:
                name = namedict[keyword]
            else:
                name = 'url'  # by default, store URLs in table 'url'
            p.apply_async(
                run_proc, args=(
                    name,
                    keyword,
                    params,
                    engines,
                ))
        p.close()
        p.join()
        print('All subprocesses done.')
    else:
        print('Usage : python QueryKeywords.py PARAMS_CFG KEY_WORDS_LIST')
        # PARAMS_CFG      file name of the params config
        # KEY_WORDS_LIST  file name of the keyword list
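
# Example invocation (the file names are hypothetical, and the key=value
# syntax is an assumption about Utils.parse_params; the keys themselves are
# exactly the ones read above):
#
#   python QueryKeywords.py params.cfg keywords.txt
#
# params.cfg might contain:
#   database=./db
#   formats=jpg,jpeg,png
#   min_image_size=0
#   max_image_size=0
#   use_bing=1
#   use_google=1
#   use_flickr=0
#   use_500px=0
#   use_getty=0
#   use_getty_api=0
#
# keywords.txt would list one keyword per line; Utils.parse_keywords is
# assumed to return strings, or lists of the form [topic, kw1, kw2, ...].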