-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScrape_More_Data.py
79 lines (61 loc) · 3.15 KB
/
Scrape_More_Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Created on Sun Mar 18 08:48:27 2018
@author: Aakash Gupta
"""
#####################################################################################################################
# Use this file to scrape more training samples from the World Wide Web
# Used the Bing Search API for finding more training examples
# URL: https://azure.microsoft.com/en-in/services/cognitive-services/bing-web-search-api/
#
#####################################################################################################################
import urllib.request
import requests
import pickle
import os
# --- Bing Image Search configuration ---------------------------------------
# Azure Cognitive Services subscription key.
# NOTE(review): this is a placeholder — paste a real key before running.
subscription_key = "#################################"
if not subscription_key:
    # Explicit validation instead of `assert`, which is stripped under -O.
    raise ValueError("subscription_key must be set before running this script")
headers = {"Ocp-Apim-Subscription-Key" : subscription_key}
# Query parameters shared by every request; "q" is filled in per search term
# inside the download loop below.
params = {"q": '', "textDecorations":True, "textFormat":"HTML", "count": 20, "size": "medium",
          "maxFileSize": 25192}
search_url = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
# Generic merchandise keywords combined with each hero name in mainTerms.
#'socks', 'sneakers',
commonSearchTerms = ['t-shirts', 'neck-tie', 'underwear', 'boxers', 'pyjamas', 'purses', 'bags',
                     'sun glasses', 'toys', 'vests', 'merchandise', 'keyholders', 'key chains', 'comics']
## Completed terms
## 'ant man':'Ant-Man', 'aquaman':'Aquaman', 'avengers':'Avengers' , 'batman':'Batman' , 'black panther' :'Black Panther'
## 'captain america':'Captain America' , 'catwoman':'Catwoman', 'ghost rider':'Ghost Rider' , 'hulk':'Hulk' , 'she hulk':'Hulk'
## 'iron man': 'Iron Man' , 'spiderman':'Spiderman' ,
# Maps a search alias to the canonical hero folder name used on disk.
mainTerms = { 'spidey': 'Spiderman', 'amazing spiderman':'Spiderman' , 'superman':'Superman' }
# Destination root; one sub-folder per hero (the mainTerms value) is created.
PATH = "C:\\CAX_Superhero_Identify\\train_xtra\\"
# For every (hero alias, merchandise keyword) combination, query the Bing
# Image Search API and download each returned .jpg into the hero's folder.
for term in mainTerms:
    for common in commonSearchTerms:
        search_term = term + ' ' + common
        params["q"] = search_term
        print("SEARCH TERM: "+ search_term)
        response = requests.get(search_url, headers=headers, params=params)
        # Fail fast on HTTP errors (bad key, quota exhausted, ...).
        # (Fix: the original called raise_for_status()/json() twice in a row.)
        response.raise_for_status()
        search_results = response.json()
        # One direct image URL per search hit.
        contentUrl = [img["contentUrl"] for img in search_results["value"]]
        # Create the per-hero destination folder once per search term instead
        # of re-checking inside the per-URL loop (loop-invariant hoist).
        target_dir = PATH + mainTerms[term]
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        print("Downloading...")
        for idx, url in enumerate(contentUrl):
            # Only download results that look like JPEGs, matching the
            # original substring test.
            if 'jpg' not in url:
                continue
            filename = target_dir + '\\' + search_term + '_' + str(idx) + '.jpg'
            print(url + ">>>" +filename)
            if 'image8' in url:
                # Host skipped by the original author — presumably a known-bad
                # source; TODO confirm whether this filter is still needed.
                print("Skipping url..")
                continue
            try:
                urllib.request.urlretrieve(url, filename)
            except Exception:
                # Best-effort scrape: log and move on, but no longer a bare
                # `except:` (which would also swallow KeyboardInterrupt).
                print("ERR: "+ url)