-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_data.py
40 lines (34 loc) · 2.13 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
'''
Piece of code using which I download the images for my dataset from given links.
'''
import os
import requests
from bs4 import BeautifulSoup
import random
urls = [
"https://www.google.com/search?q=acer+tree&source=lnms&tbm=isch&sa=X&ved=2ahUKEwi585TSzLT3AhXSPOwKHeEoB8UQ_AUoAXoECAMQAw&biw=1536&bih=731&dpr=1.25", #acer
"https://www.google.com/search?q=olympic+swimming+pool&tbm=isch&ved=2ahUKEwiu4KKVzbT3AhVOwgIHHef2BGsQ2-cCegQIABAA&oq=olymswimming+pool&gs_lcp=CgNpbWcQARgAMgYIABAHEB4yBggAEAcQHjIGCAAQBxAeMgYIABAHEB4yBggAEAcQHjIGCAAQBxAeMgYIABAHEB4yBggAEAcQHjIGCAAQBxAeMgYIABAHEB46BAgAEEM6BQgAEIAEUMYPWMkTYL8eaABwAHgAgAFLiAHtApIBATWYAQCgAQGqAQtnd3Mtd2l6LWltZ8ABAQ&sclient=img&ei=LmdpYq6RDM6Ei-gP5-2T2AY&bih=731&biw=1536", #pool
"https://www.google.com/search?q=lilek&hl=cs&source=lnms&tbm=isch&sa=X&ved=2ahUKEwjY9YDhz7T3AhWlOOwKHWqTCuIQ_AUoAXoECAIQAw&biw=1536&bih=731&dpr=1.25#imgrc=mhGmNdIuN4vYgM", # lilek
"https://www.google.com/search?q=tule%C5%88&source=lnms&tbm=isch&sa=X&ved=2ahUKEwj37MCP07T3AhWH2aQKHdvTBb8Q_AUoAXoECAIQAw&biw=1536&bih=674&dpr=1.25", #tulen
"https://www.google.com/search?q=dobromysl+obecn%C3%A1&tbm=isch&ved=2ahUKEwia486v07T3AhVQDOwKHTInBtIQ2-cCegQIABAA&oq=dobromy&gs_lcp=CgNpbWcQARgAMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgUIABCABDoECAAQQzoGCAAQBxAeOgQIABAeOggIABCABBCxAzoHCAAQsQMQQzoICAAQsQMQgwE6CwgAEIAEELEDEIMBUMQFWInEBmD20wZoAHAAeACAAd0BiAGKDJIBBjE5LjAuMZgBAKABAaoBC2d3cy13aXotaW1nsAEAwAEB&sclient=img&ei=sG1pYprLA9CYsAeyzpiQDQ&bih=674&biw=1519&hl=cs" # dobromysel
]
index = 1
curls = []
classes = ["acer", "pool", "lilek", "tulen", "dobromysl"]
# my dataset contains 5 classes of following numbers of images
# acer - 69 images
# dobromysl - 77 images
# lilek - 78 images
# pool - 70 images
# tulen - 76 images
i = 0
for url in urls:
page = requests.get(url).text
soup = BeautifulSoup(page, features="html.parser")
for img in soup.find_all("img")[:30]:
curls.append(("curl " + img.attrs['src'][:-2] + " > ", classes[i]))
i += 1
random.shuffle(curls)
for curl in curls:
os.system(curl[0] + "./database/" + str(curl[1]) + str(index).zfill(3) + ".png")
index += 1