-
Notifications
You must be signed in to change notification settings - Fork 3
/
crawling(auto).py
121 lines (85 loc) · 2.35 KB
/
crawling(auto).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from PIL import Image
import os
import time
import urllib.request
import urllib
import socket
import shutil
from os import system,chdir
import time
import selenium
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
# ---------------------------------------------------------------------------
# Crawler configuration.
#
# All of these are ordinary module-level names; the rest of the script reads
# them directly.
# ---------------------------------------------------------------------------

# Network timeout in seconds, installed as the default for every socket
# created after this point.
time_limit = 15
socket.setdefaulttimeout(time_limit)

# Landing page on which the search is typed.
url0 = "https://www.google.co.kr/imghp?hl=ko&ogbl"

# Directory the downloaded images are stored in.
file_name = 't_image'

# Prefix of each saved image file name (a running number and ".jpg" follow).
image_name = "road_w_aa"

# num of case: default image count, replaced later by the value the user types.
n = 10
def extract_jpg_urls(line):
    """Return every double-quoted http(s) URL ending in ``.jpg`` found in *line*.

    Each occurrence of ``.jpg"`` is paired with the nearest opening quote
    before it, so unrelated quoted text earlier on the line (for example
    ``class="hd"``) can no longer leak into the result. Quoted strings that
    do not start with ``http://`` or ``https://`` are ignored.

    Args:
        line: One line of HTML/page source.

    Returns:
        A list of URL strings in the order they appear; empty if none.
    """
    urls = []
    suffix = '.jpg"'
    cursor = 0
    while True:
        hit = line.find(suffix, cursor)
        if hit == -1:
            return urls
        closing = hit + len(suffix) - 1  # index of the closing quote
        # Only look back as far as the previous match so a closing quote
        # that was already consumed is never reused as an opening quote.
        opening = line.rfind('"', cursor, hit)
        if opening != -1:
            candidate = line[opening + 1:closing]
            if candidate.startswith(("http://", "https://")):
                urls.append(candidate)
        cursor = closing + 1


def save_search_page(driver, query, page_path):
    """Search for *query* on the page at ``url0`` and dump the HTML to *page_path*.

    The caller owns *driver* and is responsible for quitting it.

    Args:
        driver: A Selenium WebDriver instance.
        query: Text typed into the search box (the element named ``q``).
        page_path: File that receives ``driver.page_source`` (UTF-8).
    """
    driver.get(url=url0)
    # find_element(By.NAME, ...) replaces find_element_by_name, which was
    # removed in Selenium 4.
    box = driver.find_element(By.NAME, "q")
    box.send_keys(query)
    box.send_keys(Keys.RETURN)
    # Explicit UTF-8 so non-ASCII page content cannot fail to encode under
    # a narrower platform default.
    with open(page_path, 'w', encoding='utf-8') as page:
        page.write(driver.page_source)


def download_images(urls, next_index, limit, out_dir, prefix,
                    link_log, error_log, train_log):
    """Download *urls* into *out_dir* until image number *limit* has been saved.

    Files are named ``<out_dir>/<prefix><index>.jpg``. The index advances
    only after a successful download, so a failed URL never causes an
    already-saved image to be overwritten.

    Args:
        urls: Iterable of image URLs to try, in order.
        next_index: Number to give the next saved image.
        limit: Highest image number wanted; nothing is tried past it.
        out_dir: Directory (must exist) that receives the images.
        prefix: File-name prefix placed before the running number.
        link_log: Writable text stream; every attempted URL is logged here.
        error_log: Writable text stream; every failed URL is logged here.
        train_log: Writable text stream; the path of every saved image is
            logged here.

    Returns:
        The index the next image should use (``limit + 1`` once done).
    """
    for url in urls:
        if next_index > limit:
            break
        link_log.write(url + "\n")
        target = out_dir + "/" + prefix + str(next_index) + '.jpg'
        try:
            urllib.request.urlretrieve(url, target)
        except Exception:
            # Best-effort crawl: any failure for one URL is recorded and
            # skipped. Unlike a bare ``except`` this lets Ctrl-C through.
            print("error " + url)
            error_log.write(url + "\n")
            continue
        # Record the path that was actually written so the list matches
        # the files on disk.
        train_log.write(target + "\n")
        next_index += 1
    return next_index


# Script body. It stays at module scope (inside the guard) so that names
# such as ``search1`` and ``n`` remain module-level globals.
if __name__ == "__main__":
    print("검색할 커맨드를 입력하시오")
    search1 = input()

    # Start from an empty output directory.
    if os.path.exists(file_name):
        shutil.rmtree(file_name)
    os.makedirs(file_name)

    # First search. The browser is kept open while the user is asked for
    # the image count, and is always shut down afterwards.
    driver = webdriver.Chrome()
    try:
        save_search_page(driver, search1, 'crol_box.txt')
        print("몇 개를 검색하시겠습니까")
        n = int(input())
    finally:
        driver.quit()

    # crol.txt: every attempted link; error_link.txt: failed links;
    # train.txt: paths of the saved images.
    with open('crol.txt', 'w', encoding='utf-8') as data3, \
            open('error_link.txt', 'w', encoding='utf-8') as data4, \
            open('train.txt', 'w', encoding='utf-8') as train0:
        i = 1
        last_query = search1
        while i <= n:
            with open('crol_box.txt', encoding='utf-8') as file:
                for line in file:
                    i = download_images(extract_jpg_urls(line), i, n,
                                        file_name, image_name,
                                        data3, data4, train0)
                    if i > n:
                        break
            if i > n:
                break

            # Not enough images yet: search again with a varied query.
            query = search1 + " pic " + str(i)
            if query == last_query:
                # The previous page for this exact query produced nothing
                # new; repeating it would loop forever.
                print("no new images found for '" + query + "'; stopping")
                break
            driver = webdriver.Chrome()
            try:
                save_search_page(driver, query, 'crol_box.txt')
            finally:
                driver.quit()
            last_query = query
# Final step: make the labelImg directory the working directory, then start
# labelImg.py through the shell and wait for it to exit.
os.chdir('labelImg')
system('python3 labelImg.py')