forked from YichenQiu/deepdesign.space
-
Notifications
You must be signed in to change notification settings - Fork 0
/
EnglishScraper.py
executable file
·117 lines (109 loc) · 5.38 KB
/
EnglishScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#Import necessary libraries
from urllib.request import urlretrieve
import os.path
import os
import cvlib
import cv2
import numpy as np
import csv
from io import BytesIO
from PIL import Image
import requests
from datetime import datetime
def unique(list1):
    """Return the distinct items of ``list1`` in first-seen order.

    The previous implementation round-tripped through ``set``, so the order
    of the result was arbitrary (and, for strings, different from run to
    run because of hash randomisation). Keeping first-seen order makes the
    output reproducible.

    Args:
        list1: Iterable of hashable items.

    Returns:
        A new list holding each distinct item once, ordered by first
        occurrence in ``list1``.
    """
    # dict keys are unique and remember insertion order (Python 3.7+).
    return list(dict.fromkeys(list1))
#Creates a class that contains the scraping essentials
class ScrapingEssentials(object):
    """Download images into per-category folders and label them in CSV files.

    Each instance works below ``<base_dir>/<source>``. Images are stored in
    ``<base_dir>/<source>/<category>/<timestamp>.jpg`` and described by one
    row in ``<base_dir>/<source>/labels<first char of category>.csv``.

    The counters below are class attributes, so they are shared by every
    instance.
    """

    # Incremented by one on every reset() call.
    currentItem = 0
    # Incremented once per download attempt; zeroed by __init__() and reset().
    number = 0
    # Search terms built by the most recent english_pickle() call.
    categories = []
    # Default output root; an instance may override it via __init__(base_dir=...).
    base_dir = "/Users/user/PinterestScraper/Crawler/"
    # Suffix appended to every style name to form a search term.
    THEME = ' interior design'
    # An image is kept only if its shorter side is larger than this (pixels).
    MIN_SIDE = 300
    # Style names selectable through english_pickle(), keyed by group name.
    # NOTE(review): 'done' presumably lists styles that were already scraped
    # -- confirm with the author.
    STYLE_GROUPS = {
        'done': ['Abstract', 'African', 'American Colonial', 'Amish', 'Arabian', 'Art Deco'],
        'A': ['Art Moderne', 'Art Nouveau', 'Artisan', 'Arts and Crafts', 'Asian'],
        'B': ['Baroque', 'Bauhaus', 'Beach House', 'Bohemian', 'Brazilian', 'British Colonial'],
        'C': ['Carolean', 'Chinese', 'Chippendale', 'Coastal', 'Commonwealth', 'Contemporary',
              'Cottage', 'Country'],
        'D': ['Danish', 'Directoire', 'Dutch Renaissance'],
        'E': ['Eclectic', 'Egyptian', 'Empire', 'English', 'English Country', 'European',
              'Exploration'],
        'F': ['Finnish', 'Flemish', 'French', 'French Provincial'],
        'G': ['Georgian', 'Gothic', 'Greek'],
        'I': ['Indian', 'Industrial', 'Italian'],
        'J': ['Jacobean', 'Japanese'],
        'L': ['Lake House'],
        'M': ['Machine Age', 'Medieval', 'Mediterranean', 'Memphis', 'Mexican',
              'Mid-Century Modern', 'Minimalist', 'Mission', 'Modern', 'Modernist', 'Moroccan'],
        'N': ['Nautical', 'Neoclassic', 'Northwestern'],
        'O': ['Old World', 'Organic'],
        'P': ['Palladian', 'Parisian', 'Pennsylvania Dutch', 'Plantation', 'Post-modern',
              'Puritan'],
        'Q': ['Queen Anne'],
        'R': ['Regal', 'Regence', 'Regency', 'Renaissance', 'Retro', 'Revival', 'Rietveld',
              'Rococo', 'Romantic', 'Russian', 'Rustic'],
        'S': ['Scandinavian', 'Shabby chic', 'Shaker', 'Southwestern', 'Space age',
              'Spanish Renaissance', 'Steampunk', 'Swedish'],
        'T': ['Traditional', 'Transitional', 'Tropical', 'Tudor', 'Tuscan'],
        'U': ['Urban'],
        'V': ['Venetian', 'Victorian', 'Vintage'],
        'W': ['Western', 'William and Mary'],
        'Z': ['Zen'],
    }

    def __init__(self, source, base_dir=None):
        """Zero the shared download counter and create the source directory.

        Args:
            source: Sub-directory name (below the output root) for this run.
            base_dir: Optional output root. When omitted, the class default
                ``ScrapingEssentials.base_dir`` is used.
        """
        ScrapingEssentials.number = 0
        self.source = source
        if base_dir is not None:
            self.base_dir = base_dir
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(os.path.join(self.base_dir, source), exist_ok=True)

    def reset(self):
        """Zero the shared download counter and advance ``currentItem`` by one."""
        ScrapingEssentials.number = 0
        ScrapingEssentials.currentItem += 1

    def download_image(self, link, title, category):
        """Fetch ``link`` and, if the image is large enough, save and label it.

        This is a best-effort operation: any failure (network error, bad
        image data, detection error, disk error, ...) is reported on stdout
        and the image is skipped; no exception reaches the caller.

        Args:
            link: URL of the image. If it does not answer with HTTP 200, the
                same URL with ".jpg" replaced by ".png" is tried once.
            title: Free text stored in the CSV row.
            category: Name of the sub-folder for the image; its first
                character selects the ``labels<char>.csv`` file.

        Returns:
            None.
        """
        try:
            self._download_image(link, title, category)
        except Exception as exc:
            # Deliberately broad: one bad link must not stop a crawl, but the
            # reason is reported instead of being silently discarded.
            print("skipped {}: {!r}".format(link, exc))

    def _download_image(self, link, title, category):
        """Do the work of download_image(); may raise on any failure."""
        source_dir = os.path.join(self.base_dir, self.source)
        category_dir = os.path.join(source_dir, category)
        # The timestamp (down to microseconds) doubles as the file name.
        file_name = datetime.now().strftime('%Y%m%d%H%M%S%f')
        file_path = os.path.join(category_dir, file_name + ".jpg")
        csv_path = os.path.join(source_dir, 'labels' + str(category[0]) + '.csv')
        os.makedirs(category_dir, exist_ok=True)

        ScrapingEssentials.number += 1
        content = self._fetch(link)

        with Image.open(BytesIO(content)) as image:
            width, height = image.size
        # Check the size first so the (expensive) object detection is not run
        # on images that would be thrown away anyway.
        if min(width, height) <= self.MIN_SIDE:
            return

        pixels = cv2.imdecode(np.frombuffer(content, np.uint8), cv2.IMREAD_COLOR)
        _, objects, _ = cvlib.detect_common_objects(
            pixels, model='yolov4', confidence=0.7, enable_gpu=False)
        # Sort after de-duplicating so the label column has a stable order.
        labels = '|'.join(sorted(set(objects)))

        print(str(ScrapingEssentials.number) + ": " + link)
        with open(file_path, 'wb') as image_file:
            image_file.write(content)
        with open(csv_path, 'a', newline='', encoding="utf-8") as label_file:
            csv.writer(label_file).writerow(
                [file_path, width, height, category, title, labels])

    @staticmethod
    def _fetch(link):
        """Return the body of ``link``, retrying once with a ".png" URL."""
        # No stream=True: the whole body is needed, and an unread streamed
        # response would keep its connection checked out of the pool.
        resp = requests.get(link, timeout=10)
        if resp.status_code != 200:
            resp = requests.get(link.replace(".jpg", ".png"), timeout=10)
        resp.raise_for_status()
        return resp.content

    def english_pickle(self, letter):
        """Build the search terms for one group of interior-design styles.

        Args:
            letter: Key of ``STYLE_GROUPS`` (an upper-case letter or
                ``'done'``). A falsy value selects no styles.

        Returns:
            The list of ``<style> interior design`` search terms, which is
            also stored in ``ScrapingEssentials.categories``.

        Raises:
            ValueError: If ``letter`` is not a known group. (The group name
                is looked up in a table; it is no longer passed to
                ``eval``, which executed arbitrary expressions.)
        """
        if not letter:
            styles = []
        else:
            try:
                styles = self.STYLE_GROUPS[letter]
            except KeyError:
                raise ValueError(
                    "unknown style group: {!r}".format(letter)) from None
        ScrapingEssentials.categories = [style + self.THEME for style in styles]
        return ScrapingEssentials.categories