post_scraper.py
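"""Scrape Brunch (brunch.co.kr) posts.

For each keyword returned by keywords.get_keywords(), read the post URLs
collected in keyword_url_list/<keyword>_url_list.csv, scrape every post,
and save the results to <keyword>_dataset.csv.
"""
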
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import time
import random
import re
import keywords

def download(url, user_agent='wswp', num_retries=2, proxies=None):
    """Fetch a URL and return its HTML text, or None if the download fails."""
    print('Downloading', url)
    headers = {'User-Agent': user_agent}
    try:
        resp = requests.get(url, headers=headers, proxies=proxies)
        html = resp.text
        if resp.status_code >= 400:
            print('Download error:', resp.text)
            html = None
            if num_retries and 500 <= resp.status_code < 600:
                # Retry 5xx server errors, keeping the same user agent and proxies.
                return download(url, user_agent, num_retries - 1, proxies)
    except requests.exceptions.RequestException as e:
        print('Download error:', e)
        html = None
    return html

def decode_post_date(src):
    """Convert a Brunch post date string to 'YYYY-MM-DD'."""
    dict_month = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
                  'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    if '분전' in src or '시간전' in src:
        # Relative timestamps ("minutes ago" / "hours ago") mean the post was published today.
        date = datetime.today().strftime("%Y-%m-%d")
    else:
        # Absolute dates start with a three-letter English month, e.g. 'Jan.05.2021'.
        m = dict_month[src[:3]]
        d = src[4:6]
        y = src[7:]
        date = y + '-' + m + '-' + d
    return date

def no_spaces(src):
    """Remove spaces and newlines from a string."""
    text = src.replace(" ", '')
    text = text.replace('\n', '')
    return text

def over_thousand(src):
    """Convert a Brunch count string (e.g. '1,234' or '1.1만') to a plain digit string."""
    if '만' in src:
        # Large counts are shown as 'X.X만' (만 = 10,000): strip the punctuation and
        # the '만' suffix, then append '000' (e.g. '1.1만' -> '11' -> '11000').
        text = re.sub(r'[^\w\s]', '', src)
        text = text.replace('만', '')
        return text + '000'
    return src.replace(',', '')

def scrap_single_post(url):
    """Scrape a single Brunch post page and return its fields as a dict, or None on failure."""
    html = download(url)
    if html is None:
        return None
    soup = BeautifulSoup(html, 'lxml')
    # Title, sub-title and body text.
    title = soup.find('h1', attrs={'class': 'cover_title'}).text
    sub_title = soup.find('p', attrs={'class': 'cover_sub_title'}).text
    body_text = soup.find_all(['p', 'h2', 'h3', 'h4', 'h5', 'h6'],
                              attrs={'class': 'wrap_item item_type_text'})
    text = ''
    for s in body_text:
        text = text + '\n' + s.text.replace('\xa0', ' ')
    # Like and comment counts.
    likes = soup.find('span', attrs={'class': 'f_l text_like_count text_default text_with_img_ico ico_likeit_like #like'}).text
    likes = 0 if likes == '' else int(over_thousand(likes))
    try:
        num_comments = soup.find('span', attrs={'class': 'f_l text_comment_count text_default text_with_img_ico'}).text
        num_comments = 0 if num_comments == '' else int(over_thousand(num_comments))
    except AttributeError:
        num_comments = 0
    # Post date and keywords.
    post_date = soup.find('span', attrs={'class': 'f_l date'}).text
    post_date = decode_post_date(post_date)
    keyword_list = []
    try:
        first_keyword = soup.find('ul', attrs={'class': 'list_keyword'}).li
        keyword_list.append(no_spaces(first_keyword.get_text()))
        other_keywords = first_keyword.find_next_siblings('li')
        for keyword in other_keywords:
            keyword_list.append(no_spaces(keyword.text))
    except AttributeError:
        pass
    # Author information.
    author = soup.find('span', attrs={'class': 'f_l text_author #author'}).a.text
    author_page = soup.find('span', attrs={'class': 'f_l text_author #author'}).a['href']
    author_id = author_page.replace('https://brunch.co.kr/', '')
    try:
        author_belong = soup.find('span', attrs={'class': 'author_belong'}).span.find_next_sibling('span').get_text()
    except AttributeError:
        author_belong = ""
    try:
        author_desc = soup.find('p', attrs={'class': 'txt_desc'}).text
    except AttributeError:
        author_desc = ""
    num_subscription = soup.find('span', attrs={'class': 'num_subscription'}).text
    num_subscription = 0 if num_subscription == '' else int(over_thousand(num_subscription))
    scrap_result = {'title': title, 'sub_title': sub_title, 'body_text': text, 'keyword': keyword_list, 'likes': likes,
                    'num_comments': num_comments, 'post_date': post_date, 'post_url': url, 'author': author,
                    'author_id': author_id, 'author_belong': author_belong, 'author_desc': author_desc,
                    'num_subscription': num_subscription}
    return scrap_result

def print_scrap_result(scrap_dict):
    print('title:', scrap_dict['title'])
    print('sub_title:', scrap_dict['sub_title'])
    print('body_text:', scrap_dict['body_text'])
    print('keywords:', scrap_dict['keyword'])
    print('likes:', scrap_dict['likes'])
    print('num_comments:', scrap_dict['num_comments'])
    print('post_date:', scrap_dict['post_date'])
    print('post_url:', scrap_dict['post_url'])
    print('author:', scrap_dict['author'])
    print('author_id:', scrap_dict['author_id'])
    print('author_belong:', scrap_dict['author_belong'])
    print('author_desc:', scrap_dict['author_desc'])
    print('num_subscription:', scrap_dict['num_subscription'])

def get_urls_from_csv(keyword):
    """Read the post URLs collected for a keyword from keyword_url_list/<keyword>_url_list.csv."""
    url_list = []
    with open('keyword_url_list/' + keyword + '_url_list.csv', mode='r') as fp:
        reader = csv.reader(fp)
        for line in reader:
            # Each file is expected to hold the whole URL list on a single row.
            url_list = line
    print('Collected a total of', len(url_list), 'urls for keyword', keyword + '.')
    return url_list

def get_post_dict_list(url_list, keyword):
    """Scrape every post in url_list and return the results as a list of dicts."""
    dict_list = []
    for url in url_list:
        result = scrap_single_post(url)
        # print_scrap_result(result)
        if result is None:
            continue
        dict_list.append(result)
        # Short random pause between requests.
        time.sleep(random.uniform(0.005, 0.01))
    return dict_list

def save_dict_list_to_csv(keyword, dict_list):
    """Write the scraped post dicts for a keyword to <keyword>_dataset.csv."""
    labels = ['title', 'sub_title', 'body_text', 'keyword', 'likes',
              'num_comments', 'post_date', 'post_url', 'author',
              'author_id', 'author_belong', 'author_desc', 'num_subscription']
    try:
        with open(keyword + '_dataset.csv', 'w', encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, fieldnames=labels)
            writer.writeheader()
            for elem in dict_list:
                writer.writerow(elem)
    except IOError:
        print("I/O error")

def get_sample_file(url):
    """Scrape a single post and write it to sample.csv as a quick sanity check."""
    sample = scrap_single_post(url)
    print_scrap_result(sample)
    with open('sample.csv', 'w', encoding='utf-8', newline='') as fp:
        labels = ['title', 'sub_title', 'body_text', 'keyword', 'likes',
                  'num_comments', 'post_date', 'post_url', 'author',
                  'author_id', 'author_belong', 'author_desc', 'num_subscription']
        writer = csv.DictWriter(fp, fieldnames=labels)
        writer.writeheader()
        writer.writerow(sample)

if __name__ == '__main__':
    # Scrape and save one dataset CSV per keyword.
    for keyword in keywords.get_keywords():
        start = time.time()
        url_list = get_urls_from_csv(keyword)
        dict_list = get_post_dict_list(url_list, keyword)
        save_dict_list_to_csv(keyword, dict_list)
        elapsed = time.time() - start
        m = int(elapsed // 60)
        s = int(elapsed % 60)
        print('Saved', keyword + '_dataset.csv with', len(dict_list), 'posts;',
              'took', m, 'min', s, 'sec.')