-
Notifications
You must be signed in to change notification settings - Fork 23
/
pipelines.py
202 lines (159 loc) · 7.24 KB
/
pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# -*- coding: utf-8 -*-
# Define your item pipelines here
# If you have many pipelines, they should all be initialized here
# and selected with IF checks on the spider name
#
# DOUGUO Spider pipelines
# @author zhangjianfei
# @date 2017/04/13
import re
import urllib.request
from DgSpider import urlSettings
from DgSpider import contentSettings
from DgSpider.mysqlUtils import dbhandle_insert_content
from DgSpider.uploadUtils import uploadImage
from DgSpider.mysqlUtils import dbhandle_online
from DgSpider.mysqlUtils import dbhandle_update_status
from bs4 import BeautifulSoup
from DgSpider.PostHandle import post_handel
from DgSpider.commonUtils import get_random_user
from DgSpider.commonUtils import get_linkmd5id
class DgPipeline(object):
    """Item pipeline for the DOUGUO spiders.

    Routes items by ``spider.name``:

    * content spider (``contentSettings.SPIDER_NAME``) — cleans the post
      title/body with BeautifulSoup, downloads and re-uploads embedded
      images, and accumulates the result in class-level state that
      ``close_spider`` finally writes to the database.
    * url spider (``urlSettings.SPIDER_NAME``) — deduplicates crawled
      urls against ``dg_spider.dg_spider_post`` and inserts new ones.

    NOTE(review): state is kept on the *class* (shared across instances),
    presumably because Scrapy creates one pipeline per process — confirm
    before running multiple spiders concurrently.
    """

    # accumulated reply parts for post construction
    cs = []
    # post title (taken from the first page of a paginated article)
    title = ''
    # accumulated post body text
    text = ''
    # url of the page currently being processed
    url = ''
    # random user id used for image uploads and the final DB insert
    user_id = ''
    # 1 if the article contains at least one image, else 0
    has_img = 0
    # per-article page counter; only page 1 supplies the title
    get_title_flag = 0

    def __init__(self):
        # Pick a random posting user once per pipeline instance.
        DgPipeline.user_id = get_random_user(contentSettings.CREATE_POST_USER)
        # Explicit instance-level counter: the original implicitly shadowed
        # the class attribute on first `+=`; make that intent visible.
        self.get_title_flag = 0

    def process_item(self, item, spider):
        """Process one scraped item; dispatch on the spider's name.

        Returns the item unchanged so later pipelines can run.
        """
        self.get_title_flag += 1

        # ---- pipeline for the content spider -------------------------------
        if spider.name == contentSettings.SPIDER_NAME:
            # current page url
            DgPipeline.url = item['url']

            # post title may be absent on continuation pages
            if len(item['title']) == 0:
                title_tmp = ''
            else:
                title_tmp = item['title'][0]

            # Only the first page of a paginated article supplies the title.
            if self.get_title_flag == 1:
                # Normalise the title with BeautifulSoup. stripped_strings is
                # used instead of .prettify(), which would re-indent every tag
                # and introduce spurious whitespace/newlines.
                soup_title = BeautifulSoup(title_tmp, "lxml")
                title = ''
                for string in soup_title.stripped_strings:
                    title += string
                # Replace ASCII quotes with full-width ones so the later SQL
                # statement cannot be broken by quoting characters.
                title = title.replace("'", "”").replace('"', '“')
                DgPipeline.title = title

            # post body may also be absent
            if len(item['text']) == 0:
                text_temp = ''
            else:
                text_temp = item['text'][0]

            # Find embedded images, mirror them, and replace each <img> tag
            # with a [dgimg]url;w;h[/dgimg] marker.
            reg_img = re.compile(r'<img.*>')
            imgs = reg_img.findall(text_temp)
            for img in imgs:
                DgPipeline.has_img = 1
                match_obj = re.search('.*src="(.*)".*', img, re.M | re.I)
                if match_obj is None:
                    # Robustness fix: an <img> without a parseable src used to
                    # raise AttributeError; skip it instead.
                    continue
                img_url_tmp = match_obj.group(1)
                # Strip the scheme so it can be re-added uniformly below.
                img_url_tmp = img_url_tmp.replace("http:", "")
                # Handle <img src="http://a.jpg" title="a.jpg"> — keep only
                # the part before the closing quote.
                imgUrl_tmp_list = img_url_tmp.split('"')
                img_url_tmp = imgUrl_tmp_list[0]
                # Re-add the scheme.
                imgUrl = 'http:' + img_url_tmp
                list_name = imgUrl.split('/')
                file_name = list_name[len(list_name)-1]
                # Local storage path for the downloaded image.
                file_path = contentSettings.IMAGES_STORE + file_name
                # Download, then upload to our own image service.
                urllib.request.urlretrieve(imgUrl, file_path)
                upload_img_result_json = uploadImage(file_path, 'image/jpeg', DgPipeline.user_id)
                # Server-side url, width and height of the uploaded image.
                img_u = upload_img_result_json['result']['image_url']
                img_w = upload_img_result_json['result']['w']
                img_h = upload_img_result_json['result']['h']
                img_upload_flag = str(img_u)+';'+str(img_w)+';'+str(img_h)
                # Replace the original tag with the marker.
                text_temp = text_temp.replace(img, '[dgimg]' + img_upload_flag + '[/dgimg]')

            # Normalise the body HTML the same way as the title (see above
            # for why stripped_strings is preferred over .prettify()).
            soup = BeautifulSoup(text_temp, "lxml")
            text = ''
            for string in soup.stripped_strings:
                text += string + '\n'
            # Replace ASCII double quotes with full-width ones to keep the
            # later SQL statement valid; append to the accumulated text so
            # paginated articles concatenate page by page.
            DgPipeline.text = self.text + text.replace('"', '“')

        # ---- pipeline for the url spider -----------------------------------
        elif spider.name == urlSettings.SPIDER_NAME:
            db_object = dbhandle_online()
            cursor = db_object.cursor()
            for url in item['url']:
                linkmd5id = get_linkmd5id(url)
                spider_name = contentSettings.SPIDER_NAME
                site = urlSettings.DOMAIN
                gid = urlSettings.GROUP_ID
                module = urlSettings.MODULE
                status = '0'
                # Security fix: the url comes from crawled pages (untrusted
                # input); use DB-API parameter binding instead of string
                # interpolation to avoid SQL injection / syntax errors.
                sql_search = 'select md5_url from dg_spider.dg_spider_post where md5_url=%s'
                sql = ('insert into dg_spider.dg_spider_post'
                       '(md5_url, url, spider_name, site, gid, module, status) '
                       'values(%s, %s, %s, %s, %s, %s, %s)')
                try:
                    # Insert only when the md5 of the url is not present yet.
                    cursor.execute(sql_search, (linkmd5id,))
                    result_search = cursor.fetchone()
                    if result_search is None or result_search[0].strip() == '':
                        cursor.execute(sql, (linkmd5id, url, spider_name,
                                             site, gid, module, status))
                        db_object.commit()
                except Exception as e:
                    print(">>> catch exception !")
                    print(e)
                    db_object.rollback()
        return item

    def open_spider(self, spider):
        """Called when the spider is opened; nothing to set up."""
        pass

    def close_spider(self, spider):
        """Called when the spider closes; flush accumulated content to the DB."""
        if spider.name == contentSettings.SPIDER_NAME:
            # Persist the accumulated article.
            url = DgPipeline.url
            title = DgPipeline.title
            content = DgPipeline.text
            user_id = DgPipeline.user_id
            dbhandle_insert_content(url, title, content, user_id, DgPipeline.has_img)
            # Post-process / set status / upload to dgCommunity.dg_post,
            # but only when the article actually contained an image.
            if DgPipeline.has_img == 1:
                if title.strip() != '' and content.strip() != '':
                    spider.logger.info('has_img=1,title and content is not null! Uploading post into db...')
                    post_handel(url)
                else:
                    spider.logger.info('has_img=1,but title or content is null! ready to exit...')
            else:
                spider.logger.info('has_img=0, changing status and ready to exit...')
        elif spider.name == urlSettings.SPIDER_NAME:
            pass