-
Notifications
You must be signed in to change notification settings - Fork 4
/
crawler.py
64 lines (56 loc) · 2.05 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import logging
import random
import sys
import time
import requests
import notify
from config import GROUP_LIST, HEADERS, REQUEST_INTERVAL
from parse import parse_list, parse_detail
def __get(url, timeout=30):
    """Fetch ``url`` and return its body, or ``None`` if the page is unavailable.

    :param url: address to request; sent with the configured ``HEADERS``
    :param timeout: seconds to wait for the server before giving up, so a
        stalled connection cannot block the crawler forever
    :return: the response text when ``response.ok`` is true; ``None`` when the
        status is 404 or the body contains the "no permission" notice
    :raises SystemExit: for any other failed response, after the failure has
        been logged and reported through ``notify.send_msg``
    :raises requests.exceptions.Timeout: when the server does not answer
        within ``timeout`` seconds
    """
    # requests applies no timeout by default; without one a dead connection
    # would hang this call indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    html = response.text
    if response.ok:
        return html

    # Missing or access-restricted pages are expected; skip them quietly.
    if response.status_code == 404:
        logging.warning(f'{url}不存在')
        return None
    if '你没有权限访问这个页面' in html:
        logging.warning(f'{url}无权访问')
        return None

    # Any other failure is treated as fatal: log it, notify, and stop.
    logging.error('request %s is error.\nstatus_code:%s,text:%s',
                  url, response.status_code, html)
    notify.send_msg(f'请求[{url}]({url})失败,状态码:{response.status_code},请修复后继续.')
    logging.info(GROUP_LIST)
    sys.exit()
def crawl_list(group_id, start_time, start=None):
    """Collect the posts of a group that are newer than ``start_time``.

    The first request goes to the group's home URL (when ``start`` is falsy);
    later requests go to the ``discussion`` listing with a ``start`` offset.
    After the home page the offset becomes 50, and it grows by 25 per page
    from then on. Paging stops as soon as a page is empty, cannot be fetched,
    or contains at least one post that is not newer than ``start_time``.

    :param group_id: group ID
    :param start_time: only posts whose ``time`` is greater than this are kept
    :param start: paging offset to begin from; falsy means the group home page
    :return: list of the posts kept, in the order the pages were fetched
    """
    base_url = f'https://www.douban.com/group/{group_id}/'
    collected = []

    # Iterate instead of recursing so a long backlog of pages cannot hit
    # Python's recursion limit.
    while True:
        # base_url already ends with '/', so append the path without another
        # slash (the URL previously came out as '.../<id>//discussion?...').
        url = f'{base_url}discussion?start={start}' if start else base_url

        html = __get(url)
        if not html:
            return collected

        post_list = parse_list(html)
        posts = [x for x in post_list if x['time'] > start_time]
        collected.extend(posts)

        if not post_list or len(posts) != len(post_list):
            # The page is empty, or it contains posts that are not newer than
            # start_time: stop fetching further pages.
            return collected

        # Pause a random whole number of seconds between page requests.
        time.sleep(random.randint(REQUEST_INTERVAL[0], REQUEST_INTERVAL[1]))
        start = start + 25 if start else 50
def crawl_detail(url, start_time):
    """Fetch and parse a single post, sending a notification if it qualifies.

    :param url: address of the post page
    :param start_time: forwarded to ``notify.meet_condition`` with the post
    :return: the parsed post with its ``url`` key set, or ``{}`` when the page
        could not be fetched
    """
    page = __get(url)
    if not page:
        return {}

    post = parse_detail(page)
    post['url'] = url

    # Nothing to announce unless the post passes the notification filter.
    if not notify.meet_condition(post, start_time):
        return post

    # One message line per field, joined with newlines.
    lines = [
        f'**标题**:[{post["title"]}]({url})',
        f'**租金**:{post["rent"]}',
        f'**发布时间**:{post["create_time"]}',
        f'**作者**:[{post["author"]["name"]}]({post["author"]["url"]})',
        f'**内容**:{post["content"]}',
    ]
    notify.send_msg('\n'.join(lines))
    return post