#!/usr/bin/env python3
"""
A simple web crawler using the requests and BeautifulSoup libraries.
This script pulls top posts from the Dribbble site and sends them to my Slack
channel through Slack's incoming-webhook API.
This project includes a Makefile to make running easy, but requires a unique
Slack webhook URL to be supplied.
Use 'make help' for more information on the classes and their functions.
"""
import os
import sys
from datetime import datetime

import requests
import simplejson as json
from bs4 import BeautifulSoup

class Post:
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

    def __str__(self):
        return '\n'.join('{}={}'.format(k, v) for k, v in vars(self).items())

    def format_slack(self):
        # Serialize with json.dumps so quotes and other special characters
        # in titles and author names are escaped correctly
        attachment = 'Author: {}\nLikes: {}\nComments: {}\nhttps://dribbble.com{}'.format(
            self.author, self.likes, self.comments, self.link)
        return json.dumps({
            'text': str(self.title),
            'attachments': [{'text': attachment, 'image_url': str(self.url)}],
        })

class DribbbleCrawler:
    def send(self, post, hook):
        """
        Send the Post object to Slack's incoming-webhook API.
        Args:
            post: Post object containing all the attributes of the Dribbble post
            hook: Specifies the Slack webhook URL for the API to send the post to
        """
        # Put together the POST payload; encode as UTF-8 to preserve emoji
        body = post.format_slack()
        # Send the message to Slack using the webhook set up at <workspace>.slack.com
        requests.post(hook, headers={'Content-Type': 'application/json'},
                      data=body.encode('utf-8'))
        # Record the post so it is not sent again on a later run
        write_to_file(post)
    def start(self, url, hook):
        """
        Performs the actual crawling of the Dribbble top-posts page.
        Args:
            url: Specifies the URL to crawl
            hook: Specifies the Slack webhook URL for the API to send the post to
        """
        # Get the source code, either from a request of the page or a saved HTML file
        page_source = get_source(url)
        # page_source = get_HTML('pageNew.html')
        # Use BeautifulSoup to parse the page
        soup = BeautifulSoup(page_source, 'html.parser')
        # Select the list of shot thumbnails
        items = soup.select('li.shot-thumbnail')
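        # Illustrative (assumed) markup for one thumbnail, showing the
        # selectors used below; the real Dribbble markup may differ:
        #   <li id="screenshot-12345" class="shot-thumbnail">
        #     <a href="/shots/12345-title"><img src="...shot.png?compress=1"></a>
        #     <div class="shot-title">Some Title</div>
        #     <span class="display-name">Jane Doe</span>
        #     <span class="js-shot-likes-count">120</span>
        #     <span class="js-shot-comments-count">8</span>
        #   </li>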
        # Helper to fetch and sanitize an element's text by CSS selector
        def select_and_clean(ele, selector):
            selection = ele.select_one(selector)
            return selection.get_text().strip() if selection else None

        # Iterate through each of the posts
        for li in items:
            # Get the post ID
            post_id = li.get('id')
            # Check the 'recent_posts.txt' history; if this id has already
            # been posted, skip to the next post
            if not post_id or match_recent(post_id):
                continue
            # Get the post title
            post_title = select_and_clean(li, '.shot-title')
            # Get the author
            post_author = select_and_clean(li, '.display-name')
            # Get the image source, with any query string stripped
            post_srcset = li.img.get('src', '').split('?', 1)[0] if li.img else None
            # Get the latter half of the link (a working link also needs the
            # dribbble.com portion, which is added in format_slack())
            post_link = li.a.get('href') if li.a else None
            # Get the current number of likes on the post
            likes = select_and_clean(li, '.js-shot-likes-count')
            # Get the current number of comments on the post
            comments = select_and_clean(li, '.js-shot-comments-count')
            post = Post(
                id=post_id,
                title=post_title,
                author=post_author,
                url=post_srcset,
                link=post_link,
                likes=likes,
                comments=comments,
                date=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            )
            # Send the post to the Slack API and print its attributes to the
            # console, then stop: only one new post is sent per run
            self.send(post, hook)
            print(post)
            break

def get_HTML(file):
    """
    Retrieves the source code from a specified saved HTML file.
    Args:
        file: The specified HTML file to retrieve the source code from
    """
    with open(file, 'r') as f:
        return f.read()

def get_source(url):
    """
    Retrieves the source code from a specified URL using the requests library.
    Args:
        url: The specified URL to retrieve the source code from
    """
    # Mimic an AJAX request; this affects which markup Dribbble returns
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    return requests.get(url, headers=headers).text

def write_to_file(post):
    """
    Writes the new post to the recent_posts history file to prevent
    double-posting top posts.
    Args:
        post: The post object to store in history
    """
    # Get the current date so posts over two weeks old can be dropped
    # (prevents a massive backlog of posts)
    cur_date = datetime.now()
    with open('recent_posts.txt', 'r+') as f:
        # Read the current history in from the file and load it as a list
        history = json.loads(f.readline())
        # Append the new post
        history.append({'id': post.id, 'date': post.date})
        # Build a new list, ignoring old posts; this keeps the history
        # manageable: only the possible repeats matter, not every post ever sent
        new_list = []
        for item in history:
            new_date = datetime.strptime(item['date'], "%Y-%m-%d %H:%M:%S")
            if (cur_date - new_date).days < 14:
                new_list.append(item)
        # Overwrite the history file with the new list
        f.seek(0)
        f.write(json.dumps(new_list))
        f.truncate()

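# Illustrative contents of recent_posts.txt (hypothetical values):
#   [{"id": "screenshot-12345", "date": "2024-01-01 12:00:00"},
#    {"id": "screenshot-67890", "date": "2024-01-03 09:30:00"}]
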
def match_recent(post_id):
    """
    Reads the history file and attempts to match the given id.
    Args:
        post_id: The id of the possible new post, to ensure it hasn't already
            been sent to the Slack channel
    Returns:
        True if a match was made, False otherwise
    """
    # Ensure that the recent_posts file can be found
    if os.path.exists("recent_posts.txt"):
        # Open the file and read the existing JSON list
        with open('recent_posts.txt', 'r') as f:
            history = json.loads(f.readline())
        # Check every item for a matching id
        for item in history:
            if item['id'] == post_id:
                return True
        return False
    # If the file was not found, create it and retry
    else:
        with open('recent_posts.txt', 'w+') as f:
            f.write('[]')
        return match_recent(post_id)

if __name__ == "__main__":
    # Ensure we got enough arguments
    if len(sys.argv) < 2:
        print("Usage: python3 main.py <slack_webhook_url>")
        sys.exit(1)
    crawler = DribbbleCrawler()
    crawler.start('https://dribbble.com/shots?timeframe=week', sys.argv[1])