#!/usr/bin/env python
'''
This is a simple little script that tries to grab every URL from all the SecurityNow episodes and display them in a simple list.
The reason for its existence is for all of us who listen but never get a chance to write down the links.
It uses the transcript provided at http://www.grc.com/sn/sn-123.txt, where 123 is the episode number.
This script works without any database, just a dump of text files.
'''
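# A sketch of a typical run (paths and episode number below are illustrative).
# The script fetches any transcripts it does not have yet into sn-txt/, then
# regenerates links.md next to this file:
#
#   $ python handler.py
#   Trying to grab http://www.grc.com/sn/sn-124.txt
#   Writing episode to /path/to/sn-txt/sn-124.txt
#   ...
#   writing to /path/to/links.md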
import time
import urllib2
import os
import re
import glob
MYDIR = os.path.abspath(os.path.dirname(__file__))
class SN(object):
    txt_url = 'http://www.grc.com/sn/sn-%.3d.txt'
    dump_folder = '%s/sn-txt' % MYDIR
    markup = ''  # We will store all our markup here..
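    # The %.3d in txt_url zero-pads the episode number, e.g.
    # self.txt_url % 5 == 'http://www.grc.com/sn/sn-005.txt'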
    def dumper(self):
        # Make sure the dump folder exists before we try to write to it
        if not os.path.isdir(self.dump_folder):
            os.makedirs(self.dump_folder)
        # Find the last episode number we already have, and make the next one our start.
        # We look at the basename only, so digits in the directory path don't confuse us.
        try:
            last_file = os.path.basename(sorted(glob.glob('%s/sn-*.txt' % self.dump_folder))[-1])
            num = int(re.findall(r'[0-9]+', last_file)[0]) + 1
        except IndexError:
            # Probably an empty folder, start at episode 1
            num = 1
        while True:
            file_path = '%s/sn-%.3d.txt' % (self.dump_folder, num)
            cur_url = self.txt_url % num
            if os.path.isfile(file_path):
                num += 1
                continue
            print 'Trying to grab', cur_url
            try:
                text = urllib2.urlopen(cur_url).read()
            except urllib2.HTTPError:
                # Hopefully a 404, which means we are past the last published episode
                print 'HTTP error, aborting. Probably a 404, meaning we are at the last episode.'
                break
            with open(file_path, 'w') as fp:
                print 'Writing episode to %s' % file_path
                fp.write(text)
            time.sleep(1)  # Be nice to grc.com between requests
            num += 1
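    # After dumper() finishes, the dump folder should look something like this
    # (episode numbers are zero-padded to three digits by the %.3d format):
    #
    #   sn-txt/sn-001.txt
    #   sn-txt/sn-002.txt
    #   sn-txt/sn-003.txt
    #   ...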
    def _get_links(self, raw_txt, episode_num):
        all_links = []
        #all_links += re.findall(r'bit.ly/[a-zA-Z0-9-]+', raw_txt)  # Covered by the rule below
        all_links += re.findall(r'[a-zA-Z0-9-]{3,30}\.[a-zA-Z]{2,5}/[^ \n\r]+', raw_txt)
        final_list = []
        for link in all_links:
            # Lowercase the domain name part
            parts = link.split('/')
            parts[0] = parts[0].lower()
            link = '/'.join(parts)
            link = re.sub(r'[\.,";)]+$', '', link)  # Strip some trailing punctuation
            link = re.sub(r'[\[\]]', '', link)  # Strip [ and ] characters, which are sometimes put around links in the text
            if link in [
                'creativecommons.org/licenses/by-nc-sa/2.5/', 'grc.com/securitynow.htm', 'grc.com/feedback', 'twit.tv/sn',
                'grc.com/sn/SN-%s.mp3' % str(episode_num), 'grc.com/sn', 'grc.com/sn/feedback', 'twitter.com/SGgrc',
            ]:
                # Some links are too generic, skip them..
                continue
            final_list.append(link)
        return list(set(final_list))
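    # A rough illustration of what _get_links() returns for a made-up transcript
    # snippet (the input is invented, and because of the set() above the order of
    # the result is not guaranteed):
    #
    #   >>> SN()._get_links('See [GRC.com/sn/notes.htm], or Bit.ly/abc.', 1)
    #   ['grc.com/sn/notes.htm', 'bit.ly/abc']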
    def _create_list(self, episode_num, links):
        if not links:
            return None
        cur_episode_markup = '### Episode %s\n\n' % str(episode_num)
        links = ['* [%s](http://%s)' % (l, l) for l in links]
        cur_episode_markup += '\n'.join(links)
        self.markup += '%s\n\n' % cur_episode_markup
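    # Each call to _create_list() appends a chunk of markdown like this to
    # self.markup (episode number and links here are just examples):
    #
    #   ### Episode 123
    #
    #   * [grc.com/sn/notes.htm](http://grc.com/sn/notes.htm)
    #   * [bit.ly/abc](http://bit.ly/abc)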
    def write_markdown(self):
        links_md = '%s/links.md' % MYDIR
        gen_info = '## SecurityNow links (newest on top)\n\n'
        with open(links_md, 'w') as fp:
            print 'writing to', links_md
            fp.write(gen_info + self.markup)
    def handler(self):
        # Loop through the dumped episodes, newest first. The episode number is part
        # of the filename, so we sort on the digits in the basename (a little hackish).
        for txt in sorted(glob.glob('%s/sn-*.txt' % self.dump_folder), reverse=True, key=lambda x: int(re.sub(r'[^0-9]', '', os.path.basename(x)))):
            episode_num = re.findall(r'[0-9]+', txt.split('/')[-1])[0]
            with open(txt, 'r') as fp:
                raw_txt = fp.read()
            links = self._get_links(raw_txt, episode_num)
            self._create_list(episode_num, links)
if __name__ == '__main__':
    sn = SN()
    sn.dumper()
    sn.handler()
    sn.write_markdown()