-
Notifications
You must be signed in to change notification settings - Fork 0
/
MaruGet.py
153 lines (129 loc) · 4.23 KB
/
MaruGet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python
import urllib2
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import NavigableString
from BeautifulSoup import Tag
import re
class MaruBlog(object):
    """
    MaruBlog is a simple little class that gets a maru blog post.

    Constructing an instance downloads the requested entry (or the blog
    front page when no entry number is given), decodes it to Unicode,
    parses it with BeautifulSoup and extracts the entry body and title.
    If the download fails with an HTTP error the instance is still
    created, with entryBodyHtml and entryTitle left as None.
    """
    # __ENCODING is a private variable that stores the expected
    # encoding. This should be had programmatically, but meh.
    __ENCODING = 'euc-jp'
    # __ENTRY_URL_RE matches a permalink to a blog entry and captures the
    # entry number, whatever its number of digits.
    __ENTRY_URL_RE = re.compile(
        r"http://sisinmaru\.blog17\.fc2\.com/blog-entry-([0-9]+)\.html")
    # entryNumber is the blog entry number. None to get most recent.
    entryNumber = int()
    # maruUrl is the url path to a maru blog post
    maruUrl = "http://sisinmaru.blog17.fc2.com/blog-entry-^ENTRYNUM^.html"
    # maruUrlNone is the blog front page, used when no entry number is given.
    maruUrlNone = "http://sisinmaru.blog17.fc2.com/"
    # urlFile is the result from urllib2.urlopen(), which is a file-like
    # object, or None if the request failed with an HTTP error.
    urlFile = None
    # wholeBody is the body of the blog post, in Unicode
    wholeBody = None
    # entryBodyHtml is the content part of the blog post - that is, pictures
    # and text (complete with markup)
    entryBodyHtml = None
    # entryTitle is the title of the blog post
    entryTitle = None
    # soup is the BeautifulSoup object created out of the post.
    soup = None

    def __init__(self, entryNumber=None):
        """
        Fetches and parses a blog post.

        entryNumber is the number of the entry to fetch; None fetches
        the blog front page instead.
        """
        self.entryNumber = entryNumber
        self.__constructMaruUrl()
        self.__getUrl()
        self.__readBody()
        self.soup = BeautifulSoup(self.wholeBody)
        self.__findContent()
        self.__setTitle()

    def __constructMaruUrl(self):
        """
        Sets self.maruUrl to the address to download: the front page when
        entryNumber is None, otherwise the entry's own page.
        """
        if self.entryNumber is None:
            self.maruUrl = self.maruUrlNone
        else:
            self.maruUrl = self.maruUrl.replace('^ENTRYNUM^',
                                                str(self.entryNumber))

    def __getUrl(self):
        """
        Gets the url and stores the urllib2 response in self.urlFile.
        On an HTTP error self.urlFile is set to None instead.
        """
        try:
            self.urlFile = urllib2.urlopen(self.maruUrl)
        except urllib2.HTTPError:
            # HTTPError lives in the urllib2 namespace; the bare name is
            # not imported by this module.
            self.urlFile = None

    def __readBody(self):
        """
        Takes the entire body, converts it to Unicode, and
        stores it in self.wholeBody. If the download failed
        (self.urlFile is None) the body is the empty string.
        """
        if self.urlFile is None:
            # Nothing was downloaded; an empty document lets the rest of
            # the constructor run and find no content.
            self.wholeBody = u""
            return
        try:
            body = self.urlFile.read()
        finally:
            # The response is fully consumed here, so release it.
            self.urlFile.close()
        self.wholeBody = body.decode(self.__ENCODING)

    def __findContent(self):
        """
        Finds the exciting content in the blog post: the first div with
        class 'entry_body', or None if there is none.
        """
        self.entryBodyHtml = self.soup.find('div', {'class': 'entry_body'})

    def __setTitle(self):
        """
        Sets the title of the blog post from the link inside the first
        div with class 'entry_title', or None if it cannot be found.
        """
        try:
            self.entryTitle = self.soup.find(
                'div', {'class': 'entry_title'}).a.getText()
        except AttributeError:
            # Either the title div or the link inside it is missing.
            self.entryTitle = None

    def ircContent(self):
        """
        Returns a string of the content, properly formatted for posting
        in an IRC channel. Returns u"No blog post found." when the post
        has no entry body.
        """
        # The blog posts are messes of <br /> tags, so the entry body is
        # walked child by child rather than handled as flat text.
        # There are a few tags that seem to be used in fc2 blogs, at least
        # this one in particular.
        # * br tags are abused for formatting
        # * img tags contain pictures
        #   and are stored within a tags.
        # * Text is just placed outside of everything.
        #   Text is of type BeautifulSoup.NavigableString.
        if self.entryBodyHtml is None:
            return u"No blog post found."
        ircString = u""
        for item in self.entryBodyHtml:
            if isinstance(item, Tag):
                linkText = item.getText()
                if linkText != '':
                    # Some link text: keep it on the same line as the
                    # text that precedes it.
                    ircString = ircString.rstrip('\n')
                    ircString += linkText + u" "
                    href = item.get("href")
                    # Tags with text but no href (anything other than a
                    # link) contribute their text only.
                    if href is not None:
                        ircString += u"< " + href + u" >"
                image = item.find('img')
                if image is not None:
                    # Image.
                    ircString += image['src'] + u"\n"
                else:
                    embed = item.find('embed')
                    if embed is not None:
                        # Youtube video.
                        ircString += embed['src'] + u"\n"
            elif type(item) is NavigableString:
                # Text. The exact type test is deliberate: it leaves out
                # NavigableString subclasses such as comments.
                ircString += item + u"\n"
        return ircString

    def latestPost(self):
        """
        Returns the integer entry number taken from the first link on the
        page that points at a blog entry, or None if there is no such link.
        """
        for link in self.soup.findAll('a'):
            url = link.get("href")
            if url is None:
                # Anchor without an href; nothing to match against.
                continue
            found = self.__ENTRY_URL_RE.match(url)
            if found is not None:
                return int(found.group(1))
        return None
# Yes, I know this is goofy, I'm too lazy to fix it.
# vim: set shiftwidth=4 tabstop=8: