-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlparser.py
67 lines (54 loc) · 1.58 KB
/
htmlparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from html.parser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc
class HTMLTextParser(HTMLParser):
    """Collect the textual content of an HTML document as plain text.

    Usage: call ``feed()`` with the markup, then ``close()``, then read the
    result from ``text()``.

    Conversion rules implemented by the handlers below:

    * text nodes are stripped and internal runs of spaces, tabs, carriage
      returns and newlines are collapsed to a single space;
    * ``<p>`` emits a blank line, ``<br>`` emits a line break;
    * character data inside a ``<style>`` element is discarded.
    """

    def __init__(self):
        super().__init__()
        # Output fragments in document order; joined once in text().
        self.__text = []
        # True while character data must be ignored (inside <style>).
        self.__skip = False

    def handle_data(self, data):
        """Record a text node unless it belongs to a skipped element."""
        if self.__skip:
            return
        text = data.strip()
        if text:
            # Collapse whitespace runs so source line wrapping does not
            # leak into the output; the trailing space separates this
            # fragment from the next one.
            text = sub('[ \t\r\n]+', ' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        """Translate block/line-break tags and start skipping on <style>."""
        # Any new start tag ends a previous skip; this also recovers from
        # a <style> element that was never closed.
        self.__skip = False
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')
        elif tag == 'style':
            self.__skip = True

    def handle_endtag(self, tag):
        """Resume text collection once a <style> element is closed.

        Without this, text following ``</style>`` was dropped until the
        next start tag happened to reset the skip flag.
        """
        if tag == 'style':
            self.__skip = False

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags such as ``<br/>``."""
        # NOTE(review): a self-closing <br/> emits two newlines while a
        # plain <br> emits one; kept as-is because callers may rely on it.
        if tag == 'br':
            self.__text.append('\n\n')
        self.__skip = False

    def text(self):
        """Return the collected text with surrounding whitespace removed."""
        return ''.join(self.__text).strip()
import re
# Matches one complete <style>...</style> element, case-insensitively.
# Non-greedy so that content between two separate style blocks survives,
# and DOTALL so that style blocks spanning several lines are matched too.
REMOVE_STYLE_RE = re.compile(r'<style\b.*?</style\s*>',
                             re.IGNORECASE | re.DOTALL)
def get_text(html):
    """Convert HTML markup to plain text on a best-effort basis.

    Style elements matched by ``REMOVE_STYLE_RE`` are removed first, then
    the remaining markup is run through ``HTMLTextParser``.

    Args:
        html: The HTML markup to convert.

    Returns:
        The extracted plain text. If the conversion raises, the traceback
        is printed to standard error and ``html`` is returned unchanged.
    """
    try:
        parser = HTMLTextParser()
        parser.feed(REMOVE_STYLE_RE.sub('', html))
        parser.close()
        return parser.text()
    except Exception:
        # Deliberately best-effort: report the failure and hand the input
        # back. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        print_exc(file=stderr)
        return html
def main():
    """Print the plain-text rendering of a small built-in HTML sample."""
    sample_markup = r'''
<html>
<body>
<b>Project:</b> DeHTML<br>
<b>Description</b>:<br>
This small script is intended to allow conversion from HTML markup to
plain text.
</body>
</html>
'''
    plain = get_text(sample_markup)
    print(plain)
# Run the demonstration only when executed as a script, not when imported.
if __name__ == '__main__':
    main()