-
Notifications
You must be signed in to change notification settings - Fork 0
/
sample_html.py
40 lines (30 loc) · 1.03 KB
/
sample_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# -*- coding: utf-8 -*-
import contextlib
from html.parser import HTMLParser

# urllib2 exists only on Python 2; on Python 3 the same urlopen() API lives in
# urllib.request, so fall back to it under the name the script already uses.
try:
    import urllib2
except ImportError:
    import urllib.request as urllib2
class MyHTMLParser(HTMLParser):
    """HTML parser that records the tag names and comments it encounters.

    After calling ``feed()`` the results are available on the instance:

    * ``lsStartTags``    -- names of opening tags, in document order
    * ``lsEndTags``      -- names of closing tags, in document order
    * ``lsStartEndTags`` -- names of self-closing tags such as ``<br/>``
    * ``lsComments``     -- text of each ``<!-- ... -->`` comment
    """

    def __init__(self, *args, **kwargs):
        """Create a parser with its own, initially empty, result lists.

        Any arguments are passed through unchanged to ``HTMLParser``.
        """
        # Explicit base-class call (rather than super()) keeps this working
        # on both Python 2 and Python 3 variants of HTMLParser.
        HTMLParser.__init__(self, *args, **kwargs)
        # The lists live on the instance, not the class: as class attributes
        # they were shared, so every parser appended to the same four lists.
        self.lsStartTags = []
        self.lsEndTags = []
        self.lsStartEndTags = []
        self.lsComments = []

    # HTML parser callbacks
    def handle_starttag(self, startTag, attrs):
        """Record the name of an opening tag; its attributes are ignored."""
        self.lsStartTags.append(startTag)

    def handle_endtag(self, endTag):
        """Record the name of a closing tag."""
        self.lsEndTags.append(endTag)

    def handle_startendtag(self, startendTag, attrs):
        """Record the name of a self-closing tag; its attributes are ignored.

        Because this override does not call the start/end handlers, a
        self-closing tag appears only in ``lsStartEndTags``.
        """
        self.lsStartEndTags.append(startendTag)

    def handle_comment(self, data):
        """Record the text found between ``<!--`` and ``-->``."""
        self.lsComments.append(data)
#creating an object of the overridden class
parser = MyHTMLParser()
#Opening NYTimes site using urllib2. closing() makes sure the HTTP response
#is closed when the block exits; previously it was opened and never closed.
with contextlib.closing(
        urllib2.urlopen("https://www.nytimes.com/")) as html_page:
    #Feeding the content (still disabled). read() returns bytes, so decode
    #explicitly: on Python 3, str() of bytes yields the "b'...'" repr, not
    #the page text.
    #parser.feed(html_page.read().decode("utf-8", "replace"))
    pass  # nothing is read from the response while the feed step is disabled
#printing the extracted values (still disabled; the typographic quotes were
#replaced with plain ASCII quotes so these lines are valid if re-enabled)
#print("Start tags", parser.lsStartTags)
#print("End tags", parser.lsEndTags)
#print("Start End tags", parser.lsStartEndTags)
#print("Comments", parser.lsComments)