-
Notifications
You must be signed in to change notification settings - Fork 5
/
html_parser.py
52 lines (44 loc) · 1.52 KB
/
html_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from html.parser import HTMLParser
# Next free key in ``dict``; MyParser.handle_data advances it for each stored line.
index = 0
# Collected text lines keyed by their 0-based order of appearance.
# NOTE: the name shadows the builtin ``dict``; it is kept because the parser
# class and the printing loop in this file refer to it by this name.
dict = {}

# Read the whole transcript, closing the file handle as soon as reading is done.
with open('vvrs01-20130301 (20130316)-0416.trs', 'r', encoding='utf-8') as file:
    data = file.read().replace('\t', "")

# Drop newlines as well, so the parser is fed the markup as one flat line.
data2 = data.replace('\n', "")
class MyParser(HTMLParser):
    """Collect Chinese text lines from the fed markup.

    Text chunks are stored in the module-level ``dict`` under consecutive
    integer keys, with the module-level ``index`` holding the next free key.
    A chunk that starts with "-" or a space is treated as the continuation of
    the previously stored line and is appended to it.
    """

    def handle_data(self, data):
        """Store or append one chunk of text found between tags.

        Chunks shorter than two characters are ignored.
        """
        global index
        if len(data) < 2:
            return
        first, second = data[0], data[1]
        if self.is_cn_char(first) or (first == " " and self.is_cn_char(second)):
            # A line that starts with a Chinese character (optionally after a
            # single space) becomes a new entry.
            index = self.update_dict(index, dict, data)
        elif first in ("-", " "):
            # Otherwise a leading "-" or space marks a wrapped line: join it
            # directly onto the previous entry.
            self.append_dict(index, dict, data)

    def is_cn_char(self, i):
        """Return True if the single character ``i`` is in U+4E00..U+9FA5."""
        return 0x4e00 <= ord(i) < 0x9fa6

    def update_dict(self, index, dict, data):
        """Store ``data`` in ``dict`` under ``index`` and return ``index + 1``."""
        dict[index] = data
        return index + 1

    def append_dict(self, index, dict, data):
        """Append ``data`` to the entry stored just before ``index``.

        If there is no such entry (for example a continuation fragment that
        arrives before any line has been stored), nothing is changed; the
        previous implementation raised KeyError in that situation.
        """
        previous = index - 1
        if previous not in dict:
            return
        dict[previous] = dict[previous] + data
# Parse the flattened transcript and print every collected line in order.
# The ``strict`` keyword was removed from HTMLParser in Python 3.5, so the
# parser is created with its default (lenient) behaviour instead.
parser = MyParser()
parser.feed(data2)
# Flush any text still buffered after the last tag so it is not lost.
parser.close()
for key in sorted(dict):
    print(dict[key])