-
Notifications
You must be signed in to change notification settings - Fork 0
/
ccedict.py
75 lines (54 loc) · 2.15 KB
/
ccedict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
__author__ = 'Michael'
import codecs
import logging
class Ccedict(object):
    """In-memory lookup table built from a CEDICT-style dictionary file.

    Each usable line of the file is expected to look like::

        <headword> [<headword> ...] [<pinyin>] /<definition>/<definition>/

    Parsed entries are stored in ``self.words``, keyed by the last headword
    on the line (the simplified form when two headwords are given).
    """

    @staticmethod
    def splitLine(line):
        """Parse one dictionary line into an entry dict.

        Returns a dict with the keys ``'id'`` (``"<hanzi>[<pinyin>]"``),
        ``'hanzi'``, ``'pinyin'`` and ``'definition'`` (a list of strings),
        or ``None`` when the line is a ``#`` comment or is malformed: no
        headword, no ``[...]`` pinyin section, or no ``/.../`` definition
        section after the pinyin.
        """
        if line.startswith("#"):
            return None

        pinyin_start = line.find("[")
        if pinyin_start == -1:
            return None
        pinyin_end = line.find("]", pinyin_start)
        if pinyin_end == -1:
            return None

        def_index = line.find("/", pinyin_end)
        if def_index == -1:
            return None
        # Everything after the final "/" (newline, stray characters) is
        # dropped.  rfind cannot fail here because def_index already matched.
        def_last_index = line.rfind("/")
        if def_last_index == def_index:
            # Only a single "/" is present: there is no closed definition.
            return None

        # The last space-separated headword is used; when two are given the
        # simplified form is the second one.
        hanzi = line[:pinyin_start].strip().split(" ")[-1]
        if not hanzi:
            return None
        pinyin = line[pinyin_start + 1:pinyin_end]

        return {
            'id': "{0}[{1}]".format(hanzi, pinyin),
            'hanzi': hanzi,
            'pinyin': pinyin,
            'definition': line[def_index + 1:def_last_index].split("/"),
        }

    def __init__(self, file_name):
        """Constructs a Ccedict object from the given dictionary file.

        The file is read as UTF-8.  Lines that ``splitLine`` rejects are
        skipped.  Entries are keyed by ``hanzi`` only, so a later line with
        the same headword replaces an earlier one.
        """
        self.words = {}
        with codecs.open(file_name, "r", "utf-8") as dict_file:
            # Stream the file line by line instead of loading it whole.
            for line in dict_file:
                word = Ccedict.splitLine(line)
                if word is not None:
                    self.words[word["hanzi"]] = word
        logging.info("Loaded cedict ({0} words)".format(len(self.words)))
# Cheap unit tests
if __name__ == "__main__":
    # Failures raise AssertionError: raising a bare string is itself a
    # TypeError, which would hide the intended message.

    # A line with no pinyin or definition section must be rejected.
    entry = Ccedict.splitLine(u"\u4fdd\u62a4")
    if entry is not None:
        raise AssertionError("Entry should be none")

    # Two headwords: the second one is used for the entry.
    entry = Ccedict.splitLine(u"\u4fdd\u62a4 \u4fdd\u62a5 [pei2gen1] /bacon/")
    if entry is None:
        raise AssertionError("Entry is none")
    if entry["id"] != u"\u4fdd\u62a5[pei2gen1]":
        raise AssertionError("Entry ID is wrong")
    if entry["definition"][0] != "bacon":
        raise AssertionError("Definition is wrong")

    # A single headword is used as-is.
    entry = Ccedict.splitLine(u"\u4fdd\u62a4 [pei2gen1] /bacon/")
    if entry is None:
        raise AssertionError("Entry is none")
    if entry["id"] != u"\u4fdd\u62a4[pei2gen1]":
        raise AssertionError("Entry ID is wrong")
    if entry["definition"][0] != "bacon":
        raise AssertionError("Definition is wrong")