-
Notifications
You must be signed in to change notification settings - Fork 3
/
cedict_to_csv.py
28 lines (21 loc) · 928 Bytes
/
cedict_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import csv
import re
# Matches one CC-CEDICT entry line: "traditional simplified [pinyin] /def1/def2/".
# Group 4 is everything after the first "/" up to end of line, so it keeps the
# inner and trailing "/" separators between the individual glosses.
pattern = re.compile(r"^(.*?) (.*?) \[(.*?)\] /(.*?)$")


def entry_cost(surface):
    """Return the integer cost for *surface*.

    The cost is ``-400 * len(surface) ** 1.5`` truncated toward zero, and
    never lower than -36000, so longer surfaces get a lower cost.
    """
    return int(max(-36000, -400 * (len(surface) ** 1.5)))


def build_rows(lines):
    """Map each surface form found in *lines* to its CSV row.

    Lines starting with "#" and lines that do not match ``pattern`` are
    skipped. Every matching entry yields a row keyed by its traditional form
    and a row keyed by its simplified form; both rows use the cost computed
    from the traditional form. When a surface occurs more than once, the last
    entry wins (the key keeps the position of its first occurrence).

    Row columns: surface, left id, right id, cost, four "*" placeholders,
    pinyin, traditional, simplified, definition.
    """
    rows = {}
    for line in lines:
        line = line.strip()
        if line.startswith("#"):
            continue
        match = pattern.match(line)
        if not match:
            continue
        traditional, simplified, pinyin, definition = match.groups()
        cost = entry_cost(traditional)
        for surface in (traditional, simplified):
            rows[surface] = [
                surface, 0, 0, cost, "*", "*", "*", "*",
                pinyin, traditional, simplified, definition,
            ]
    return rows


def write_rows(rows, out):
    """Write the values of *rows* to the text stream *out* as CSV.

    Fields containing a comma or a double quote (common in definitions) are
    quoted by the csv module, so they no longer shift the columns the way
    plain comma-joining did. Each row is terminated by a single "\\n".
    """
    csv.writer(out, lineterminator="\n").writerows(rows.values())


def main(src="cedict_ts.u8", dst="cedict.csv"):
    """Convert the CC-CEDICT file *src* into the CSV file *dst*."""
    # Explicit UTF-8: the data is Chinese text, and the platform default
    # encoding is not UTF-8 everywhere.
    with open(src, encoding="utf-8") as f:
        rows = build_rows(f)
    # newline="" lets the csv writer control line endings itself.
    with open(dst, mode="w", encoding="utf-8", newline="") as f:
        write_rows(rows, f)


if __name__ == "__main__":
    main()