forked from SMART-TTS/SMART-G2P
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
180 lines (151 loc) · 5.61 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import glob
### Build the transliteration dictionary from the source data files
def read_data(filename):
    """Read a tab-separated text file into a list of rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of fields obtained by splitting that line on tab characters.
    """
    # The source files are presumably UTF-8 (Korean transliteration data);
    # decode explicitly rather than depending on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        return [line.split('\t') for line in f.read().splitlines()]
# One entry per file found under transliteration/data/source: the rows of
# that file with its first three lines dropped (presumably header lines).
data = [
    read_data(name)[3:]
    for name in glob.glob('transliteration/data/source/*')
]
def dataset():
    """Return the rows of every loaded source file as one flat list.

    Returns:
        A new list holding the entries of each element of the module-level
        ``data``, concatenated in order.
    """
    return [row for rows in data for row in rows]
### Punctuations and symbols
puncs = ['?', '!', '.', ',', '~']

# Each row is (symbol, Korean-word reading, Hangul spelling of the English
# name). The three parallel lists are unpacked from the rows so that index i
# always refers to the same symbol in all of them.
symbols, sym_han, sym_pro = map(list, zip(
    ('@', '골뱅이', '앳'),
    ('#', '샵', '넘버'),
    ('*', '별표', '스타'),
    ('(', '괄호열고', '괄호열고'),
    (')', '괄호닫고', '괄호닫고'),
    ('+', '더하기', '플러스'),
    ('-', '다시', '대쉬'),
    (';', '세미콜론', '세미콜론'),
    (':', '땡땡', '콜론'),
    ('/', '짝대기', '슬래쉬'),
    ('=', '는', '이퀄스'),
    ('&', '그리고', '앤드'),
    ('_', '밑줄', '언더바'),
    ("'", '따옴표', '어퍼스트로피'),
    ('"', '쌍따옴표', '쌍따옴표'),
))

# Each row is (unit symbol, Hangul reading); unpacked the same way.
count_symbols, count_sym_han = map(list, zip(
    ('$', '달러'),
    ('₩', '원'),
    ('£', '파운드'),
    ('¥', '엔'),
    ('€', '유로'),
    ('℃', '도씨'),
    ('%', '퍼센트'),
))
# Feature extraction intended for training (not used at the moment)
import numpy as np
import hgtk
import han2one_rev
from han2one_rev import shin_onehot
# Module-level aliases for two attributes of han2one_rev.
alp, uniquealp = han2one_rev.alp, han2one_rev.uniquealp
def featurize_shin(corpus,maxlen):
    """Encode the trailing characters of each corpus entry, right-aligned.

    For every entry of ``corpus`` the last ``maxlen`` characters are visited
    from the end. Each character accepted by ``hgtk.checker.is_hangul``
    fills a three-row slot with the transposed ``shin_onehot`` encoding; the
    last character occupies the last three rows, the one before it the
    three rows above, and so on. Slots of other characters and unused
    leading slots stay zero.

    The entry index is printed every 1000 entries as a progress indicator.

    Args:
        corpus: Indexable collection of strings.
        maxlen: Maximum number of trailing characters encoded per entry.

    Returns:
        A NumPy array of shape ``(len(corpus), maxlen * 3, len(alp))``.
    """
    width = maxlen * 3
    features = np.zeros((len(corpus), width, len(alp)))
    for idx in range(len(corpus)):
        if idx % 1000 == 0:
            print(idx)
        for offset, char in enumerate(reversed(corpus[idx])):
            if offset >= maxlen:
                # Everything further from the end does not fit in the array.
                break
            if not hgtk.checker.is_hangul(char):
                continue
            # Rows [stop - 3, stop) belong to the character `offset` places
            # from the end of the entry.
            stop = width - 3 * offset
            # presumably shin_onehot returns a (len(alp), 3) array - TODO confirm
            features[idx, stop - 3:stop, :] = np.transpose(shin_onehot(char))
    return features
# rec_shin = featurize_shin(dataset,30)
### Utils on English reading
import string
import re
# Letter -> index lookup tables: lowercase ASCII letters map to 0-25 and
# uppercase ASCII letters to 26-51.
small = {letter: index for index, letter in enumerate(string.ascii_lowercase)}
big = {letter: index + 26 for index, letter in enumerate(string.ascii_uppercase)}
def real_latin(term):
    """Tell whether ``term`` is Latin-1 text that starts with an ASCII letter.

    Args:
        term: String to examine.

    Returns:
        True when ``term`` is non-empty, ``hgtk.checker.is_latin1`` accepts
        it, and its first character is a key of ``small`` or ``big``;
        False otherwise.
    """
    # Guard the term[0] lookup below: an empty term has no first character
    # and would otherwise raise IndexError.
    if not term:
        return False
    first = term[0]
    return bool(hgtk.checker.is_latin1(term)) and (first in small or first in big)
# Lowercase vowels; decide_acronym treats a term without any of them as an
# acronym.
vowels = ['a', 'e', 'i', 'o', 'u']

# Hangul names of the 26 English letters in a-z order, indexed by the values
# of `small`.
alpha = [
    '에이', '비', '씨', '디', '이', '에프', '쥐',
    '에이치', '아이', '제이', '케이', '엘', '엠', '엔',
    '오', '피', '큐', '알', '에스', '티', '유',
    '브이', '더블유', '엑스', '와이', '지',
]
def decide_acronym(term):
    """Heuristically decide whether ``term`` looks like an acronym.

    Args:
        term: String to examine.

    Returns:
        True when ``term`` is a single character, when every character is a
        key of ``big``, or when no character lowercases to an entry of
        ``vowels``; False otherwise.
    """
    if len(term) == 1:
        return True
    if all(char in big for char in term):
        return True
    return not any(char.lower() in vowels for char in term)
def read_acronym(term):
    """Spell ``term`` out letter by letter using the Hangul letter names.

    Each character is lowercased, mapped to its index through ``small`` and
    replaced by the matching entry of ``alpha``.

    Args:
        term: String to spell out.

    Returns:
        The concatenated letter names.

    Raises:
        KeyError: If a lowercased character is not a key of ``small``.
    """
    letter_names = (alpha[small[char.lower()]] for char in term)
    return ''.join(letter_names)
### Utils on number reading in Korean and English
# Native Korean numerals for the ones digit, standalone form (index = digit;
# index 0 is empty).
kor_num0 = [
    '', '하나', '둘', '셋', '넷',
    '다섯', '여섯', '일곱', '여덟', '아홉',
]
# Native Korean tens words: index 0 is 10, index 8 is 90.
kor_num1 = [
    '열', '스물', '서른', '마흔', '쉰',
    '예순', '일흔', '여든', '아흔',
]
# Ones digit in the form used by makeCountKor (index = digit; index 0 is
# empty).
kor_cnt0 = [
    '', '한', '두', '세', '네',
    '다섯', '여섯', '일곱', '여덟', '아홉',
]
# Nouns for which readNumberKor uses makeCountKor when the number is below 100.
count_noun = ['개', '번째', '번', '살', '시', '걸음']
# Nouns checked separately by readNumberKor.
bbong_noun = ['이상', '이하', '초과', '미만']
def makeCountKor(n):
    """Read ``n`` (0-99) with native Korean numerals in their counting form.

    The counting form is the one placed directly before a counter noun,
    e.g. 3 -> '세' and 21 -> '스물한'.

    Args:
        n: Integer from 0 to 99.

    Returns:
        The Hangul reading; '영' for 0.

    Raises:
        IndexError: If ``n`` is 100 or more.
    """
    tens, ones = divmod(n, 10)
    if tens < 1:
        return kor_cnt0[ones] if ones else '영'
    if tens == 2 and ones == 0:
        # Before a counter noun, 20 is read '스무' (스무 개, 스무 살), not
        # '스물'; 21-29 keep '스물' (스물한, 스물두, ...).
        return '스무'
    return kor_num1[tens - 1] + kor_cnt0[ones]
def makeBbongKor(n):
    """Read ``n`` (0-99) with native Korean numerals in their standalone form.

    Uses ``kor_num0`` for the ones digit (하나, 둘, ...) and ``kor_num1``
    for the tens. Not called anywhere in the visible part of this file.

    Args:
        n: Number from 0 to 99.

    Returns:
        The Hangul reading; '영' for 0.
    """
    tens = int(np.floor(n / 10))
    ones = int(n - 10 * tens)
    if tens >= 1:
        return kor_num1[tens - 1] + kor_num0[ones]
    if ones == 0:
        return '영'
    return kor_num0[ones]
def readNumberKor(n,meta):
    """Read the number ``n`` in Hangul, picking the numeral system from ``meta``.

    Args:
        n: Number to read.
        meta: Word compared against ``count_noun`` (presumably the noun that
            follows the number in the text).

    Returns:
        ``makeCountKor(n)`` when ``meta`` is in ``count_noun`` and ``n`` is
        below 100; ``readNumber(n)`` in every other case.
    """
    wants_native_count = meta in count_noun and n < 100
    if wants_native_count:
        return makeCountKor(n)
    # Nouns in bbong_noun were once meant to go through makeBbongKor; they
    # currently get the same readNumber reading as everything else.
    return readNumber(n)
# English ones-digit words in Hangul (index = digit; index 0 is empty).
eng_num0 = [
    '', '원', '투', '쓰리', '포',
    '파이브', '식스', '세븐', '에잇', '나인',
]
# English tens words in Hangul: index 0 is ten, index 8 is ninety.
eng_num1 = [
    '텐', '트웬티', '써티', '포티', '피프티',
    '식스티', '세븐티', '에잇티', '나인티',
]
# English 11-19 in Hangul, indexed by the ones digit (index 0 is empty).
eng_numt = [
    '', '일레븐', '투웰브', '써틴', '포틴',
    '피프틴', '식스틴', '세븐틴', '에잇틴', '나인틴',
]
# Digit-by-digit reading in Hangul (index = digit; 0 is read '오').
eng_read = [
    '오', '원', '투', '쓰리', '포',
    '파이브', '식스', '세븐', '에잇', '나인',
]
def readNumberEng(n):
    """Read ``n`` as English number words written in Hangul.

    * below 10: the ones word from ``eng_num0``
    * 10: '텐'; 11-19: the entry of ``eng_numt``
    * 20-99: tens word from ``eng_num1`` followed by the ones word
    * 100 and above: digit by digit using ``eng_read``

    NOTE(review): 0 yields an empty string because ``eng_num0[0]`` is ''.
    Confirm that this is intended.

    Args:
        n: Non-negative number to read.

    Returns:
        The Hangul reading as a string.
    """
    tens = int(np.floor(n / 10))
    ones = int(n - 10 * tens)
    if tens >= 10:
        return ''.join(eng_read[int(digit)] for digit in str(n))
    if tens < 1:
        return eng_num0[ones]
    if tens < 2:
        # The teens have their own words; ten itself is the first tens word.
        return eng_num1[0] if ones == 0 else eng_numt[ones]
    return eng_num1[tens - 1] + eng_num0[ones]
## Refer to: https://soooprmx.tistory.com/entry/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%88%AB%EC%9E%90%EB%A5%BC-%ED%95%9C%EA%B8%80%EB%A1%9C-%EC%9D%BD%EB%8A%94-%ED%95%A8%EC%88%98
def readNumber(n):
    """Read an integer below 100000 with Sino-Korean numerals.

    A leading '일' is omitted before a unit (10 -> '십', 100 -> '백',
    10000 -> '만'), but a 1 in the ones place is read out
    (1 -> '일', 11 -> '십일').

    Args:
        n: Integer to read.

    Returns:
        The Hangul reading; an empty string when ``n`` is zero or negative.

    Raises:
        IndexError: If ``n`` is 100000 or more (no unit beyond '만' is
            available here; ``readBigNum`` handles larger values).
    """
    units = ['', '십', '백', '천', '만']
    nums = '일이삼사오육칠팔구'
    pieces = []
    place = 0
    while n > 0:
        n, digit = divmod(n, 10)
        if digit > 0:
            # Drop '일' only in front of a unit; in the ones place there is
            # no unit, so leaving it out would lose the digit entirely.
            numeral = nums[digit - 1] if (digit > 1 or place == 0) else ''
            pieces.append(numeral + units[place])
        place += 1
    return ''.join(reversed(pieces))
def readBigNum(n):
    """Read a large integer in Hangul, four digits at a time.

    ``n`` is split into groups of four decimal digits starting from the
    low end. Each non-zero group is read with ``readNumber`` and followed
    by its unit (none, 만, 억, 조, ...); groups equal to zero are skipped.

    Args:
        n: Integer to read.

    Returns:
        The Hangul reading; an empty string when ``n`` is zero or negative.

    Raises:
        IndexError: If ``n`` has more four-digit groups than there are
            units (13 groups, i.e. up to 52 digits).
    """
    units = [''] + list('만억조경해자양구간정재극')
    groups = []
    power = 0
    while n > 0:
        n, group = divmod(n, 10000)
        if group > 0:
            groups.append(readNumber(group) + units[power])
        power += 1
    return ''.join(reversed(groups))
def readOnlyNum(n):
    """Read ``n`` digit by digit with Sino-Korean digit names.

    Every character of ``str(n)`` is converted with ``int`` and replaced by
    its digit name, so 1203 becomes '일이영삼'.

    Args:
        n: Integer, or string of digits, to read.

    Returns:
        The concatenated digit names.
    """
    digit_names = '영일이삼사오육칠팔구'
    return ''.join(digit_names[int(digit)] for digit in str(n))