forked from gooofy/zamia-speech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspeech_transcripts.py
124 lines (91 loc) · 3.41 KB
/
speech_transcripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2016, 2017 Guenter Bartsch
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
# transcripts.csv i/o
#
# quality: 0=not reviewed, 1=poor, 2=fair, 3=good
import io

from nltools.tokenizer import tokenize
class Transcripts(object):
    """Dict-like access to ``data/src/speech/<lang>/transcripts.csv``.

    Each line of the file holds six ``;``-separated fields::

        cfn;dirfn;audiofn;prompt;ts;quality

    Records are kept in ``self.ts``, a dict keyed by ``cfn``. Every value is
    a dict with the keys ``cfn``, ``dirfn``, ``audiofn``, ``prompt``, ``ts``,
    ``quality`` (an int) and ``spk`` (the part of ``cfn`` before the first
    ``-``).

    The code is written to run unchanged on Python 2 and Python 3: files are
    opened through ``io.open`` with an explicit UTF-8 encoding, so all text
    handled here is unicode.
    """

    # Path of the transcripts file relative to the current working
    # directory; ``%s`` is replaced by the language code.
    _CSV_PATH = 'data/src/speech/%s/transcripts.csv'

    # Number of ';'-separated fields every record must have.
    _NUM_FIELDS = 6

    def __init__(self, lang='de'):
        """Load all transcript records for *lang* from disk.

        Blank lines are skipped. (The previous implementation stopped
        reading at the first blank line, silently dropping every record
        after it.)

        Raises:
            Exception: if a non-blank line does not have exactly six fields.
            ValueError: if the quality field is not an integer.
            IOError/OSError: if the transcripts file cannot be opened.
        """
        self.lang = lang
        self.ts = {}

        with io.open(self._CSV_PATH % self.lang, 'r', encoding='utf8') as f:
            for raw_line in f:
                line = raw_line.rstrip()
                if not line:
                    # tolerate blank lines anywhere in the file
                    continue

                parts = line.split(';')
                if len(parts) != self._NUM_FIELDS:
                    raise Exception("***ERROR in transcripts: %s" % line)

                cfn, dirfn, audiofn, prompt, ts, quality = parts

                self.ts[cfn] = {'cfn': cfn,
                                'dirfn': dirfn,
                                'audiofn': audiofn,
                                'prompt': prompt,
                                'ts': ts,
                                'quality': int(quality),
                                # speaker id is the cfn prefix before the first '-'
                                'spk': cfn.split('-')[0]}

    def __len__(self):
        """Return the number of records."""
        return len(self.ts)

    def __getitem__(self, key):
        """Return the record dict stored under *key* (a cfn)."""
        return self.ts[key]

    def __iter__(self):
        """Iterate over the record keys (cfns) in sorted order."""
        return iter(sorted(self.ts))

    def __setitem__(self, key, v):
        """Store record dict *v* under *key* (a cfn)."""
        self.ts[key] = v

    def __contains__(self, key):
        """Return True if a record is stored under *key*."""
        return key in self.ts

    def save(self):
        """Write all records back to the transcripts file, sorted by key.

        The dict key is written as the first field; the ``cfn`` and ``spk``
        entries of the record dict are not written.

        All records are rendered before the file is opened for writing, so
        a record that cannot be formatted (missing key, non-numeric
        quality) raises without truncating the existing file.
        """
        records = []
        for cfn in sorted(self.ts):
            v = self.ts[cfn]
            records.append(u"%s;%s;%s;%s;%s;%d\n" % (cfn,
                                                     v['dirfn'],
                                                     v['audiofn'],
                                                     v['prompt'],
                                                     v['ts'],
                                                     v['quality']))

        with io.open(self._CSV_PATH % self.lang, 'w', encoding='utf8') as f:
            f.writelines(records)

    def split(self, p_test=5, limit=0, min_quality=2, add_all=False):
        """Select usable records and divide them into train and test sets.

        Args:
            p_test: integer percentage of the selected records that go to
                the test set.
            limit: if > 0, stop after examining this many records. Records
                that are filtered out still count towards the limit.
            min_quality: records with a lower quality are skipped, except
                that quality-0 records are kept when *add_all* is true.
            add_all: also accept quality-0 records and records without a
                transcript; for the latter the transcript is generated by
                tokenizing the prompt.

        Returns:
            A tuple ``(ts_all, ts_train, ts_test)`` of dicts keyed by cfn.
            The values are the same record dicts held in ``self.ts``.

        Note:
            With *add_all*, a generated transcript is stored into the record
            in place, so it is also visible through ``self.ts`` afterwards.
            Records are visited in the iteration order of ``self.ts``.
        """
        ts_all = {}
        ts_train = {}
        ts_test = {}

        for cnt, cfn in enumerate(self.ts, 1):

            if limit > 0 and cnt > limit:
                break

            v = self.ts[cfn]

            # below the quality threshold: only unreviewed (quality 0)
            # records survive, and only when add_all is requested
            if v['quality'] < min_quality:
                if v['quality'] != 0 or not add_all:
                    continue

            if not v['ts']:
                if not add_all:
                    print("WARNING: %s transcript missing" % cfn)
                    continue
                # NOTE(review): tokenize() is called without self.lang, so
                # it uses whatever its default language is - confirm this
                # is intended for non-default languages.
                v['ts'] = ' '.join(tokenize(v['prompt']))

            ts_all[cfn] = v

            # floor division keeps the integer threshold this code had under
            # Python 2, where '/' on ints truncates
            if len(ts_test) < (len(ts_all) * p_test // 100):
                ts_test[cfn] = v
            else:
                ts_train[cfn] = v

        return ts_all, ts_train, ts_test