-
Notifications
You must be signed in to change notification settings - Fork 3
/
yor_processor.py
197 lines (167 loc) · 7.04 KB
/
yor_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import math
import os
from random import randint, uniform
import re
import numpy as np
import wave
import contextlib
import re
# MAX_SENTENCE_LEN = 20
# SOURCE_PATH = 'yor_trans.txt'
# DEST_TEMP_PATH = 'yor_split.txt'
# DEST_TEMP_CLEAN_PATH = 'yor_clean_split.txt'
# NUM_FOLDER_SPLIT = 50
# FOLDER_PATH="split_text"
# PARTIAL_NAME= "yor_split"
def split_file(source_path, dest_path, max_sentence_len, end_of_file="## .\n"):
    """Lower-case a transcript and rewrite it as short, terminated sentences.

    Each line of ``source_path`` is lower-cased, stripped of newlines and
    split on single spaces. Lines with at most one word are dropped. Lines
    with up to ``max_sentence_len`` words are written followed by a space and
    ``end_of_file``. Longer lines are cut into consecutive chunks of
    ``max_sentence_len`` words, each written with its own ``end_of_file``.

    Args:
        source_path: Path of the text file to read.
        dest_path: Path of the text file to create or overwrite.
        max_sentence_len: Maximum number of space-separated words per
            written sentence.
        end_of_file: Marker appended to every written sentence. Nothing else
            terminates a sentence, so the marker must carry the newline.

    Returns:
        The string ``"done"``.
    """
    # Context managers guarantee both handles are closed even if a read or
    # write fails part-way through.
    with open(source_path, 'r') as src, open(dest_path, 'w') as dst:
        for raw_line in src:
            line = raw_line.lower().replace('\n', '')
            words = line.split(" ")
            total_words = len(words)
            if total_words <= 1:
                # Empty and single-word lines are not kept.
                continue
            if total_words <= max_sentence_len:
                dst.write(line + " " + end_of_file)
                continue
            # Sentence longer than max_sentence_len: emit it in chunks.
            num_split = math.ceil(total_words / max_sentence_len)
            for i in range(num_split):
                chunk = words[i * max_sentence_len:(i + 1) * max_sentence_len]
                chunk.append(end_of_file)
                dst.write(" ".join(chunk))
    return "done"
def _write_split_chunk(folder_path, partial_file_name, count, lines):
    """Write ``lines`` to ``<folder_path>/<partial_file_name>_<count>.txt``."""
    file_name = folder_path + "/" + partial_file_name + "_" + str(count) + ".txt"
    with open(file_name, 'w') as new_split_f:
        new_split_f.writelines(lines)


def split_file_into_folders(folder_path, source_path, partial_file_name, num_folder_split):
    """Split a text file into numbered chunk files inside ``folder_path``.

    A chunk is closed whenever the zero-based line index is a non-zero
    multiple of ``num_folder_split - 1``. The first chunk therefore holds
    ``num_folder_split`` lines and every later full chunk holds
    ``num_folder_split - 1`` lines. Lines left over after the last full chunk
    are written to one final, shorter file instead of being discarded.

    Args:
        folder_path: Existing directory that receives the chunk files.
        source_path: Path of the text file to split.
        partial_file_name: File name prefix; chunks are named
            ``<partial_file_name>_<n>.txt`` with ``n`` counting from 0.
        num_folder_split: Controls the chunk size as described above. Must be
            at least 2; a value of 1 raises ``ZeroDivisionError``.
    """
    count = 0
    pending = []
    # The context manager closes the source handle on every exit path.
    with open(source_path, 'r') as src:
        for idx, line in enumerate(src):
            pending.append(line)
            if idx % (num_folder_split - 1) == 0 and idx != 0:
                _write_split_chunk(folder_path, partial_file_name, count, pending)
                pending = []
                count += 1
    if pending:
        # Trailing lines that did not fill a chunk still belong in the output.
        _write_split_chunk(folder_path, partial_file_name, count, pending)
def calculate_recording_len(path="./recordings/", file_format=".wav"):
    """Sum the duration of all recordings found one level below ``path``.

    Every sub-directory of ``path`` is scanned for files whose name ends with
    ``file_format``. Each such file is opened with the ``wave`` module and
    its duration (frames / frame rate) is added to the total.

    Args:
        path: Parent directory holding one sub-directory per recording
            session. A trailing separator is optional.
        file_format: File name suffix that identifies a recording.

    Returns:
        A tuple ``(total, files, corrupted)``: the summed duration in seconds
        of the readable files, the number of files matching ``file_format``,
        and the number of those that could not be read.
    """
    total = 0
    corrupted = 0
    files = 0
    for directory in os.listdir(path):
        # os.path.join keeps this working whether or not ``path`` ends with a
        # separator; plain concatenation silently matched nothing without it.
        dir_path = os.path.join(path, directory)
        if not os.path.isdir(dir_path):
            continue
        for file in os.listdir(dir_path):
            if not file.endswith(file_format):
                continue
            files += 1
            fname = os.path.join(dir_path, file)
            try:
                with contextlib.closing(wave.open(fname, 'r')) as f:
                    frames = f.getnframes()
                    rate = f.getframerate()
                    total += frames / float(rate)
            except Exception:
                # Deliberately broad: any failure to read a file only counts
                # it as corrupted and must not abort the scan.
                corrupted += 1
    return total, files, corrupted
def extract_non_corrupted_files(path="./recordings/"):
    """Collect readable WAV files and the linker lines that still apply.

    ``path`` is the parent directory of the recordings. Each sub-directory is
    expected to contain WAV files plus a ``.txt`` linker file; the first
    ``.txt`` file in the directory listing is used as the linker. A WAV file
    counts as corrupted when opening it or computing its duration raises.
    Linker lines that contain the file name of a corrupted recording are
    dropped.

    Args:
        path: Parent directory of the recording sub-directories. A trailing
            separator is optional.

    Returns:
        A tuple ``(wav_files, all_linkers)``. ``wav_files`` lists the paths
        of all readable recordings. ``all_linkers`` maps each sub-directory
        name to its remaining linker lines, newlines included.

    Raises:
        IndexError: If a sub-directory contains no ``.txt`` file.
    """
    wav_files = []
    all_linkers = dict()
    for directory in os.listdir(path):
        dir_path = os.path.join(path, directory)
        if not os.path.isdir(dir_path):
            continue
        entries = os.listdir(dir_path)
        linker_file = [name for name in entries if name.endswith(".txt")][0]
        with open(dir_path + "/" + linker_file) as linker_handle:
            linker_data = linker_handle.readlines()

        to_remove = []
        for file in entries:
            if not file.endswith(".wav"):
                continue
            # "/" is kept as the final separator so that callers splitting
            # the returned paths on "/" still get the bare file name.
            fname = dir_path + "/" + file
            try:
                with contextlib.closing(wave.open(fname, 'r')) as f:
                    # The duration is computed only as a sanity check; a zero
                    # frame rate raises here and marks the file as corrupted.
                    f.getnframes() / float(f.getframerate())
            except Exception:
                # Deliberately broad: any unreadable file is just excluded.
                to_remove.append(file)
            else:
                wav_files.append(fname)

        # Building a new list avoids the skipped-element problem of popping
        # while indexing, so consecutive lines for one file are all removed.
        # Matching is by substring, as before.
        all_linkers[directory] = [
            line for line in linker_data
            if not any(name in line for name in to_remove)
        ]
    return wav_files, all_linkers
def split_train_val_test(wav_files, num_splits, val_split):
    """Shuffle recordings and partition them into train/valid/test/extra sets.

    The global NumPy random generator is seeded with 0, so the shuffle is
    the same on every call. When ``num_splits`` is greater than 2, the first
    ``len(wav_files) // num_splits`` shuffled items form the "extra" set. Of
    the remaining items the first half is the test set. The other half is the
    training set, except for its last ``int(half * val_split)`` items, which
    form the validation set.

    Args:
        wav_files: Sequence of "/"-separated recording paths; only the last
            path component is used.
        num_splits: Values above 2 reserve an "extra" set as described above.
        val_split: Size of the validation set as a fraction of half of the
            non-extra items.

    Returns:
        A tuple ``(to_copy_train, to_copy_valid, to_copy_test,
        to_copy_extra)``. The first three are strings of space-separated
        ``./data/records/<name>`` entries (just ``"./data/records/"`` when
        the set is empty). ``to_copy_extra`` is such a string when the extra
        set is non-empty and an empty list otherwise.
    """
    np.random.seed(0)
    all_indices = np.random.permutation(len(wav_files))

    # Calculate the length of the splits.
    index_start = len(all_indices) // num_splits if num_splits > 2 else 0
    extra_idx = all_indices[:index_start]
    indices = all_indices[index_start:]

    half = len(indices) // 2
    validation_portion = int(half * val_split)
    # Use an explicit boundary instead of negative slicing: with a portion of
    # 0, ``indices[a:-0]`` is empty and ``indices[-0:]`` is the whole array,
    # which would empty the training set and flood the validation set.
    boundary = max(len(indices) - validation_portion, 0)
    test_idx = indices[:half]
    train_idx = indices[half:boundary]
    valid_idx = indices[boundary:]

    def basenames(idx_array):
        return [wav_files[i].split("/")[-1] for i in idx_array]

    def as_copy_args(names):
        return "./data/records/" + " ./data/records/".join(names)

    extra_files = basenames(extra_idx)
    to_copy_extra = as_copy_args(extra_files) if len(extra_files) >= 1 else []
    to_copy_train = as_copy_args(basenames(train_idx))
    to_copy_valid = as_copy_args(basenames(valid_idx))
    to_copy_test = as_copy_args(basenames(test_idx))
    return to_copy_train, to_copy_valid, to_copy_test, to_copy_extra
def create_char_set(linkers, path="./split_text/", exclude=""):
    """Build the character vocabulary and the (text, recording) pairs.

    For every section in ``linkers`` the text file ``path + section_id`` is
    read. Each linker entry is parsed as ``"<file>: <line number> ..."``,
    where the line number is 1-based. The referenced line is cut at the first
    ``"##"``, stripped, and cleaned by removing every match of ``exclude``.

    Args:
        linkers: Mapping of section id to a list of linker entries.
        path: Prefix concatenated directly with the section id to form the
            text file path; adjust it to where the recorded text lives.
        exclude: Regular expression whose matches are removed from each line.
            The default empty pattern removes nothing.

    Returns:
        A tuple ``(chars, text_data)``. ``chars`` maps each character to an
        integer id, with ``"ε"`` fixed at 0 and ``" "`` at 1. Other ids are
        assigned in order of first appearance, so the mapping is the same on
        every run. ``text_data`` is a list of ``(line, name)`` tuples, where
        ``name`` is the linker file name up to its first ``"."``.
    """
    chars = {" ": 1, "ε": 0}
    text_data = []
    char_idx = 2
    # Compile once rather than handing the pattern to ``re.sub`` per line.
    exclude_pattern = re.compile(exclude)
    for section_id, linker in linkers.items():
        with open(path + section_id, "r") as text_handle:
            text_file = text_handle.readlines()
        for link in linker:
            fields = link.split(":")
            file = fields[0]
            # The linker stores 1-based line numbers after ": ".
            idx = int(fields[1].split(" ")[1]) - 1
            line = text_file[idx].split("##")[0].strip()
            line = exclude_pattern.sub("", line)
            text_data.append((line, file.split(".")[0]))
            # Walk the line in order, not via a set, whose iteration order
            # changes between interpreter runs and would reshuffle the ids.
            for c in line:
                if c not in chars:
                    chars[c] = char_idx
                    char_idx += 1
    return chars, text_data
def create_data_format(text_data, chars_dict):
    """Write the training text files into the ``data`` directory.

    Three files are created or overwritten, relative to the current working
    directory:

    * ``data/raw_text_file.txt``: one ``<wav>:<line>`` entry per pair.
    * ``data/chars.txt``: one ``<wav> <id> <id> ...`` entry per pair, holding
      the ``chars_dict`` id of every character of the line.
    * ``data/charset.txt``: the ``str()`` representation of ``chars_dict``.

    Entries are separated by newlines, with no trailing newline.

    Args:
        text_data: Iterable of ``(line, wav)`` string pairs. It is traversed
            twice, so it must be re-iterable (for example a list).
        chars_dict: Mapping from character to integer id.

    Raises:
        KeyError: If a line contains a character missing from ``chars_dict``.
    """
    # Create the output directory up front so a fresh checkout does not fail.
    os.makedirs("data", exist_ok=True)

    raw_text = "\n".join(wav + ":" + line for line, wav in text_data)
    with open("data/raw_text_file.txt", "w") as f:
        f.write(raw_text)

    indices_text = "\n".join(
        wav + " " + " ".join(str(chars_dict[c]) for c in line)
        for line, wav in text_data
    )
    with open("data/chars.txt", "w") as f:
        f.write(indices_text)

    with open("data/charset.txt", "w") as js:
        js.write(str(chars_dict))
    print("files created for training")