-
Notifications
You must be signed in to change notification settings - Fork 95
/
preprocess_audio.py
109 lines (93 loc) · 3.85 KB
/
preprocess_audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
VCTK
https://datashare.ed.ac.uk/handle/10283/3443
VCTK trim info
https://github.com/nii-yamagishilab/vctk-silence-labels
Warning! This code is not properly debugged.
Run it only once, on the original, untouched audio files (flac or wav).
If it is run repeatedly, the "trim" offsets are applied again to audio that was already trimmed, which can damage the files.
>>> $ pip install librosa==0.9.2 numpy==1.23.5 scipy==1.9.1 tqdm # [option]
>>> $ cd /path/to/your/vits2
>>> $ ln -s /path/to/the/VCTK/* DUMMY2/
>>> $ git clone https://github.com/nii-yamagishilab/vctk-silence-labels filelists/vctk-silence-labels
>>> $ python preprocess_audio.py --filelists <~/filelist.txt> --config <~/config.json> --trim <~/info.txt>
"""
import argparse
import os
import librosa
import numpy as np
from scipy.io import wavfile
from tqdm.auto import tqdm
import utils
# File-name suffixes: file lists reference "<id>.wav", while the raw corpus
# files this script falls back to are named "<id>_mic1.flac".
_FLAC_SUFFIX = "_mic1.flac"
_WAV_SUFFIX = ".wav"


def parse_trim_info(lines):
    """Parse silence-label lines into ``{utterance_id: (start, end)}``.

    Each non-blank line must hold at least three whitespace-separated
    fields: an utterance id followed by the start and end of the span to
    keep. Fields may be separated by any run of spaces or tabs; fields
    after the third are ignored.

    Args:
        lines: iterable of text lines (an open text file works).

    Returns:
        dict mapping the utterance id to a ``(start, end)`` tuple of floats.

    Raises:
        ValueError: if a non-blank line has fewer than three fields or a
            time field cannot be parsed as a float.
    """
    trim_info = {}
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 3:
            raise ValueError(f"malformed trim info line: {line!r}")
        trim_info[fields[0]] = (float(fields[1]), float(fields[2]))
    return trim_info


def resolve_audio_paths(listed_path):
    """Map a file-list path to ``(source, target)`` audio paths.

    If ``listed_path`` exists it is used as the source. Otherwise, when it
    ends in ``.wav``, the sibling ``*_mic1.flac`` file is tried. Only the
    trailing suffix is rewritten, so a ``.wav`` appearing elsewhere in the
    path is left alone.

    Returns:
        ``(source, target)`` where ``target`` is the ``.wav`` path to write
        (identical to ``source`` when the source is not a ``*_mic1.flac``
        file), or ``None`` when no source file exists.
    """
    if os.path.isfile(listed_path):
        src_filename = listed_path
    elif listed_path.endswith(_WAV_SUFFIX):
        candidate = listed_path[: -len(_WAV_SUFFIX)] + _FLAC_SUFFIX
        if not os.path.isfile(candidate):
            return None
        src_filename = candidate
    else:
        return None

    if src_filename.endswith(_FLAC_SUFFIX):
        tgt_filename = src_filename[: -len(_FLAC_SUFFIX)] + _WAV_SUFFIX
    else:
        tgt_filename = src_filename
    return src_filename, tgt_filename


def trim_key(src_filename):
    """Return the trim-info lookup key: the file stem without ``_mic1``."""
    stem = os.path.splitext(os.path.basename(src_filename))[0]
    return stem.replace("_mic1", "")


def to_int16_pcm(y, max_wav_value):
    """Scale float samples by ``max_wav_value`` and convert to int16.

    The scaled signal is clipped to the int16 range before the cast.
    Without clipping, a sample that scales to 32768 or beyond overflows
    the cast and ends up with the wrong sign.
    """
    limits = np.iinfo(np.int16)
    scaled = np.asarray(y) * max_wav_value
    return np.clip(scaled, limits.min, limits.max).astype(np.int16)


def main():
    """Trim every audio file named in the file lists and save it as 16-bit wav.

    For each file-list entry the audio is loaded at the configured sampling
    rate, restricted to the span given in the trim-info file, and written
    as int16 wav. When the source is already a ``.wav`` file it is
    overwritten in place, so running the script twice trims twice.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filelists",
        nargs="+",
        default=[
            "filelists/vctk_audio_sid_text_test_filelist.txt",
            "filelists/vctk_audio_sid_text_val_filelist.txt",
            "filelists/vctk_audio_sid_text_train_filelist.txt",
        ],
    )
    parser.add_argument("--config", default="configs/vctk_base2.json", type=str)
    parser.add_argument(
        "--trim",
        default="filelists/vctk-silence-labels/vctk-silences.0.92.txt",
        type=str,
    )
    args = parser.parse_args()

    with open(args.trim, "r", encoding="utf8") as f:
        trim_info = parse_trim_info(f)

    hps = utils.get_hparams_from_file(args.config)

    for filelist in args.filelists:
        print("START:", filelist)
        with open(filelist, "r", encoding="utf8") as f:
            entries = [line.rstrip("\n") for line in f if line.strip()]

        for entry in tqdm(entries, total=len(entries), desc=filelist):
            # The audio path is the first "|"-separated field of the entry.
            paths = resolve_audio_paths(entry.split("|")[0])
            if paths is None:
                # Neither the listed file nor its flac sibling exists.
                continue
            src_filename, tgt_filename = paths

            span = trim_info.get(trim_key(src_filename))
            if span is None:
                print(
                    f"file info: '{src_filename}' doesn't exist in trim info '{args.trim}'"
                )
                continue
            start, end = span
            if end <= start:
                # A non-positive duration is not a usable trim window.
                print(
                    f"file info: '{src_filename}' has an empty trim window "
                    f"({start}, {end}) in '{args.trim}'"
                )
                continue

            # WARNING: when src and tgt are the same wav file, the result
            # overwrites the input. Re-running the script applies the same
            # offsets to already-trimmed audio and can ruin the file.
            y, _ = librosa.core.load(
                src_filename,
                sr=hps.data.sampling_rate,
                mono=True,
                res_type="scipy",
                offset=start,
                duration=end - start,
            )
            # The label-based window above is the only trimming applied;
            # no additional energy-based trim is performed.

            # Skip clips shorter than the configured training segment size.
            if y.shape[-1] < hps.train.segment_size:
                continue

            wavfile.write(
                filename=tgt_filename,
                rate=hps.data.sampling_rate,
                data=to_int16_pcm(y, hps.data.max_wav_value),
            )


if __name__ == "__main__":
    main()