diff --git a/examples/baker/configs/vits2_vocos_v1.json b/examples/baker/configs/vits2_vocos_v1.json
index e23ad84..7144690 100644
--- a/examples/baker/configs/vits2_vocos_v1.json
+++ b/examples/baker/configs/vits2_vocos_v1.json
@@ -23,7 +23,7 @@
         "filter_length": 1024,
         "hop_length": 256,
         "win_length": 1024,
-        "n_mel_channels": 80,
+        "n_mel_channels": 100,
         "mel_fmin": 0.0,
         "mel_fmax": null
     },
diff --git a/wetts/vits/data_utils.py b/wetts/vits/data_utils.py
index 2fad966..4886a82 100644
--- a/wetts/vits/data_utils.py
+++ b/wetts/vits/data_utils.py
@@ -19,7 +19,6 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
     def __init__(self, audiopaths_sid_text, hparams):
         self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
         self.max_wav_value = hparams.max_wav_value
-        self.sampling_rate = hparams.sampling_rate
         self.filter_length = hparams.filter_length
         self.hop_length = hparams.hop_length
         self.win_length = hparams.win_length
@@ -125,8 +124,6 @@ def get_audio(self, filename):
             self.sampling_rate,
             self.hop_length,
             self.win_length,
-            self.mel_fmin,
-            self.mel_fmax,
             center=False,
         )
         spec = torch.squeeze(spec, 0)
diff --git a/wetts/vits/export_onnx.py b/wetts/vits/export_onnx.py
index c078616..fd91f33 100644
--- a/wetts/vits/export_onnx.py
+++ b/wetts/vits/export_onnx.py
@@ -65,7 +65,7 @@ def main():
     if ("use_mel_posterior_encoder" in hps.model.keys()
             and hps.model.use_mel_posterior_encoder):
         print("Using mel posterior encoder for VITS2")
-        posterior_channels = 80  # vits2
+        posterior_channels = hps.data.n_mel_channels  # vits2
         hps.data.use_mel_posterior_encoder = True
     else:
         print("Using lin posterior encoder for VITS1")
diff --git a/wetts/vits/inference.py b/wetts/vits/inference.py
index 049b6ff..afcf308 100644
--- a/wetts/vits/inference.py
+++ b/wetts/vits/inference.py
@@ -68,7 +68,7 @@ def main():
     if ("use_mel_posterior_encoder" in hps.model.keys()
             and hps.model.use_mel_posterior_encoder):
         print("Using mel posterior encoder for VITS2")
-        posterior_channels = 80  # vits2
+        posterior_channels = hps.data.n_mel_channels  # vits2
     net_g = SynthesizerTrn(len(phone_dict),
                            posterior_channels,
                            hps.train.segment_size // hps.data.hop_length,
diff --git a/wetts/vits/train.py b/wetts/vits/train.py
index dd1d7a2..ec9c553 100644
--- a/wetts/vits/train.py
+++ b/wetts/vits/train.py
@@ -47,7 +47,7 @@ def main():
     if ("use_mel_posterior_encoder" in hps.model.keys()
             and hps.model.use_mel_posterior_encoder):
         print("Using mel posterior encoder for VITS2")
-        posterior_channels = 80  # vits2
+        posterior_channels = hps.data.n_mel_channels  # vits2
         hps.data.use_mel_posterior_encoder = True
     else:
         print("Using lin posterior encoder for VITS1")