forked from coqui-ai/TTS
-
Notifications
You must be signed in to change notification settings - Fork 1
/
anonymizer.py
92 lines (77 loc) · 4.71 KB
/
anonymizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
from typing import Union

import torch
import torchaudio
from encodec.utils import convert_audio

from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
from TTS.tts.layers.bark.inference_funcs import semantic_tokens_from_audio, load_voice
from TTS.tts.models.bark import Bark


class Anonymizer(torch.nn.Module):
    """Re-synthesise speech with Bark from the semantic tokens of an input utterance.

    The pipeline is: HuBERT features -> semantic tokens (``HubertTokenizer``) ->
    waveform (``Bark.semantic_to_waveform``), optionally conditioned on a
    target-voice history prompt loaded with ``load_voice``.
    """

    def __init__(self, checkpoint_dir: str, voice_dirs: Union[list[str], None] = None) -> None:
        """Load Bark and the HuBERT-based audio tokenizer.

        Args:
            checkpoint_dir: Directory handed to ``Bark.load_checkpoint``. It is
                created if it does not exist yet.
            voice_dirs: Extra directories forwarded to ``load_voice`` when a
                non-``'random'`` target voice is requested. ``None`` or empty
                means only ``'random'`` may be used.
        """
        super().__init__()

        if not os.path.exists(checkpoint_dir):
            print(f"Checkpoint directory {checkpoint_dir} not found, creating it")
            # exist_ok guards against another process creating the directory
            # between the check above and this call.
            os.makedirs(checkpoint_dir, exist_ok=True)

        # 1. Initialize Bark.
        # NOTE: the default BarkConfig is used on purpose; do not customise it.
        config = BarkConfig()
        self.model = Bark.init_from_config(config)
        self.model.load_checkpoint(config, checkpoint_dir=checkpoint_dir, eval=True)

        # 2. Initialize the Bark-distilled audio tokenizer (HuBERT + quantizer).
        local_paths = self.model.config.LOCAL_MODEL_PATHS
        hubert_manager = HubertManager()
        hubert_manager.make_sure_tokenizer_installed(model_path=local_paths["hubert_tokenizer"])

        self.hubert_model = CustomHubert(checkpoint_path=local_paths["hubert"])
        self.tokenizer = HubertTokenizer.load_from_checkpoint(
            local_paths["hubert_tokenizer"], map_location=self.model.device
        )

        self.voice_dirs = voice_dirs
        self.sample_rate = self.model.config.sample_rate

    def forward(
        self,
        audio: Union[torch.Tensor, str],
        target_voice_id: str = 'random',
        coarse_temperature: float = 0.7
    ):
        """Anonymize one utterance.

        Batched inference is currently not supported.

        Args:
            audio: Either a path to an audio file (it is loaded, resampled and
                re-channelled with ``convert_audio``) or a tensor. Per the
                original author's note, a tensor must already be 1 channel,
                24 kHz and of shape ``(1, L)``.
            target_voice_id: Voice to load as history prompt. ``'random'``
                always means "use empty semantic and coarse prompts": in that
                case ``voice_dirs`` is ignored, it does NOT pick a random
                voice from those directories.
            coarse_temperature: Passed as ``temp`` to
                ``Bark.semantic_to_waveform``. Per the original author's note
                this is only the coarse temperature; the fine temperature is
                fixed internally to 0.5.

        Returns:
            The first element returned by ``Bark.semantic_to_waveform`` (the
            generated audio). The coarse and fine tokens are discarded.

        Raises:
            ValueError: If no voice directories were configured and
                ``target_voice_id`` is not ``'random'``.
        """
        # Validate up front, before the expensive feature extraction. An explicit
        # raise (rather than assert) keeps the check alive under ``python -O``.
        if not self.voice_dirs and target_voice_id != 'random':
            raise ValueError("If no voice dirs are given, the target voice must be 'random'.")

        if isinstance(audio, str):
            audio, sr = torchaudio.load(audio)
            audio = convert_audio(audio, sr, self.sample_rate, self.model.encodec.channels)
        # No unsqueeze here: downstream code squeezes it straight back.
        # Moving to the model device is a no-op for tensors already there, so it
        # is applied to file-loaded and caller-supplied audio alike.
        audio = audio.to(self.model.device)

        # 1. Extraction of semantic tokens.
        semantic_vectors = self.hubert_model.forward(audio, input_sample_hz=self.sample_rate)
        semantic_tokens = self.tokenizer.get_token(semantic_vectors)
        # The Bark API called below requires a numpy array, hence the CPU round trip.
        semantic_tokens = semantic_tokens.cpu().numpy()

        # 2. Load the voice as a history prompt:
        #    a tuple (semantic_prompt, coarse_prompt, fine_prompt).
        history_prompt = load_voice(self.model, target_voice_id, self.voice_dirs)

        # 3. Regression of acoustic tokens with the Bark API.
        #    The other two returned values are coarse and fine tokens; unused for now.
        audio_arr, _, _ = self.model.semantic_to_waveform(
            semantic_tokens, history_prompt=history_prompt, temp=coarse_temperature
        )
        return audio_arr
# Leftover manual smoke test for Anonymizer. Only the `checkpoint_dir` assignment
# below is live code; everything after it is commented out.
# NOTE(review): the path is absolute and points into one user's home directory,
# so it will not exist on other machines — presumably meant to be edited locally.
checkpoint_dir = '/homes/panariel/.local/share/tts/tts_models--multilingual--multi-dataset--bark'
# checkpoint_dir = 'pretrained_models_dumpster/tts_models--multilingual--multi-dataset--bark' # this also works
# anonymizer = Anonymizer(checkpoint_dir)
# anonymizer.to('cuda')
#
# print('Done initializing')
# print(f'\tBark is on {anonymizer.model.device}')
# print(f'\tCustomHubert is on {anonymizer.hubert_model.model.feature_projection.projection.weight.device}')
# print(
# f'\tHubertTokenizer is on {anonymizer.tokenizer.fc.weight.device} (and btw it\'s version {anonymizer.tokenizer.version})')