-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.py
129 lines (93 loc) · 3.84 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import json
from pytube import YouTube
from pytube.innertube import _default_clients
from pydub import AudioSegment
import replicate
from dotenv import load_dotenv
import argparse
# Load environment variables from the .env file at import time.
# NOTE(review): presumably this is what supplies the Replicate API token used
# by replicate.run() in whisper_speech_to_text — confirm against deployment.
load_dotenv()
# Fix for the YouTube downloader: make pytube's "ANDROID_MUSIC" client entry
# use the same configuration as "ANDROID_CREATOR".
_default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"]
def _title_to_folder(title):
    """Turn a video title into a folder name.

    Keeps only alphanumeric and whitespace characters, replaces spaces with
    underscores and collapses every run of underscores into a single one.
    May return an empty string if the title has no usable characters.
    """
    name = ''.join(ch for ch in title if ch.isalnum() or ch.isspace())
    name = name.replace(" ", "_")
    # A single replace("__", "_") would leave "___" as "__"; repeat until
    # no doubled underscore remains.
    while "__" in name:
        name = name.replace("__", "_")
    return name


def download_video(video):
    """Download the audio track of a YouTube video and convert it to WAV.

    The audio-only stream is downloaded into ``temp/``, moved to
    ``<folder>/complete.mp4`` and additionally exported as
    ``<folder>/complete.wav``. ``<folder>`` is derived from the video title
    (see ``_title_to_folder``), falling back to the video id when the title
    contains no usable characters.

    Args:
        video: URL of the YouTube video to download.

    Returns:
        The name of the output folder, relative to the working directory.

    Raises:
        RuntimeError: If the video offers no audio-only stream.
    """
    yt = YouTube(video)
    title = yt.title
    print("Step 1/3. Downloading " + title + "...")

    # Extract only audio. first() returns None when no stream matches, so
    # fail with a clear message instead of an AttributeError on None.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise RuntimeError("No audio-only stream available for " + str(video))

    # Download the file
    out_file = stream.download(output_path='temp')

    # An empty name would make os.makedirs("") fail; use the video id then.
    folder = _title_to_folder(title) or yt.video_id

    # Create the output folder
    os.makedirs(folder, exist_ok=True)

    # Move the download to <folder>/complete.mp4. os.replace overwrites a
    # file left over from a previous run on every platform, whereas
    # os.rename raises FileExistsError on Windows.
    new_file = os.path.join(folder, 'complete.mp4')
    os.replace(out_file, new_file)

    # Convert to WAV for better compatibility
    audio = AudioSegment.from_file(new_file)
    converted_file = os.path.join(folder, "complete.wav")
    audio.export(converted_file, format="wav")

    print("Step 1/3 done. Successfully downloaded.")
    return folder
def whisper_speech_to_text(folder):
    """Transcribe ``<folder>/complete.mp4`` with speaker diarization.

    Sends the audio file to the ``thomasmol/whisper-diarization`` model via
    ``replicate.run`` and writes the returned output as indented JSON to
    ``<folder>/transcription.json``.

    Args:
        folder: Directory that contains ``complete.mp4``; the transcription
            file is written into the same directory.
    """
    print("Step 2/3. Running speech-to-text with speaker diarization using whisper...")
    file_path = os.path.join(folder, "complete.mp4")

    # Keep the upload handle inside a context manager so it is closed even
    # when the remote call raises.
    with open(file_path, "rb") as audio_file:
        output = replicate.run(
            "thomasmol/whisper-diarization:b9fd8313c0d492bf1ce501b3d188f945389327730773ec1deb6ef233df6ea119",
            input={"file": audio_file},
        )

    transcription_file_path = os.path.join(folder, "transcription.json")
    with open(transcription_file_path, "w") as f:
        json.dump(output, f, indent=4)

    print("Step 2/3 done. Successfully ran speech-to-text with speaker diarization.")
def split_audio_into_speaker_parts(folder):
    """Write one MP3 per speaker containing all of that speaker's segments.

    Reads ``<folder>/complete.wav`` and ``<folder>/transcription.json``,
    cuts the audio at each segment's ``start``/``end`` (given in seconds),
    concatenates the pieces per ``speaker`` in transcription order and
    exports each result to ``<folder>/<speaker>.mp3``.

    Args:
        folder: Directory holding ``complete.wav`` and
            ``transcription.json``; the per-speaker files are written there.

    Raises:
        KeyError: If the transcription lacks ``segments`` or a segment lacks
            ``speaker``, ``start`` or ``end``.
    """
    print("Step 3/3. Splitting audio into individual files for each speaker...")

    # Load the audio file. It is a WAV file, so use the WAV loader rather
    # than from_mp3.
    file_path = os.path.join(folder, "complete.wav")
    audio = AudioSegment.from_wav(file_path)

    # Load the JSON output
    transcription_file_path = os.path.join(folder, "transcription.json")
    with open(transcription_file_path) as f:
        output = json.load(f)

    # Accumulated audio per speaker label
    speaker_audio = {}
    for segment in output['segments']:
        speaker = segment['speaker']
        start = float(segment['start']) * 1000  # pydub works in milliseconds
        end = float(segment['end']) * 1000

        # Extract the segment and append it to this speaker's track
        segment_audio = audio[start:end]
        if speaker in speaker_audio:
            speaker_audio[speaker] += segment_audio
        else:
            speaker_audio[speaker] = segment_audio

    # Export each speaker's audio to a file
    for speaker, audio_segment in speaker_audio.items():
        speaker_file_path = os.path.join(folder, f"{speaker}.mp3")
        audio_segment.export(speaker_file_path, format="mp3")

    print("Step 3/3 done. Successfully split audio into individual files for each speaker. Number of speakers: " + str(len(speaker_audio)))
    print("Saved files to ./" + folder + "/")
def process_video(video_url):
    """Run the whole pipeline for one YouTube URL.

    Downloads the audio, transcribes it with speaker diarization, then
    splits the audio into one file per speaker. All artifacts end up in the
    folder returned by ``download_video``.

    Args:
        video_url: The YouTube URL to process.
    """
    # Step 1: fetch the audio; every later step works inside this folder.
    output_dir = download_video(video_url)
    # Step 2: produce transcription.json inside the folder.
    whisper_speech_to_text(output_dir)
    # Step 3: cut the audio into per-speaker files.
    split_audio_into_speaker_parts(output_dir)
if __name__ == "__main__":
    # Command-line entry point: expects a single positional YouTube URL and
    # hands it to the processing pipeline.
    cli = argparse.ArgumentParser(
        description="downloads audio from a youtube url and splits the audio into speaker parts.",
    )
    cli.add_argument("url", type=str, help="the youtube url to process")
    process_video(cli.parse_args().url)