-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
92 lines (71 loc) · 2.79 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import subprocess
import sys
import re
import whisper
from datetime import datetime
def sanitize_title(title, max_length=50):
    """Return a filesystem-friendly version of *title*.

    Only alphanumeric characters, spaces, underscores and hyphens are kept.
    The result is cut to at most *max_length* characters and never ends in
    whitespace.

    Args:
        title: The raw title string.
        max_length: Maximum number of characters to keep (default 50).

    Returns:
        The sanitized title; may be empty if no character of *title* is kept.
    """
    kept = "".join(ch for ch in title if ch.isalnum() or ch in " _-")
    # Truncate first and strip afterwards: stripping before the cut could
    # leave a trailing space when the cut lands right after a word boundary,
    # which would end up directly in front of a file extension.
    return kept[:max_length].rstrip()
def download_audio(video_url):
    """Download the audio track of *video_url* as an MP3 using ``yt-dlp``.

    The file is stored in the ``extracted_audio`` directory (created if
    missing) under the sanitized video title. If that file already exists
    the download is skipped.

    Args:
        video_url: URL of the video, passed to ``yt-dlp`` as a single argument.

    Returns:
        Path of the MP3 file inside ``extracted_audio``.

    Raises:
        subprocess.CalledProcessError: If either ``yt-dlp`` invocation exits
            with a non-zero status.
    """
    output_dir = "extracted_audio"
    os.makedirs(output_dir, exist_ok=True)

    # Argument lists (no shell) keep characters such as '&', ';' or quotes in
    # the URL from being interpreted by a shell. '--' ends option parsing so
    # a URL/ID starting with '-' is not mistaken for an option.
    video_title = subprocess.check_output(
        ["yt-dlp", "--get-title", "--", video_url]
    ).decode().strip()

    # Titles made only of stripped characters sanitize to ""; use a fixed
    # placeholder so the file still gets a usable name. Such videos share
    # one file name.
    sanitized_title = sanitize_title(video_title) or "untitled"
    audio_path = os.path.join(output_dir, f"{sanitized_title}.mp3")

    # Exact-path check: a prefix match would also hit unrelated files whose
    # names merely start with the same characters.
    if os.path.exists(audio_path):
        print(f"Skipping download, audio for '{video_title}' already exists.")
        return audio_path

    # Name the download after the sanitized title so the file yt-dlp writes
    # is the same one this function returns. The sanitized title cannot
    # contain '%', so only the "%(ext)s" field is expanded by yt-dlp.
    output_template = os.path.join(output_dir, f"{sanitized_title}.%(ext)s")
    subprocess.run(
        [
            "yt-dlp",
            "--no-part",
            "-x",
            "--audio-format",
            "mp3",
            "-o",
            output_template,
            "--",
            video_url,
        ],
        check=True,  # surface download failures instead of returning a bogus path
    )
    return audio_path
def remove_timestamps(text):
    """Strip every square-bracketed span (brackets included) from *text*.

    Matching is non-greedy, so each ``[...]`` group is removed separately.
    A bracket pair that spans a line break is left untouched.
    """
    bracketed = re.compile(r'\[.*?\]')
    return bracketed.sub('', text)
def remove_special_characters(text):
    """Drop every character that is not an ASCII letter, a digit, whitespace
    or one of ``. , ? !`` and return the remaining text.
    """
    disallowed = re.compile(r'[^A-Za-z0-9\s\.\,\?\!]')
    return disallowed.sub('', text)
def remove_duplicate_lines(text):
    """Return *text* with repeated lines removed.

    Lines are separated on ``"\\n"``. The first occurrence of each distinct
    line is kept and the original order is preserved.
    """
    # dict keys are unique and keep insertion order, which yields exactly
    # the first occurrence of every line in its original position.
    unique_lines = dict.fromkeys(text.split('\n'))
    return '\n'.join(unique_lines)
def transcribe_audio(audio_path):
    """Transcribe *audio_path* with Whisper and write the text to a file.

    Loads the ``"medium"`` Whisper model, transcribes the audio and writes
    one line per segment to ``transcripts/<audio base name>.txt`` in the form
    ``"<start label>: <cleaned text>"``. The segment text is passed through
    ``remove_timestamps``, ``remove_special_characters`` and
    ``remove_duplicate_lines`` before being written.

    Args:
        audio_path: Path to the audio file to transcribe.

    Returns:
        Path of the transcript file that was written.
    """
    # Local import: the module-level import only brings in `datetime`.
    from datetime import timezone

    output_folder = "transcripts"
    # exist_ok avoids the check-then-create race and matches download_audio.
    os.makedirs(output_folder, exist_ok=True)

    model = whisper.load_model("medium")
    filename = os.path.basename(audio_path)
    output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")

    result = model.transcribe(audio_path)

    with open(output_path, "w", encoding="utf-8") as f:
        for segment in result["segments"]:
            # Timezone-aware replacement for the deprecated
            # datetime.utcfromtimestamp(); the formatted string is identical.
            # NOTE(review): segment["start"] looks like an offset into the
            # audio, so the label renders as a 1970-01-01 date. Format kept
            # unchanged for existing consumers - confirm whether a plain
            # HH:MM:SS offset is wanted instead.
            start_time = datetime.fromtimestamp(
                segment["start"], tz=timezone.utc
            ).strftime("%Y-%m-%d %H:%M:%S")
            text = segment["text"].strip()
            cleaned_text = remove_duplicate_lines(
                remove_special_characters(remove_timestamps(text))
            )
            f.write(f"{start_time}: {cleaned_text}\n")
            # A segment whose text is empty after cleaning is followed by an
            # extra blank line.
            if not cleaned_text:
                f.write("\n")

    print(f"Transcribed: {filename}")
    return output_path
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the video URL.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python script.py <youtube_url>")
        sys.exit(1)

    (video_url,) = cli_args

    # Step 1: fetch the audio track.
    print("Downloading audio...")
    audio_path = download_audio(video_url)

    # Step 2: turn the audio into a text transcript.
    print("Transcribing audio...")
    transcript_path = transcribe_audio(audio_path)

    print(f"Transcript saved to {transcript_path}")