## interface
''' Flow 1: back up an entire channel.
- The user passes a channel url. The program (P) goes to the "all uploads" playlist, downloads all
of its videos into the "videos" folder, and also saves each video's description, stats and
thumbnail url to files with the same name as the video.
Note: some videos exist in a playlist but not in the 'all uploads' playlist!
This happens if a video is unlisted or points to another channel's video.
Flow 2: download a single playlist - TODO
(See the usage sketch at the bottom of this file.)
'''
## Glossary
'''
AUP: all uploads playlist
mVideo: muted video, i.e. an adaptive (DASH) video stream that comes without sound
video: a stream that has both video and audio combined (progressive)
playlist: the playlist whose url we passed
'''
from pytube import YouTube
from Scrapper import ChannelScrapper
from Logger import Log
from Custom_Exceptions import VideoExistsError
from utils import (get_now_date, compare_dicts, get_days_between_dates,
                   over_write_json_file, read_json_file, Keys,
                   remove_dir_illegal_chars, generate_playlist_url, Quality)
import os
from sys import stdout, exit
from signal import signal, SIGINT
import shutil
import traceback
import re
import time
class Downloader():
def __init__(self, channel_url, max_update_lag = 3, browser_wait = 3, headless=False, root_path="youtube_backup", quality=Quality.HIGH):
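        '''
        channel_url: url of the channel to back up
        max_update_lag: scrape the channel again if the local json record is more than this many days old; zero scrapes every run
        browser_wait: default wait (in seconds) passed to the scrapper's browser
        headless: run the scrapping browser without a visible window
        root_path: directory under which all channel backups are stored
        quality: Quality.HIGH or Quality.LOW, selects the progressive stream resolution
        '''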
## things needed for first initialization in order
self.root_path = root_path
self.init_root_dir()
log_file_path = os.path.join(self.root_path, "logfile.txt")
self.logger = Log(log_file_path) # the logger must follow the init root dir directly
self.log = self.logger.log #make an alias to the Log.log() function so I don't have to type it all the time
self.scrapper = ChannelScrapper(channel_url, self.logger, headless=headless, default_wait=browser_wait)
self.channel_name = remove_dir_illegal_chars(fr"{self.scrapper.get_channel_name()}")
self.handle_null_channel_name(browser_wait, channel_url)
signal(SIGINT, self.graceful_exit)
#######
## init paths
self.channel_path = os.path.join(self.root_path, self.channel_name)
self.info_path = os.path.join(self.channel_path, "info")
self.playlists_path = os.path.join(self.info_path, "playlists")
self.current_video_output_path = ""
####
self.init_dirs()
## init urls
self.channel_url = channel_url
self.all_uploads_url = channel_url + "/videos" # example https://www.youtube.com/user/FireSymphoney/videos
# ?view=1 makes us go to created playlists most of the time
self.playlists_url = channel_url + "/playlists?view=1" # https://www.youtube.com/user/FireSymphoney/playlists
self.about_url = channel_url + "/about"
####
self.current_video_information = {}
self.failed_videos = {} # a title:url record of all failed downloads
self.download_in_progress = False
self.allow_download = True
self.max_update_lag = max_update_lag # scrape the channel if the current json record is more than x days old, put zero to scrape the channel once regardless of the freshness of the record
self.num_created_video_dirs = 0
self.all_playlists = {}
self.quality = quality
self.log(f'Bismillah! initialized a Download for channel {self.channel_name} at {self.channel_path}', print_log=True)
    def handle_null_channel_name(self, browser_wait:int, channel_url:str):
        if not self.channel_name or self.channel_name == "None":
            pause_time = browser_wait + 5
            time.sleep(pause_time)
            self.channel_name = remove_dir_illegal_chars(fr"{self.scrapper.get_channel_name()}") # try again
            if not self.channel_name or self.channel_name == "None":
                self.log(f"Fatal Error: channel name is null for {channel_url}", 'critical')
                raise RuntimeError(f"Critical error: not downloading {channel_url} because the channel name was null, please try again")
def init_root_dir(self):
if not os.path.exists(self.root_path):
os.mkdir(self.root_path)
def init_dirs(self):
if not os.path.exists(self.channel_path):
os.mkdir(self.channel_path)
self.log(f'Created channel dir {self.channel_path}')
if not os.path.exists(self.info_path):
os.mkdir(self.info_path)
self.log(f'Created info dir {self.info_path}')
if not os.path.exists(self.playlists_path):
os.mkdir(self.playlists_path)
self.log(f'Created playlists dir {self.playlists_path}')
def graceful_exit(self, sig, frame):
self.log('You pressed Ctrl+C!')
self.allow_download = False
self.scrapper.__del__()
if (self.download_in_progress and self.current_video_output_path):
msg = "There was a download in progress during exit signal, will delete that video to avoid having courrupted files"
msg2 = f"will delete the last downloaded video at {self.current_video_output_path}"
self.log(msg, "warn")
self.log(msg2, "warn")
self.delete_file(self.current_video_output_path)
self.logger.exit()
exit(0)
    def delete_file(self, path:str):
        '''Deletes the directory tree at path (each video lives in its own folder together with its info.json)'''
if path:
try:
shutil.rmtree(path)
msg = f"Delete complete {path}"
self.log(msg, "warn")
except OSError as e:
                #TODO make this use the handle_exception function
msg = "Error: %s - %s." % (e.filename, e.strerror)
self.log(msg, "warn")
    def prepare_download_dir(self, video_title:str) -> str:
        '''Creates the file tree needed for the download and returns the directory the video
        should be downloaded to.
        If the video already exists this raises VideoExistsError, since the file is already there
        (it would have been deleted if there was a partial download);
        otherwise returns the path of the video's directory'''
        video_title = remove_dir_illegal_chars(video_title)
        video_path = os.path.join(self.channel_path, "videos", video_title)
        if not os.path.exists(video_path):
            os.makedirs(video_path)
            self.log(f'Created video dir {video_path}')
            return video_path
        raise VideoExistsError
def write_downloaded_video_info_json(self)-> None:
'''Writes the video information to a json file at the self.current_video_output_path directory'''
try:
json_file_path = os.path.join(self.current_video_output_path, "info.json")
over_write_json_file(json_file_path, self.current_video_information)
self.log(f'Wrote the json file for video at {self.current_video_output_path}')
except Exception as e:
self.log(f"failed to write video info json object for {self.current_video_output_path}", level="error")
self.handle_exception(e)
    def write_playlist_info_json(self, playlist_name:str, playlist_info:dict)-> None:
        '''Writes the playlist information to a json file in the self.playlists_path directory'''
playlist_name = remove_dir_illegal_chars(playlist_name)
try:
json_file_path = os.path.join(self.playlists_path, f"{playlist_name}.json")
if os.path.isfile(json_file_path):
json_dict = read_json_file(json_file_path) # this is the old file
# keep a record of what videos the channel uploaded and what videos got removed
# this also records removed playlists and new playlists
last_update = json_dict.pop(Keys.DATEKEY) # the new dict will not have this, so remove it while we compare
added_stuff_since_last_backup, channel_removed_stuff = compare_dicts(json_dict, playlist_info )
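                # illustrative example (hypothetical data), assuming compare_dicts(old, new) returns (added, removed):
                #   old = {"vid1": {...}}; new = {"vid1": {...}, "vid2": {...}}
                #   compare_dicts(old, new) -> ({"vid2": {...}}, {})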
json_dict[Keys.DATEKEY] = last_update # return it back after comparing both dicts
json_dict = {**playlist_info, **json_dict} # merge the two dicts, if there's overlapping keys, then use the one from the old file ie json_dict
json_dict[Keys.DATEKEY][str(get_now_date())] = {"new_entries_since_last_backup": added_stuff_since_last_backup, "removed_entries_since_last_backup":channel_removed_stuff}
json_dict[Keys.PLAYLIST_AVAILABLE_VIDEOS_NUMBER] = max(playlist_info[Keys.PLAYLIST_AVAILABLE_VIDEOS_NUMBER],json_dict[Keys.PLAYLIST_AVAILABLE_VIDEOS_NUMBER] ) # since we are incrementing videos we want the total number of videos we have locally
over_write_json_file(json_file_path, json_dict)
else:
playlist_info[Keys.DATEKEY] = {get_now_date(): "initial install"}
over_write_json_file(json_file_path, playlist_info)
self.log(f'Wrote the json file for {playlist_name} at {self.playlists_path}')
except Exception as e:
self.log(f"failed to write playlist info json object for {playlist_name} at {self.playlists_path}", level="error")
self.handle_exception(e)
    def download_video(self, video_url:str, progressive=True) -> None:
        '''Progressive streams are limited to 720p max but come with video and audio combined.
        The other type, called DASH aka adaptive, offers higher resolutions but requires downloading
        the video and audio separately and mixing them after the download'''
error_msg = f"Failed to download {video_url} with progressive={progressive}"
        if not self.allow_download:
self.log(f"Download stopped, will not download {video_url}")
return
self.download_in_progress = True
try:
yt = YouTube(video_url)
self.current_video_information = {
"title" : yt.title,
"description" : yt.description ,
"rating" : yt.rating,
"views" : yt.views,
"publish_date" : yt.publish_date,
"length" : yt.length ,
"thumbnail_url" : yt.thumbnail_url,
"channel" : yt.author,
"video_id" : yt.video_id,
"date_of_download" : get_now_date(),
"video_url": video_url,
"keywords" : yt.keywords,
}
self.current_video_output_path = self.prepare_download_dir(fr"{yt.title}")
def download_callback(stream, file_path:str):
self.log(f'Downloaded {file_path}', print_log=True)
self.write_downloaded_video_info_json()
def progress_callback(stream, chunk, bytes_remaining):
filesize = stream.filesize
current = ((filesize - bytes_remaining)/filesize)
percent = ('{0:.1f}').format(current*100)
progress = int(50*current)
status = '█' * progress + '-' * (50 - progress)
stdout.write(' ↳ |{bar}| {video} | {percent}%\r'.format(bar=status,video=self.current_video_information["title"], percent=percent))
stdout.flush()
yt.register_on_complete_callback(download_callback)
yt.register_on_progress_callback(progress_callback)
            if progressive:
if self.quality == Quality.HIGH:
yt.streams.get_highest_resolution().download(output_path=self.current_video_output_path)
else:
yt.streams.get_lowest_resolution().download(output_path=self.current_video_output_path)
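            # note: when progressive is False nothing is downloaded yet; the adaptive (DASH)
            # path is still a stub, see the helpers and the sketch near the bottom of the class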
        except VideoExistsError:
msg = f'''Video \"{self.current_video_information['title']}\" already exists, not downloading url {self.current_video_information['video_url']}'''
self.log(msg, level="warning")
except Exception as e:
self.log(error_msg, level="error")
self.handle_exception(e)
finally:
self.download_in_progress = False
if self.did_download_fail(self.current_video_output_path):
msg = f'''Failed to fully download \"{self.current_video_information['title']}\" will try again later {self.current_video_information['video_url']}'''
self.log(msg, level="warning")
self.failed_videos[self.current_video_information['title']] = self.current_video_information["video_url"]
self.log("deleting partial downloads if they exist")
self.delete_file(self.current_video_output_path)
    def download_url_list(self, videos_url_list):
        '''Takes a list of video urls and loops over it, downloading each video.
        Returns True when it finishes the whole list, False if downloading was stopped'''
        for link in videos_url_list:
            self.download_video(link)
            if not self.allow_download:
                return False
        return True
def download_specific_playlist(self, playlist_url:str):
        '''A simplified version of download_all_videos_from_channel that scrapes and downloads
        a single playlist. Only use it when there's a single playlist, or a few,
        that you want to download; to back up the entire channel use download_all_videos_from_channel'''
self.log(f"Attempting to download playlist {playlist_url}")
# self.write_channel_info()
playlist_info = self.scrapper.get_playlist_info(playlist_url)
video_urls = []
for v in playlist_info[Keys.PLAYLIST_AVAILABLE_VIDEOS].values():
video_urls.append(v[Keys.URL])
self.write_playlist_info_json(playlist_info[Keys.PLAYLIST_NAME], playlist_info)
if self.download_url_list(video_urls):
self.__finish_download_and_show_stats()
        else:
            self.log("There was an error downloading the playlist", level="error")
def download_specific_video(self, video_url:str):
        '''A simplified function for backing up a single video; don't use it programmatically'''
self.log(f"Attempting to download video {video_url}")
# self.write_channel_info()
self.download_video(video_url)
def download_all_videos_from_channel(self)-> None:
self.log("Attempting to download All Uploads playlist")
all_videos_info = self.get_all_uploads_playlist_data()
num_vids = all_videos_info.pop(Keys.PLAYLIST_AVAILABLE_VIDEOS_NUMBER) # since we don't wanna iterate over the number of videos
all_videos_info.pop(Keys.DATEKEY) # since we don't wanna iterate over the date
all_urls_list = [v[Keys.URL] for v in all_videos_info.values()]
did_finish = self.download_url_list(all_urls_list)
if did_finish:
self.log(f"Finished going over all channel videos ---- {self.all_uploads_url}")
self.log("God bless you!")
else:
self.log(f"There was a problem and I couldn't download all the videos... total downloads should be {num_vids} for {self.all_uploads_url}")
self.__finish_download_and_show_stats()
def __finish_download_and_show_stats(self):
        corrupted_downloads = self.validate_downloaded_videos() # this list should be empty
        self.log(f"There is a total of {self.num_created_video_dirs} new video directories for {self.channel_name}")
        if len(corrupted_downloads) != 0:
            self.log(f"There are {len(corrupted_downloads)} corrupted downloads that should be deleted and re-downloaded, please run the downloader again. Here is the list: ", "error")
            self.log(str(corrupted_downloads))
            raise RuntimeError("Corrupted files downloaded")
num_failed_downloads = len(self.failed_videos.keys())
if num_failed_downloads > 0:
self.log(f"{num_failed_downloads} videos failed to download, will try again")
self.__handle_failed_downloads()
def __handle_failed_downloads(self):
self.log("Trying to retry failed downloads")
list_of_urls = list(self.failed_videos.values())
self.failed_videos = {} # reset it to empty
self.download_url_list(list_of_urls)
num_failed_downloads = len(self.failed_videos.keys())
        if num_failed_downloads > 0: # if we still have failed downloads we just write the file and move on
            self.log(f"{num_failed_downloads} videos failed to download even after retrying")
            json_file_path = os.path.join(self.info_path, "failed_video_downloads.json")
            try:
                self.log(f"Failed again to download some videos, will record the list of failed downloads in {json_file_path} and move on", level="warning")
                over_write_json_file(json_file_path, self.failed_videos)
                self.log(f'Wrote the json file for failed videos at {json_file_path}')
            except Exception as e:
                self.log(f"failed to write the failed-videos json object at {json_file_path}", level="error")
self.handle_exception(e)
    def write_all_channel_playlists_info(self)-> None:
        '''Writes a json entry for each playlist, including all the videos in that playlist. This does not cover the All Uploads playlist:
        /videos now has all the videos we need, so there is no need to extract the special all-uploads playlist here.
        '''
self.log("Getting all of the playlists information...", print_log=True)
if not self.all_playlists:
self.all_playlists = self.scrapper.get_all_channel_playlists_info(self.playlists_url)
# all_playlists_info.pop('num_playlists')
for playlist_id, playlist_metadata in self.all_playlists.items():
playlist_info = self.scrapper.get_playlist_info(generate_playlist_url(playlist_id))
self.write_playlist_info_json(playlist_metadata[Keys.PLAYLIST_NAME], playlist_info)
def get_all_uploads_playlist_data(self) -> dict :
        '''Checks if we already have the links we need in a json file; if we do, reads that json file and returns it as a dict.
        Otherwise scrapes the channel website, writes the json file, and finally returns the channel info dict'''
try:
if self.should_update_json_record():
# scrape the website in this case
self.log(f"Current json is too old, will scrape the channel and create a new json")
self.write_channel_info()
self.write_all_channel_playlists_info() # record all the playlists of the channel
output = self.scrapper.get_all_uploads_info_for_channel(self.all_uploads_url)
output[Keys.PLAYLIST_AVAILABLE_VIDEOS_NUMBER] = len(output.keys())
self.write_playlist_info_json(Keys.ALL_UPLOADS_PLAYLIST_NAME , output)
return output
else:
# we shouldn't scrape the channel website, instead read the record directly from json
self.log(f"Using current json record of the channel since it is still not too old")
json_file_path = os.path.join(self.playlists_path, f"{Keys.ALL_UPLOADS_PLAYLIST_NAME}.json")
json_dict = read_json_file(json_file_path)
return json_dict
except Exception as e:
self.handle_exception(e)
def write_channel_info(self) -> None:
channel_info = {}
channel_about = self.scrapper.get_channel_about(self.about_url)
if not self.all_playlists:
self.all_playlists = self.scrapper.get_all_channel_playlists_info(self.playlists_url)
channel_info[Keys.CHANNEL_ABOUT] = channel_about
channel_info[Keys.URL] = self.channel_url
        channel_info[Keys.DATEKEY] = {get_now_date(): "initial install - history is not supported for the channel's about section yet"}
channel_info[Keys.NUMBER_OF_PLAYLISTS] = len(self.all_playlists.keys())
# channel_info[Keys.CREATED_PLAYLISTS] = all_playlists
try:
json_file_path = os.path.join(self.info_path, "channel_info.json")
over_write_json_file(json_file_path, channel_info)
self.log(f'Wrote the json file for channel info at {json_file_path}')
except Exception as e:
self.log(f"failed to write channel info json object for {json_file_path}", level="error")
self.handle_exception(e)
    def should_update_json_record(self) -> bool:
        '''Checks whether the current json records are fresh enough; returns True if they are too old, meaning we should update our records by scraping'''
try:
playlists = os.listdir(self.playlists_path)
            if len(playlists) == 0:
return True
            # the all uploads playlist is downloaded last, so if it doesn't exist it means we didn't fetch all playlists and need to update
all_uploads_file_path = os.path.join(self.playlists_path, f"{Keys.ALL_UPLOADS_PLAYLIST_NAME}.json")
file_exists = os.path.isfile(all_uploads_file_path)
if not file_exists:
return True
for pl in playlists:
if (".json" in pl):
pl = os.path.join(self.playlists_path, pl)
json_dict = read_json_file(pl)
date = list(json_dict[Keys.DATEKEY].keys())[-1] #example: '03/11/2021 05:46:32'
date_now = get_now_date()
days_delta = get_days_between_dates(date, date_now)
if days_delta > self.max_update_lag:
return True
except Exception as e:
self.handle_exception(e)
            return True # if we run into errors reading the files then scrape from online
return False # no need to scrape the website
def handle_exception(self, e:Exception) -> None:
self.log(repr(e), level="error")
traceback.print_exc()
    def validate_downloaded_videos(self):
        '''Goes into each downloaded video folder and checks that we have the video and the info json file,
        and that the size of each file is more than 0 bytes. If the checks pass returns an empty list,
        otherwise returns a list of all the video folders that failed'''
videos_dir = os.path.join(self.channel_path, "videos")
failed_downloads = []
for dir_path, _, _ in os.walk(videos_dir):
            if dir_path == videos_dir:
                continue # skip this iteration since it yields /videos itself and we want /videos/<actual_video>
if self.did_download_fail(dir_path):
failed_downloads.append(dir_path)
self.num_created_video_dirs = self.num_created_video_dirs + 1
return failed_downloads
    def clean_bad_downloads(self):
        '''Goes into each downloaded video folder and checks that we have the video and the info json file,
        and that the size of each file is more than 0 bytes. If the checks fail, deletes that bad download folder.
        This method is only called manually, to clean up downloads made before the current checks
        (which delete a video as soon as it turns out invalid) were implemented. Regard it as a manual job
        to ensure the goodness of downloads made before the validators existed.
        '''
self.log('Cleaning bad downloads up...')
videos_dir = os.path.join(self.channel_path, "videos")
for dir_path, _, _ in os.walk(videos_dir):
            if dir_path == videos_dir:
                continue # skip this iteration since it yields /videos itself and we want /videos/<actual_video>
if self.did_download_fail(dir_path):
self.log("Clean up task, will delete {}".format(dir_path))
self.delete_file(dir_path)
    def did_download_fail(self, video_dir:str):
        '''Returns True if the video download failed and needs to be repeated, otherwise returns False'''
for dir_path, _, dir_files in os.walk(video_dir):
            if len(dir_files) < 2: # we expect at least two files: info.json and the video .mp4
return True
if not "info.json" in dir_files: # if we dont have the info json file
return True
is_there_mp4_file = False
for f in dir_files:
if re.search(".mp4$", f): # see if there's .mp4 at the end of one of the files
is_there_mp4_file = True
file_path = os.path.join(dir_path, f)
if os.path.getsize(file_path) == 0:
return True
if not is_there_mp4_file:
return True
return False
    def get_available_video_qualities(self):
        pass
    def format_video(self):
        '''give the user the ability to change the format to things like mp4, AVI, webm ...etc'''
        pass
    def format_audio(self):
        pass
    def get_highest_resolution_progressive(self):
        '''Progressive is when both the audio and video are combined in one file;
        max quality is 720p. Adaptive is when they are separate, and it comes in higher qualities (>720p)'''
        pass
    def get_highest_resolution_adaptive(self):
        pass
    def combine_adaptive_video_audio(self):
        pass
    def download_adaptive_video(self):
        pass
    def download_adaptive_audio(self):
        pass
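    # A minimal sketch of what the adaptive (DASH) path above could look like. This is an
    # illustration, not part of the implemented downloader: the stream filter calls are
    # standard pytube query API, but the ffmpeg muxing step assumes ffmpeg is on PATH, and
    # the file names ('video.mp4', 'audio.mp4', 'merged.mp4') are illustrative choices.
    def _download_adaptive_sketch(self, video_url:str, output_path:str):
        import subprocess
        yt = YouTube(video_url)
        # pick the highest-resolution video-only stream and the highest-bitrate audio-only stream, both mp4
        video_stream = yt.streams.filter(adaptive=True, only_video=True, file_extension='mp4').order_by('resolution').desc().first()
        audio_stream = yt.streams.filter(only_audio=True, file_extension='mp4').order_by('abr').desc().first()
        video_file = video_stream.download(output_path=output_path, filename='video.mp4')
        audio_file = audio_stream.download(output_path=output_path, filename='audio.mp4')
        # mux the two tracks into one file without re-encoding
        merged_file = os.path.join(output_path, 'merged.mp4')
        subprocess.run(['ffmpeg', '-y', '-i', video_file, '-i', audio_file, '-c', 'copy', merged_file], check=True)
        return merged_file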
    def get_videos_in_both_AUP_and_playlist(self, all_uploads_playlist_urls, playlists_urls):
        '''returns the intersection of the AUP urls and the playlist urls'''
        return set(all_uploads_playlist_urls) & set(playlists_urls)
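# A minimal usage sketch for Flow 1 (backing up an entire channel), assuming the Scrapper,
# Logger and utils modules are importable. The channel url is the example used in the
# comments above; replace it with the channel you want to back up.
if __name__ == "__main__":
    downloader = Downloader("https://www.youtube.com/user/FireSymphoney", quality=Quality.HIGH)
    downloader.download_all_videos_from_channel()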