-
Notifications
You must be signed in to change notification settings - Fork 0
/
aitm_A3_youtube_totalvideos_yearly.py
71 lines (55 loc) · 1.88 KB
/
aitm_A3_youtube_totalvideos_yearly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding: utf-8 -*-
"""
Count retrievable YouTube transcripts per year for a fixed search term.

Created on Mon Apr 17 19:06:32 2023
@author: Adam
"""
import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import requests
import seaborn as sns
from youtube_transcript_api import YouTubeTranscriptApi
# YouTube Data API key. Set the YOUTUBE_API_KEY environment variable to supply
# your own key without editing this file; an unset or empty variable falls
# back to the literal below so existing runs keep working.
# NOTE(security): the fallback key is committed in source and should be
# treated as exposed -- rotate it and remove the literal once callers set the
# environment variable.
API_KEY = (
    os.environ.get('YOUTUBE_API_KEY')
    or 'AIzaSyAHjM4--WZZCqINCUgwbvSoM-K1iwPLpEA'
)
# Free-text query sent as the ``q`` parameter of every search request.
SEARCH_TERM = 'medicinal cannabis australia'
# Endpoint queried by get_video_ids_for_year.
BASE_URL = 'https://www.googleapis.com/youtube/v3/search'
def get_video_ids_for_year(year):
    """Return the IDs of videos matching SEARCH_TERM published in ``year``.

    Sends paged GET requests to BASE_URL, following ``nextPageToken`` until
    the response no longer carries one.

    Args:
        year: Calendar year to search. The window sent to the API runs from
            ``<year>-01-01T00:00:00Z`` to ``<year>-12-31T23:59:59Z``.

    Returns:
        A list of the ``videoId`` values from every result page, in the
        order they were returned.

    Raises:
        requests.HTTPError: If the API responds with an error status.
        requests.Timeout: If a request does not complete within 30 seconds.
    """
    start_date = datetime(year, 1, 1).strftime('%Y-%m-%dT00:00:00Z')
    end_date = datetime(year, 12, 31).strftime('%Y-%m-%dT23:59:59Z')
    params = {
        'key': API_KEY,
        'part': 'id',
        'type': 'video',
        'q': SEARCH_TERM,
        'publishedAfter': start_date,
        'publishedBefore': end_date,
        # The search endpoint accepts 0-50 results per page; larger values
        # are rejected, so request the maximum page size and paginate.
        'maxResults': 50,
    }
    video_ids = []
    while True:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(BASE_URL, params=params, timeout=30)
        # Fail loudly on API errors instead of hitting a KeyError on 'items'.
        response.raise_for_status()
        data = response.json()
        video_ids.extend(
            item['id']['videoId'] for item in data.get('items', [])
        )
        next_page_token = data.get('nextPageToken')
        if not next_page_token:
            break
        params['pageToken'] = next_page_token
    return video_ids
def get_transcripts_for_year(year):
    """Fetch transcripts for every video found for ``year``.

    Looks up the year's video IDs with get_video_ids_for_year and requests a
    transcript for each one. A video whose transcript request raises is
    reported on stdout and skipped, so one failure does not stop the rest.

    Args:
        year: Calendar year passed through to get_video_ids_for_year.

    Returns:
        A dict mapping each video ID to the transcript returned for it;
        videos whose retrieval failed are absent.
    """
    fetched_by_id = {}
    for video_id in get_video_ids_for_year(year):
        try:
            # Store directly: if the call raises, nothing is recorded.
            fetched_by_id[video_id] = YouTubeTranscriptApi.get_transcript(
                video_id
            )
        except Exception as e:
            print(f"Error retrieving transcript for video ID {video_id}: {e}")
    return fetched_by_id
def main():
    """Print how many transcripts were retrieved for each year.

    Covers every year from 2013 through the current calendar year
    (inclusive), calling get_transcripts_for_year once per year and printing
    the size of the result.
    """
    current_year = datetime.now().year
    years = range(2013, current_year + 1)
    for year in years:
        transcripts = get_transcripts_for_year(year)
        print(f'Total transcripts retrieved for {year}: {len(transcripts)}')
# Run the yearly tally only when executed as a script, not when imported.
if __name__ == '__main__':
    main()