Skip to content

Commit

Permalink
1st
Browse files Browse the repository at this point in the history
  • Loading branch information
laptop authored and laptop committed May 21, 2020
1 parent 914881d commit 8e9f97e
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 0 deletions.
34 changes: 34 additions & 0 deletions get_subs_from_one_channel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from youtube_transcript_api import YouTubeTranscriptApi
import urllib3
from collections import Counter
def get_videoids(channel_url, pool=None):
    """Return the unique YouTube video IDs linked from a channel page.

    The page at ``channel_url`` is fetched with a GET request, decoded as
    UTF-8 and scanned for ``href="/watch?v=<id>"`` links.

    Args:
        channel_url: URL of the channel page to download.
        pool: Optional object with a urllib3-style ``request(method, url)``
            method whose response exposes the body as ``.data`` bytes.
            When omitted, the module-level ``http`` pool manager is used.

    Returns:
        A list of video ID strings, de-duplicated, in the order in which
        they first appear on the page.
    """
    # Approach adapted from
    # https://stackoverflow.com/questions/15512239/python-get-all-youtube-video-urls-of-a-channel
    import re

    client = http if pool is None else pool
    response = client.request("GET", channel_url)
    html = response.data.decode("utf-8")
    # Capture only the ID characters, so trailing query parameters such as
    # "&amp;list=..." never end up inside the returned ID.
    found = re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)', html)
    # dict.fromkeys drops duplicates while keeping page order; a set would
    # return the IDs in an arbitrary order.
    return list(dict.fromkeys(found))

def get_subs(video_id):
    """Return the whole transcript of one video as a single string.

    Args:
        video_id: Video ID passed to ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``"text"`` value of every transcript entry, joined with single
        spaces so that the last word of one entry does not fuse with the
        first word of the next.

    Raises:
        Any exception raised by ``YouTubeTranscriptApi.get_transcript`` is
        propagated unchanged.
    """
    subs = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(entry["text"] for entry in subs)
# Headers sent with every request made through the shared pool manager.
# NOTE(review): the Googlebot user agent is presumably used so that YouTube
# serves a page with plain "/watch?v=" links -- confirm.
user_agent = {'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}
http = urllib3.PoolManager(num_pools=1, headers=user_agent)
channel_url = input("Введите ссылку на канал: ")
# Collect all video IDs from the channel page.
vids = get_videoids(channel_url)
print(vids)
print("Получили {} ссылок на видео".format(len(vids)))
transcripts = []
for i in vids:
    # One video whose transcript cannot be fetched must not abort the whole
    # run: report it and carry on with the remaining videos.
    try:
        transcripts.append(get_subs(i))
    except Exception as err:
        print("Не удалось получить субтитры для {}: {}".format(i, err))
# Join with a space so words from adjacent videos are not glued together.
text = " ".join(transcripts)
# Drops every non-ASCII character, so subtitles in non-Latin scripts are
# removed entirely. NOTE(review): presumably intentional -- confirm.
text = text.encode("ascii", "ignore").decode("ascii")
text = text.split()
# Write one "word count" line per distinct word, most frequent first.
with open("output.txt", "w", encoding="utf-8") as f:
    for i, j in Counter(text).most_common():
        print(i, j, file=f)
print("Сохранили в output.txt")
79 changes: 79 additions & 0 deletions output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
(' ', 54861)
('e', 30580)
('t', 27850)
('o', 22359)
('a', 20689)
('i', 18014)
('n', 17932)
('s', 15942)
('h', 13499)
('r', 13296)
('l', 11357)
('d', 9334)
('u', 8771)
('y', 7526)
('m', 7368)
('c', 6496)
('g', 6270)
('w', 5875)
('f', 5328)
('p', 4838)
('b', 3912)
('k', 3503)
('I', 2964)
('v', 2716)
("'", 2667)
(',', 1563)
('.', 1401)
('\n', 1302)
('j', 587)
('x', 489)
('-', 432)
('A', 307)
('T', 283)
('0', 264)
('S', 256)
('z', 217)
('q', 193)
('M', 189)
('N', 165)
('1', 140)
('W', 125)
('B', 108)
('Y', 105)
('(', 104)
(')', 104)
('3', 97)
('?', 88)
('G', 85)
('[', 83)
(']', 83)
('O', 81)
('C', 74)
('2', 72)
('D', 68)
('F', 60)
('L', 59)
('5', 57)
('J', 57)
('"', 53)
('H', 48)
('P', 48)
('R', 41)
('!', 35)
('E', 34)
('K', 32)
(':', 29)
('9', 28)
('V', 27)
('%', 23)
('4', 23)
('6', 18)
('8', 17)
('7', 15)
('U', 12)
('/', 8)
('$', 8)
(';', 5)
('Z', 4)
('X', 4)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
youtube_transcript_api
urllib3

0 comments on commit 8e9f97e

Please sign in to comment.