From 8e9f97e60717dbb19fc9e2e810a080e5897bf495 Mon Sep 17 00:00:00 2001
From: laptop
Date: Thu, 21 May 2020 14:30:41 +0500
Subject: [PATCH] 1st

---
 get_subs_from_one_channel.py | 75 ++++++++++++++++++++++++++++++++++
 output.txt                   | 79 ++++++++++++++++++++++++++++++++++++
 requirements.txt             |  2 +
 3 files changed, 156 insertions(+)
 create mode 100644 get_subs_from_one_channel.py
 create mode 100644 output.txt
 create mode 100644 requirements.txt

diff --git a/get_subs_from_one_channel.py b/get_subs_from_one_channel.py
new file mode 100644
index 0000000..dc03a34
--- /dev/null
+++ b/get_subs_from_one_channel.py
@@ -0,0 +1,75 @@
+"""Count word frequencies in the subtitles of a YouTube channel's videos.
+
+Fetches the page at a user-supplied URL, collects the video IDs found in
+its ``/watch?v=`` links, downloads the transcript of each video and writes
+``word count`` lines, most frequent word first, to ``output.txt``.
+"""
+import re
+import sys
+from collections import Counter
+
+import urllib3
+from youtube_transcript_api import YouTubeTranscriptApi
+
+# Captures the ID from href="/watch?v=<id>"; the character class stops the
+# match before a following "&..." parameter or the closing quote.
+_WATCH_LINK = re.compile(r'href="/watch\?v=([A-Za-z0-9_-]+)')
+
+# Headers sent with every request made through the shared pool below.
+user_agent = {
+    "user-agent": (
+        "Mozilla/5.0 (compatible; Googlebot/2.1; "
+        "+http://www.google.com/bot.html)"
+    )
+}
+http = urllib3.PoolManager(num_pools=1, headers=user_agent)
+
+
+def get_videoids(channel_url):
+    """Return the unique video IDs linked from the page at *channel_url*.
+
+    The IDs are returned in the order they first appear on the page.
+    Approach based on
+    https://stackoverflow.com/questions/15512239/python-get-all-youtube-video-urls-of-a-channel
+    """
+    response = http.request("GET", channel_url)
+    page = response.data.decode("utf-8", errors="replace")
+    # dict.fromkeys drops duplicates while keeping first-seen order.
+    return list(dict.fromkeys(_WATCH_LINK.findall(page)))
+
+
+def get_subs(video_id):
+    """Return the transcript of *video_id* as a single string."""
+    segments = YouTubeTranscriptApi.get_transcript(video_id)
+    # Join with a space: plain concatenation fuses the last word of one
+    # segment with the first word of the next whenever a segment does not
+    # end in whitespace, which corrupts the word counts.
+    return " ".join(segment["text"] for segment in segments)
+
+
+def main():
+    """Prompt for a channel URL and save its word counts to output.txt."""
+    channel_url = input("Введите ссылку на канал: ")
+    vids = get_videoids(channel_url)
+    print(vids)
+    print("Получили {} ссылок на видео".format(len(vids)))
+
+    transcripts = []
+    for video_id in vids:
+        try:
+            transcripts.append(get_subs(video_id))
+        except Exception as error:
+            # One video whose transcript cannot be fetched should not
+            # abort the run; report it and continue with the rest.
+            print("{}: {}".format(video_id, error), file=sys.stderr)
+
+    # Characters outside ASCII are dropped before counting.
+    text = " ".join(transcripts).encode("ascii", "ignore").decode("ascii")
+    with open("output.txt", "w", encoding="utf-8") as f:
+        for word, count in Counter(text.split()).most_common():
+            print(word, count, file=f)
+    print("Сохранили в output.txt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/output.txt b/output.txt
new file mode 100644
index 0000000..fd40cc6
--- /dev/null
+++ b/output.txt
@@ -0,0 +1,79 @@
+(' ', 54861)
+('e', 30580)
+('t', 27850)
+('o', 22359)
+('a', 20689)
+('i', 18014)
+('n', 17932)
+('s', 15942)
+('h', 13499)
+('r', 13296)
+('l', 11357)
+('d', 9334)
+('u', 8771)
+('y', 7526)
+('m', 7368)
+('c', 6496)
+('g', 6270)
+('w', 5875)
+('f', 5328)
+('p', 4838)
+('b', 3912)
+('k', 3503)
+('I', 2964)
+('v', 2716)
+("'", 2667)
+(',', 1563)
+('.', 1401)
+('\n', 1302)
+('j', 587)
+('x', 489)
+('-', 432)
+('A', 307)
+('T', 283)
+('0', 264)
+('S', 256)
+('z', 217)
+('q', 193)
+('M', 189)
+('N', 165)
+('1', 140)
+('W', 125)
+('B', 108)
+('Y', 105)
+('(', 104)
+(')', 104)
+('3', 97)
+('?', 88)
+('G', 85)
+('[', 83)
+(']', 83)
+('O', 81)
+('C', 74)
+('2', 72)
+('D', 68)
+('F', 60)
+('L', 59)
+('5', 57)
+('J', 57)
+('"', 53)
+('H', 48)
+('P', 48)
+('R', 41)
+('!', 35)
+('E', 34)
+('K', 32)
+(':', 29)
+('9', 28)
+('V', 27)
+('%', 23)
+('4', 23)
+('6', 18)
+('8', 17)
+('7', 15)
+('U', 12)
+('/', 8)
+('$', 8)
+(';', 5)
+('Z', 4)
+('X', 4)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9b9ac0a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+youtube_transcript_api
+urllib3
\ No newline at end of file