-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_tedxvid.py
executable file
·60 lines (48 loc) · 1.79 KB
/
parse_tedxvid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Class to parse TEDx videos from YouTube
# Copyright, amitch@rajgad.com, 2011
import sys;
from heapq import *;
class ParseTEDxVideos:
    """Rank TEDx conferences by the number of talks listed for each.

    The input is an iterable of text lines (by default the file opened in
    the constructor).  For every non-blank line, the conference name is the
    first whitespace-delimited token and the conference URI is the third
    comma-separated field.
    NOTE(review): presumably each line describes one video -- confirm
    against the actual listing file format.
    """

    # Path of the listing file; stays '' when no file was given.
    vidfile = ''
    # Open handle on the listing file; stays None when no file was given.
    file = None

    def __init__(self, vidfile=None):
        """Open *vidfile* for reading when a path is supplied.

        With no path the instance can still be used by handing an explicit
        ``conflist`` to :meth:`gettopconf`.
        """
        if vidfile is not None:
            self.vidfile = vidfile
            self.file = open(vidfile, "r")

    def close(self):
        """Close the listing file if one is open; safe to call repeatedly."""
        if self.file is not None:
            self.file.close()
            self.file = None

    def gettopconf(self, count=10, conflist=None):
        """Return the ``count`` conferences with the most talks.

        Args:
            count: Maximum number of conferences to return.  If fewer
                distinct conferences exist, all of them are returned.
            conflist: Iterable of lines to analyse.  Defaults to the file
                opened by the constructor.

        Returns:
            A list of ``'conference,talkcount,uri'`` strings ordered by
            descending talk count, ties broken by ascending conference
            name.  The URI is the one found on the first line seen for
            that conference.

        Raises:
            ValueError: If ``conflist`` is omitted and no file was opened.
            IndexError: If a non-blank line has fewer than three
                comma-separated fields.
        """
        if conflist is None:
            conflist = self.file
        if conflist is None:
            raise ValueError(
                'no conference list given and no video file was opened')

        # Store each conference in a dictionary as [talk count, uri],
        # bumping the count on every further line for the same conference.
        conferences = {}
        for line in conflist:
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to count.
                continue
            conference = tokens[0]
            if conference in conferences:
                conferences[conference][0] += 1
            else:
                conferences[conference] = [1, line.split(',')[2]]

        # Negate the count so the smallest tuples are the most popular
        # conferences; nsmallest simply returns everything when count
        # exceeds the number of conferences.
        ranked = nsmallest(
            count,
            [(-talks, name, uri)
             for name, (talks, uri) in conferences.items()])
        return ['%s,%d,%s' % (name, -negcount, uri)
                for negcount, name, uri in ranked]
# -----------------------------------------------------------------------------------------------
# The main entry point
def main():
    """Print the top conferences found in a TEDx video listing file.

    Command line: ``parse_tedxvid.py [vidfile [count]]``.  ``vidfile``
    defaults to ``tedxtalks-vids-1k.txt`` and ``count`` to 11.

    Raises:
        ValueError: If the ``count`` argument is not an integer.
    """
    vidfile = 'tedxtalks-vids-1k.txt'
    count = 11
    if len(sys.argv) > 1:
        vidfile = sys.argv[1]
    if len(sys.argv) > 2:
        # Convert up front so a bad argument fails before the file is opened.
        count = int(sys.argv[2])

    parsevid = ParseTEDxVideos(vidfile)
    try:
        for conf in parsevid.gettopconf(count):
            # Single-argument call form prints identically on Python 2 and 3.
            print(conf)
    finally:
        # The parser opened the file in its constructor; release it here.
        parsevid.file.close()


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()