"""
Standalone helper script.
Parses NIPS proceedings for years 2003-2010, creates list of dictionaries
that store information about each publication, and saves the result as a
pickle in current directory called pubs_nips.
"""
import urllib
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from repool_util import savePubs
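
# NOTE: this is Python 2 code against BeautifulSoup 3 (the old `BeautifulSoup`
# module, not `bs4`); in BS3, Tag.attrs is a list of (name, value) tuples,
# which is why link targets are read out as item.attrs[0][1] below.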
pubs = []
warnings = []

for num in range(16, 24):
    year = 1987 + num
    url = "http://books.nips.cc/nips%d.html" % (num,)
    print "downloading proceedings from NIPS year %d..." % (year,)
    f = urllib.urlopen(url)
    s = f.read()
    f.close()
    print "done. Parsing..."
    soup = BeautifulSoup(s)
    soup = soup.find('table', {'width': '600'})  # find the main table HTML
    soup = soup.contents[0].contents[0]  # descend down <tr> and then <td>
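
    # The <td> contents are assumed (judging by what this parser handles) to
    # be one flat run of siblings per paper, roughly:
    #   <b>Title</b> Author One, Author Two <a href="...">[pdf]</a> ...
    # with no per-paper wrapper element, hence the linear walk below.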
    # iterate over this giant linear dump they have on the proceedings site
    venue = 'NIPS %d' % (year,)
    new_pub = {}
    old_count = len(pubs)
    for item in soup.contents:
        if isinstance(item, Tag):
            if item.name == 'b':
                # we stumbled on a new publication entry. If we were processing
                # one before this, commit that one first, then continue
                if new_pub:
                    if not new_pub.has_key('authors'):
                        warnings.append("oh oh no authors for publication... ")
                    if not new_pub.has_key('title'):
                        warnings.append("oh oh no title for publication... ")
                    new_pub['venue'] = venue
                    new_pub['year'] = year
                    pubs.append(new_pub)
                # start a new publication dictionary
                new_pub = {}
                new_title = str(item.contents[0])  # descend down the <b> tag
                new_title = new_title.replace('\n', '')
                new_pub['title'] = new_title
            if item.name == 'a':
                modifier = str(item.contents[0]).strip()
                if modifier == '[pdf]':
                    new_pub['pdf'] = str(item.attrs[0][1])
                elif modifier == '[bibtex]':
                    new_pub['bibtex'] = str(item.attrs[0][1])
                elif modifier == '[correction]':
                    new_pub['correction'] = str(item.attrs[0][1])
                elif modifier == '[supplemental]':
                    new_pub['supplemental'] = str(item.attrs[0][1])
                elif modifier == '[slide]':
                    new_pub['slide'] = str(item.attrs[0][1])
                elif modifier == '[audio]':
                    new_pub['audio'] = str(item.attrs[0][1])
                elif modifier == '[ps.gz]':
                    pass  # ignore
                elif modifier == '[djvu]':
                    pass  # ignore
                else:
                    warnings.append("warning: modifier %s skipped" % (modifier,))
        if isinstance(item, NavigableString):
            if len(str(item)) > 3:
                # this is probably the line describing authors
                author_str = str(item)
                author_str = author_str.replace('\n', '')  # remove newlines
                author_list = author_str.split(',')
                if new_pub.has_key('authors'):
                    warnings.append("we're in trouble... %s, but already have "
                                    "%s" % (str(item), new_pub['authors']))
                new_pub['authors'] = [x.strip() for x in author_list]
    # the last paper on the page is never followed by another <b> tag, so the
    # in-loop commit above misses it; commit it here as well.
    # I hate myself a little for this
    # TODO LATER_MAYBE: CODE CHUNK DUPLICATION
    if new_pub:  # guard mirrors the in-loop commit, in case the page was empty
        if not new_pub.has_key('authors'):
            warnings.append("oh oh no authors for publication... ")
        if not new_pub.has_key('title'):
            warnings.append("oh oh no title for publication... ")
        new_pub['venue'] = venue
        new_pub['year'] = year
        pubs.append(new_pub)
    print "read in %d publications for year %d." % (len(pubs) - old_count, year)

# show warnings, if any were generated
if len(warnings) > 0:
    print "%d warnings:" % (len(warnings),)
    for x in warnings:
        print x
else:
    print "No warnings generated."

# finally, save pickle as output
print "read in a total of %d publications." % (len(pubs),)
fname = "pubs_nips"
print "saving pickle in %s" % (fname,)
savePubs(fname, pubs)
print "all done."