newmastermaker.py
import csv
import urllib2
import html5lib  # imported so the 'html5lib' parser used by BeautifulSoup below is available
from bs4 import BeautifulSoup
from numpy import genfromtxt

# Load the existing master list once, as a plain Python list, so tropes already
# on file are skipped; reloading it inside tropescraper would throw away
# anything collected earlier in this run.
mastertropelist = list(genfromtxt('mastertropelist.csv', dtype='str', delimiter=','))
numoftropes = 0
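# Assumed inputs, inferred from the code below: mastertropelist.csv holds the
# previously collected trope names (one per row) and linklist.csv holds the
# page urls to scrape (one per line). Output goes to newmasterlist.csv.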
# Extract the trope name from a url and record it if it is new.
def tropescraper(url):
    global numoftropes
    try:
        # Resolve the link to its redirect target first (see get_redirected_url).
        link = str(get_redirected_url(url))
        if 'php/Main' in link:
            # Drop the leading "http://tvtropes.org/pmwiki/pmwiki.php/Main/"
            # (43 characters) and any query string, leaving just the trope name
            # (worked example after this function).
            trope = link[43:].split('?')[0]
            # Many pages link to the same trope, so only add each one once.
            if trope not in mastertropelist:
                mastertropelist.append(trope)
                print trope
                numoftropes += 1
    except ValueError:
        # Some <a class="twikilink"> items do not contain a usable url.
        pass
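# Worked example for the truncation above (the url is illustrative only):
#   link           = 'http://tvtropes.org/pmwiki/pmwiki.php/Main/ChekhovsGun?from=Main.Foo'
#   link[43:]      = 'ChekhovsGun?from=Main.Foo'
#   .split('?')[0] = 'ChekhovsGun'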
# A redirect-following opener is needed because links on the trope-wiki pages
# for other series or creators contain /Main/ in the href, but the page they
# redirect to lives under /Series/, /Creator/, etc. (example below).
def get_redirected_url(url):
    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
    request = opener.open(url)
    return request.url
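# Illustration (urls are assumptions, not taken from the script): a /Main/ link
# for a work page resolves to its /Series/ (or /Creator/, etc.) url, e.g.
#   get_redirected_url('http://tvtropes.org/pmwiki/pmwiki.php/Main/Firefly')
# could return 'http://tvtropes.org/pmwiki/pmwiki.php/Series/Firefly', which no
# longer contains 'php/Main' and is therefore skipped by tropescraper.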
# linklist.csv holds one page url per line.
with open('linklist.csv', 'r') as linklist:
    for line in linklist:
        linkstr = line.strip()
        if not linkstr:
            continue
        request = urllib2.Request(linkstr)
        url = urllib2.urlopen(request)
        soup = BeautifulSoup(url, 'html5lib')
        for item in soup.findAll('a', {'class': 'twikilink'}):
            tropescraper(item['href'])
# Write the updated master list back out; the with block closes the file itself.
with open('newmasterlist.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for trope in mastertropelist:
        writer.writerow([trope])

print numoftropes