-
Notifications
You must be signed in to change notification settings - Fork 2
/
tlm.py
143 lines (124 loc) · 5.66 KB
/
tlm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from bs4 import BeautifulSoup
import csv
import re
import sys
import os
import urllib2
import html5lib
from numpy import genfromtxt
import time
import operator
from google import search
# Accumulates the trope names scraped for the current work (reset per work in main()).
tropelist = []
# Count of unique tropes found for the current work (reset per work in main()).
numoftropes = 0
# Wall-clock start time for the elapsed-time report printed at the end of the script.
start = time.time()
def imdbscraper(subject):
newsubject = subject + ' tv site:imdb.com' #DONT FORGET TO CHANGE THIS TV
for url in search(newsubject, stop=1):
startingurl = url
break
request = urllib2.Request(get_redirected_url((startingurl)))
url = urllib2.urlopen(request)
soup = BeautifulSoup(url, 'html5lib')
try:
rating = float(soup.find('div', {'class':'titlePageSprite star-box-giga-star'}).contents[0])
print 'imdb rating for '+ subject + ' is: ', rating
return rating
except AttributeError:
print 'No imdb rating found. Guessing: 5.0'
return 5.0
#compare the tropelist to the master tropelist and return a 1 dimensional binary array
#compare the tropelist to the master tropelist and return a 1 dimensional binary array
def binarizer(tropelist, masterfile='mastertropelist.csv'):
    """Convert a list of trope names into a binary feature vector.

    tropelist  -- trope names found for one work
    masterfile -- CSV holding the master trope list, one trope per row
                  (default 'mastertropelist.csv', as before)

    Returns a list of 0/1 ints, one per master trope, in master-list
    order: 1 where the master trope appears in *tropelist*, else 0.
    """
    mastertropelist = genfromtxt(masterfile, dtype='str', delimiter=',')
    # Set membership makes this O(master + found) instead of the original
    # O(master * found) list scan per master trope.
    found = set(tropelist)
    return [1 if item in found else 0 for item in mastertropelist]
#need the redirect handler to account for links on the tropeswiki page concerning other series or creators. the link on the page contains /Main/ but the link it redirects to has /Series/ /Creator/ etc
def get_redirected_url(url):
    """Follow any HTTP redirects for *url* and return the final URL.

    Needed because trope links on the wiki page use /Main/ paths that
    redirect to /Series/, /Creator/, etc.
    """
    redirect_opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
    response = redirect_opener.open(url)
    return response.url
#extracting the tropeName string from the url
#extracting the tropeName string from the url
def tropescraper(url):
    """Resolve *url*, extract the trope name from it, and record it.

    Appends the trope to the module-global ``tropelist`` (skipping
    duplicates) and increments the module-global ``numoftropes``.
    Returns nothing.
    """
    global tropelist
    global numoftropes
    try: #catch value errors that occur for <soup.findAll('a', {'class':'twikilink'})> items that don't contain a url
        #convert link to the redirect url
        link = str(get_redirected_url(url))
        if 'php/Main' in link:
            #truncate everything that is not the trope
            # NOTE(review): 43 is presumably the length of the fixed
            # "http://tvtropes.org/..." URL prefix up to the trope name --
            # confirm against the site's current URL scheme.
            trope = (link[43:].split('?'))[0]
            #add it to the tropelist if trope not in tropelist cause many pages contain duplicates
            if trope not in tropelist:
                tropelist += [trope]
                #print trope
                numoftropes+=1
    except ValueError: pass
def main(startingurl):
#access page and initialize alternatetropeliststyle and declare global variables
request = urllib2.Request(startingurl)
url = urllib2.urlopen(request)
soup = BeautifulSoup(url, 'html5lib')
#need this alternate tropeliststyle to account for pages that don't use the newer clickable "folder" system. this uses new webpages instead
alternatetropeliststyle = []
global tropelist
global numoftropes
subject = re.sub(':','',((soup.find('div', {'class':'pagetitle'}).contents)[1].contents[0]).encode('UTF-8'))
#scrape imdbrating and convert to hundredized form
rating = int(imdbscraper(subject)*10)
#find everything with this name and tag. includes all of the links in the folders at the bottom
for item in soup.findAll('a', {'class':'twikilink'}):
#check if one of the links found contains TropesAToD, etc, an indicator the alternate tropelist style was implemented
if not re.search("Tropes.To.",item['href']):
tropescraper(item['href'])
else:
alternatetropeliststyle += [item['href']]
#if alternate tropeslist was used open each "TropesAToD" etc link and run tropescraper on it after making it into soup
if alternatetropeliststyle:
for listedurl in alternatetropeliststyle:
request = urllib2.Request(listedurl)
url = urllib2.urlopen(request)
soup = BeautifulSoup(url, 'html5lib')
for item in soup.findAll('a', {'class':'twikilink'}):
tropescraper(item['href'])
print "Total number of tropes found: ", numoftropes
#dynamically name and create tropelists for works
filename = subject + ' tropelist.csv'
path_to_script_dir = os.path.dirname(os.path.abspath("tlm.py")) #create a new file no matter what
#don't forget to manually change the folder name before starting a new media format!
newpath = path_to_script_dir + r'\\' + 'WesternAnimation' + r'\\'
if not os.path.exists(newpath): os.makedirs(newpath)
with open(newpath + filename, 'wb') as f:
writer = csv.writer(f)
for trope in tropelist:
writer.writerow([trope])
f.close()
#convert tropelist to array of 1's and 0's and put that array in an array that also contains the work title, rating (y-data), and total num of tropes found
#put all this in a master array list to analyze with the classifier
tropearray = binarizer(tropelist)
with open('masterarraylist.csv', 'ab') as f:
writer = csv.writer(f)
writer.writerow([subject,tropearray,rating,numoftropes])
numoftropes=0
tropelist=[]
f.close()
# Earlier interactive / single-work experiments, kept for reference:
#media = raw_input('"Anime", ComicStrip","Webcomic","ComicBook","Film","VideoGame","Series","Literature","WesternAnimation"')
#subject = raw_input("Enter work you want to analyze (all one word and case matters!): ")
#imdbrating = raw_input("Please provide a rating from 1-10 for this work: ")
# media = 'WesternAnimation'
# subject = 'AdventureTime'
# rating = 90 #use 1-100 scale or imdb, make dictionary for genres
# webcrawler("http://tvtropes.org/pmwiki/pmwiki.php/" + media +"/" + subject, imdbrating)
# #automatically go through every work on tvtropes for a given medium, in this case WesternAnimation
# Driver: each row of WesternAnimation.csv is expected to hold a tvtropes
# work-page URL in column 0; scrape every listed work via main().
with open('WesternAnimation.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        print row
        main(row[0])
f.close()  # NOTE(review): redundant -- the with-statement already closed f.
end = time.time()
print "Time elapsed:", end-start