import sys
import util
import math
import pickle
import urllib3
import requests
import operator
import pylab as plt
import networkx as nx
from bs4 import BeautifulSoup
from collections import deque
from stop_words import get_stop_words
from networkx import DiGraph, pagerank
from urllib.parse import urljoin, urlparse, urldefrag
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
STOP_WORDS = set(get_stop_words('en') +
                 get_stop_words('ca') + get_stop_words('es'))

DOMAIN = ''
BASE = ''
PLOT = False  # Switch to True if you want the crawler to plot the directed graph.

sys.setrecursionlimit(50000)  # Raise the recursion limit; adjust or remove it if your system does not need it.

#############################################################################
# Common part
#############################################################################
def authors():
    """Returns a string with the names of the authors of the work."""
    return "Oriol Domingo, Pol Baladas"

#############################################################################
# Crawler
#############################################################################
def store(db, filename):
    with open(filename, "wb") as f:
        print("store", filename)
        pickle.dump(db, f)
        print("done")

def sanitizeText(text):
    # Clean the raw text; if util.clean_words fails, fall back to the raw string.
    try:
        text = util.clean_words(text)
    except Exception:
        pass
    words = text.split(' ')
    # Drop empty strings and stop words.
    return [word for word in words if word and word not in STOP_WORDS]

def isVisible(element):  # Boolean function that will be used to filter visible text.
    return element.parent.name not in [
        'style', 'script', 'head', '[document]'
    ]

def filterVisibleText(text):
    visible = filter(isVisible, text)  # Pass the isVisible filter through the text.
    return u" ".join(t.strip() for t in visible)  # Join the results in a string.

def getText(soup):
    text = filterVisibleText(soup.findAll(text=True))
    text = sanitizeText(text)
    return set(text)  # Repeated instances are removed.

def scrapeSite(soup, url, db):
    text = getText(soup)
    for word in text:
        if word not in db['words']:
            db['words'][word] = set([url])
        else:
            db['words'][word].add(url)

def sanitizeUrl(url):
    return urljoin(BASE, url).strip('/')

def getSoup(url):  # Try/except block to avoid processing bad content.
    try:
        # As an alternative design option, one could check the content type of the
        # response and only parse it when it is 'text/html'.
        response = requests.get(url, verify=False, timeout=0.5)
        return BeautifulSoup(response.text, 'html.parser')
    except Exception:
        print("Error: bad content, skipping link.")
        return None  # Return None if the URL could not be processed; the crawler handles it.

def getDomain(url):
    return urlparse(url).netloc  # netloc --> network location, i.e. the domain.

def isFromDomain(url):
    domain = getDomain(url)
    # Accept (1) relative paths and (2) absolute URLs that belong to the crawled domain.
    return (url[0:4] != 'http') or (domain == DOMAIN)

def isValidUrl(url):  # Boolean function that will be used to filter valid URLs.
    return (  # If the conditions need to be changed, modify this function.
        "mailto:" not in url['href'] and
        '#' not in url['href'] and
        isFromDomain(url['href'])
    )

def getLinks(soup):
    results = []
    links = filter(isValidUrl, soup.find_all('a', href=True))  # Pass the isValidUrl filter through the 'a' tags.
    for link in list(links):
        url = sanitizeUrl(link['href'])  # Sanitize the URL format-wise.
        results.append(url)
    return results

def addSite(soup, url):
    return {
        'url': url,
        # Strangely enough, some websites have no title tag, or it is found to be empty.
        'title': soup.title.string if soup.title and soup.title.string else 'No title',
        'score': 0
    }

def BFS_crawler(url, expdist, db, G):
    links_queue = deque()
    links_queue.appendleft([expdist, url])  # FIFO queue policy.
    visit = set()
    visit.add(url)  # Mark the starting URL as visited so it is not enqueued again.
    while len(links_queue):
        dist, url = links_queue.pop()
        soup = getSoup(url)
        if soup:
            db['pages'][url] = addSite(soup, url)  # Add the site to the database.
            scrapeSite(soup, url, db)
            if dist > 0:
                links = getLinks(soup)
                for link in links:
                    G.add_edge(url, link)
                    if link not in visit:
                        visit.add(link)
                        print(link)
                        links_queue.appendleft([dist - 1, link])
        else:
            db['pages'][url] = {}

def plotGraph(G):
    nx.draw(G, with_labels=True)
    plt.show()

def pageRank(G, db):
    pr = pagerank(G)
    for element in pr.keys():
        score = math.ceil(pr[element] * 10000)  # Ceil the PageRank score for design purposes.
        db['pages'][element]['score'] = score

def crawler(url, maxdist):
    """
    Crawls the web starting from url,
    following links up to a distance of maxdist,
    and returns the database it builds.
    """
    global DOMAIN
    global BASE
    DOMAIN = getDomain(url)
    BASE = url
    db = {
        "pages": {},
        "words": {}
    }
    G = DiGraph([])
    print("Crawling", url)
    BFS_crawler(url, maxdist, db, G)
    print("Computing PageRank...")
    pageRank(G, db)
    if PLOT:
        print("Plotting BFS...")
        plotGraph(G)
    return db

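# The helper below is not part of the original module: it is a minimal,
# hypothetical sketch showing how crawler() and store() are meant to be
# composed. Its name, parameters and default filename are assumptions made
# purely for illustration.
def crawlAndStore(url, maxdist, filename="moogle.dat"):
    """Hypothetical helper: builds the database for url and pickles it to filename."""
    db = crawler(url, maxdist)  # Build the pages/words database by crawling from url.
    store(db, filename)         # Persist it so answer() can be run later without re-crawling.
    return db
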
#############################################################################
# Answer
#############################################################################
def load(filename):
    """Reads an object from file filename and returns it."""
    with open(filename, "rb") as f:
        print("load", filename)
        db = pickle.load(f)
        print("done")
        return db

def answer(db, query):
    """
    Returns a list of pages for the given query.
    Each page is a map with three fields:
    - title: its title
    - url: its url
    - score: its score
    The list is sorted by score in descending order.
    The query is a string of cleaned words.
    """
    words = db["words"]
    pages = db["pages"]
    queries = query.split(' ')
    results = []
    for word in queries:
        if word in words:
            results.append(words[word])
        else:
            results.append(set())  # An empty set makes the intersection behave as desired.
    if not len(results):
        return results
    result_set = results[0].intersection(*results)  # Intersection of the per-word result sets.
    web_results = []
    for url in result_set:
        web_results.append(pages[url])  # Fill the list to be returned.
    web_results.sort(key=operator.itemgetter('score'), reverse=True)
    return web_results

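# Minimal command-line sketch, not part of the original module: it exercises
# the load()/answer() flow described in the docstrings above. The database
# filename and the query string are illustrative assumptions; the file is
# expected to have been produced earlier with crawler() and store().
if __name__ == "__main__":
    db = load("moogle.dat")                 # Assumed database filename.
    for page in answer(db, "hello world"):  # Assumed query string.
        print(page["score"], page["title"], page["url"])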