forked from Mondego/spacetime-crawler4py
scraper.py
import re
from urllib.parse import urlparse, urldefrag, urljoin, urlunparse
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer

# Module-level state, shared across every page processed by this module.
longestPage = (float('-inf'), "")  # (word count, url) of the longest page seen so far
seenURLS = set()
word_count = {}
with open('stopwords.txt') as stopwords_file:
    stop_words = set(line.strip() for line in stopwords_file)
icsSubdomains = {}


def scraper(url, resp):
    links = extract_next_links(url, resp)
    return [link for link in links if is_valid(link)]


def extract_next_links(url, resp):
    # url: the URL that was used to get the page
    # resp.url: the actual url of the page
    # resp.status: the status code returned by the server. 200 is OK, you got the page.
    #              Other numbers mean that there was some kind of problem.
    # resp.error: when status is not 200, you can check the error here, if needed.
    # resp.raw_response: this is where the page actually is. More specifically:
    #     resp.raw_response.url: the url, again
    #     resp.raw_response.content: the content of the page!
    # Returns a list of hyperlinks (as strings) scraped from resp.raw_response.content.
    # (The smoke test at the end of this file sketches this resp interface.)
    global longestPage
    global word_count
    global stop_words
    if resp.status != 200 or resp.raw_response is None or resp.raw_response.content is None:
        return list()
    soup = BeautifulSoup(resp.raw_response.content, 'html5lib')
    # Strip script and style content from the raw response.
    for tag in soup(["script", "style"]):
        tag.extract()
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
    tokens = tokenizer.tokenize(soup.get_text().lower())
    # Skip low-information pages.
    if len(tokens) < 100:
        return list()
    pageWords = 0  # Word count for this page, excluding stop words
    for word in tokens:
        # Filter out stop words before counting.
        if word not in stop_words:
            pageWords += 1
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
    # Track the page with the most (non-stop) words and update the report.
    if pageWords > longestPage[0]:
        longestPage = (pageWords, url)
        with open('report.txt', 'r') as f:
            data = f.readlines()
        data[1] = f"The Longest Page is {longestPage[1]}\n"
        data[2] = f"The longest Page has {longestPage[0]} words\n"
        with open('report.txt', 'w') as f:
            f.writelines(data)
    # Collect absolute http(s) hyperlinks from the page.
    links = []
    for link in soup.find_all("a", attrs={'href': re.compile("^http://|^https://")}):
        links.append(link.get('href'))
    return links


def is_valid(url):
    # Decide whether to crawl this url or not.
    # If you decide to crawl it, return True; otherwise return False.
    url = urldefrag(url).url
    global seenURLS
    global icsSubdomains
    parsed = urlparse(url)
    try:
        # Only crawl the allowed UCI domains.
        validSubDomain = re.match(
            r".*(\.cs\.uci\.edu/|\.informatics\.uci\.edu/|\.stat\.uci\.edu/|\.ics\.uci\.edu/).*"
            r"|.*today\.uci\.edu/department/information_computer_sciences/.*", url)
        if not validSubDomain:
            return False
        if url in seenURLS:
            return False
        # Count *.ics.uci.edu subdomains, excluding www.ics.uci.edu itself.
        if re.match(r".*(\.ics\.uci\.edu)(?<!www\.ics\.uci\.edu)", parsed.netloc):
            subdomain = parsed.netloc
            if subdomain in icsSubdomains:
                icsSubdomains[subdomain] += 1
            else:
                icsSubdomains[subdomain] = 1
        seenURLS.add(url)
        # Write the number of unique URLs to the report.
        with open('report.txt', 'r') as file:
            data = file.readlines()
        data[0] = f'The number of Unique URLS is: {len(seenURLS)}\n'
        with open('report.txt', 'w') as file:
            file.writelines(data)
        print(len(seenURLS))
        # Reject unwanted file extensions, paths that embed such an extension
        # as a directory segment, and paginated /page/N traps.
        return not (re.match(
            r".*\.(css|js|bmp|gif|jpe?g|ico|py|java|c"
            + r"|png|tiff?|mid|mp2|mp3|mp4"
            + r"|wav|avi|mov|mpeg|ram|m4v|mkv|odc|ogg|ogv|pdf"
            + r"|ps|eps|tex|ppt|pptx|ppsx|doc|docx|xls|xlsx|names"
            + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
            + r"|epub|dll|cnf|tgz|sha1"
            + r"|thmx|mso|arff|rtf|jar|csv"
            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower()) or
            re.match(
            r".*/(css|js|bmp|gif|jpe?g|ico|py|java|c"
            + r"|png|tiff?|mid|mp2|mp3|mp4"
            + r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
            + r"|ps|eps|tex|ppt|pptx|ppsx|doc|docx|xls|xlsx|names"
            + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
            + r"|epub|dll|cnf|tgz|sha1"
            + r"|thmx|mso|arff|rtf|jar|csv"
            + r"|rm|smil|wmv|swf|wma|zip|rar|gz)/.*$", parsed.path.lower()) or
            re.match(r".*/page/\d*", parsed.path.lower()))
    except TypeError:
        print("TypeError for ", parsed)
        raise
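

# ---------------------------------------------------------------------------
# Hedged smoke test (not part of the original crawler code): a minimal sketch
# showing how scraper() and is_valid() might be exercised locally, assuming:
#   * stopwords.txt exists in the working directory (it is read at import time
#     above), and the html5lib parser is installed;
#   * the fake `resp` object below mirrors the interface documented in
#     extract_next_links (resp.status, resp.raw_response.content, ...);
#   * report.txt is seeded here with placeholder lines so the read-modify-write
#     code above has lines to update.
if __name__ == "__main__":
    from types import SimpleNamespace

    # Seed report.txt with three placeholder lines.
    with open("report.txt", "w") as f:
        f.writelines(["placeholder\n"] * 3)

    # is_valid(): an allowed-domain URL passes once, then is rejected as a
    # duplicate; a URL outside the allowed domains is rejected outright.
    print(is_valid("https://vision.ics.uci.edu/projects/"))   # expected: True
    print(is_valid("https://vision.ics.uci.edu/projects/"))   # expected: False (already seen)
    print(is_valid("https://www.example.com/index.html"))     # expected: False (outside allowed domains)

    # scraper(): fake a 200 response with enough text to pass the
    # 100-token low-information filter and one absolute hyperlink.
    html = ("<html><body><p>" + "lorem ipsum dolor sit amet " * 40 + "</p>"
            '<a href="https://www.ics.uci.edu/alumni/">alumni</a></body></html>')
    resp = SimpleNamespace(
        url="https://www.ics.uci.edu/",
        status=200,
        error=None,
        raw_response=SimpleNamespace(url="https://www.ics.uci.edu/",
                                     content=html.encode("utf-8")),
    )
    print(scraper("https://www.ics.uci.edu/", resp))
    # expected: ['https://www.ics.uci.edu/alumni/']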