# crawler.rb
require "mechanize"
require 'json'
require 'matrix'
require 'tf-idf-similarity'
# used to determine if a link is disallowed by robots.txt
def is_restricted?(link)
  # test each disallowed path fragment rather than a hard-coded string
  @dont_go.any? { |dg| link.to_s.include?(dg) }
end
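# For example, with @dont_go = ["dontgohere"] (a hypothetical entry), both of
# these hold:
#   is_restricted?("http://lyle.smu.edu/~fmoore/dontgohere/a.html") #=> true
#   is_restricted?("http://lyle.smu.edu/~fmoore/index.html")        #=> false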
# used to find the frequency of each word in the index
def frequencies(words)
  Hash[
    words.group_by(&:downcase).map { |word, instances|
      [word, instances.length]
    }.sort_by(&:last).reverse
  ]
end
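# A quick illustration of the return shape (hypothetical input, not part of
# the crawl):
#   frequencies(%w[The the cat]) #=> { "the" => 2, "cat" => 1 }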
url = "http://lyle.smu.edu/~fmoore/"
robots = "http://lyle.smu.edu/~fmoore/robots.txt"
## reveiving command line arguments
# number of pages read in argument
if ARGV[0]
num_pages = ARGV[0].to_i
else
num_pages = 100
end
# variables used for file IO
@pages_file = File.new("pages.txt", "w")
@pages_file.write("\n")
@frequency_index = File.new("freq.txt", "w")
@frequency_index.write("Word Frequency \n\n")
@fp = File.new("links.txt", "w")
@fp.write("Links \n\n")
@tokens = File.new("tokens.txt", "w")
@visited = []
stop_words = []
# Retrieves the list of stop words and stores it in an array
File.open("stop_words.txt", "r") do |f|
  f.each_line do |line|
    stop_words << line.strip
  end
end
@agent = Mechanize.new { |agent| agent.user_agent_alias = "Mac Safari" }
## fetch the landing page, then robots.txt
html = @agent.get(url).body
body = ''
@agent.get(robots) do |page|
  body = page.body
end
## Put robots.txt Disallow entries into an array of places not to crawl,
## used by the is_restricted? method
@dont_go = []
body.each_line do |line|
  next if /^#/.match(line) || /^User-agent/.match(line)
  token = line.split(": ")
  # skip blank lines; drop slashes and the trailing newline from the path
  @dont_go << token[1].tr('/', '').strip if token[1]
end
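# For instance, a robots.txt line "Disallow: /dontgohere/" (a hypothetical
# entry) would leave @dont_go == ["dontgohere"].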
## Scraping begins
html_doc = Nokogiri::HTML(html)
@pages_file.write(url + "\n")
link_list = []
# block that gets the first page of the site and collects all links on it
@agent.get("http://lyle.smu.edu/~fmoore/") do |page|
  page.links_with(:href => //).each do |link|
    if link.href.to_s.include?(':') # determine if the link is already a full URL
      link_list << link.href.to_s
    else # if not, prepend the base URL to avoid 404 errors
      link_list << "http://lyle.smu.edu/~fmoore/#{link.href}"
    end
  end
end
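# Note: a relative href like "index.htm" becomes
# "http://lyle.smu.edu/~fmoore/index.htm" here, while anything containing a
# colon (including mailto: links) is kept verbatim; the main loop below skips
# those because they do not contain the site prefix.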
all_text = ''
all_tokens = {}
id = ''       # used for the document id
num = 1       # used to determine how many pages to scrape
num_jpegs = 0 # keeps track of the number of jpegs on the site
# main driver of the scraper
link_list.each do |link|
  # if the maximum number of pages is reached, stop scraping
  break if num >= num_pages
  unless !link.include?('lyle.smu.edu/~fmoore/') || is_restricted?(link) || @visited.include?(link)
    # fetch the page; on a 404 or similar response, fall back to the error page
    begin
      page = @agent.get(link)
    rescue Mechanize::ResponseCodeError => e
      page = e.page
    end
    # increment id and num for each iteration
    id = link
    num += 1
    @visited << link                    # add link to visited list
    @pages_file.write(link.to_s + "\n") # write visited link to file
    html_doc = Nokogiri::HTML(page.body)
    text = html_doc.at('body').inner_text
    all_text << text # add this page's body text to the corpus for frequency counts
    # pretend that all words we care about contain only the letters a-z
    text.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
    tokens = text.scan(/[a-z]+/i) # only keep runs of the letters a-z
    new_tokens = []
    ## get rid of stop words
    tokens.each do |token|
      token.downcase!
      new_tokens << token unless stop_words.include?(token)
    end
    @tokens.write("#{link}\n")
    new_tokens.each do |token|
      @tokens.write("#{token} ")
    end
    @tokens.write("\n")
    # collects all unique occurrences of words and builds the doc_id structure
    # tokens.each do |token|
    #   token.downcase!
    #   if all_tokens.key?(token) && !stop_words.include?(token)
    #     if all_tokens[token].key?("#{id}")
    #       all_tokens[token]["#{id}"] += 1
    #     else
    #       all_tokens[token]["#{id}"] = 1
    #     end
    #   elsif !stop_words.include?(token)
    #     all_tokens[token] = {}
    #     all_tokens[token]["#{id}"] = 1
    #   end
    # end
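    # Should the block above be revived, a Hash default block gives a more
    # compact sketch of the same inverted index (not enabled here):
    #   inverted = Hash.new { |h, k| h[k] = Hash.new(0) }
    #   new_tokens.each { |token| inverted[token][id] += 1 }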
    # gathers all links on the page and adds them to the list
    links = html_doc.xpath("//a/@href")
    new_link = link.to_s
    new_link.sub!(link.split('/').last.to_s, "") # strip the filename to get the base path
    links.each do |link2|
      if link2.to_s.include?('http')
        link_list << link2.to_s # to_s: xpath returns attribute nodes, not strings
      elsif link2.to_s.include?('mailto:')
        # skip mailto links
      else
        link_list << "#{new_link}#{link2}"
      end
    end
  end
end
# creates the word frequency index file
all_text.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
words = all_text.scan(/[a-z]+/i)
frequencies(words).each do |word, count|
  @frequency_index.write("#{word} => #{count} \n") unless stop_words.include?(word)
end
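# freq.txt ends up with one "word => count" line per word, most frequent
# first, e.g. "moore => 42" (illustrative values only).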
# lists all links found during the crawl
link_list.each do |link|
  @fp.write(link.to_s + "\n")
end
# close the output files so buffered writes are flushed
[@pages_file, @frequency_index, @fp, @tokens].each(&:close)
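# matrix and tf-idf-similarity are required above but never used below; a
# minimal sketch of how the gem could compare the scraped pages, assuming a
# hypothetical `docs` array holding each page's body text (this script only
# keeps the concatenated all_text):
#   corpus = docs.map { |text| TfIdfSimilarity::Document.new(text) }
#   model  = TfIdfSimilarity::TfIdfModel.new(corpus, library: :matrix)
#   sim    = model.similarity_matrix
#   sim[model.document_index(corpus[0]), model.document_index(corpus[1])]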