forked from jmopr/job-hunter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.rb
145 lines (132 loc) · 3.84 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
require 'capybara/dsl'
require 'capybara-webkit'
require 'cgi'
require 'timeout'
require 'capybara'
require './matcher'
# Drives a headless browser (capybara-webkit) through Indeed search results,
# opens every non-sponsored "easily apply" listing, scores its requirements
# with +matching_algorithm+ and stores one Job record per listing.
#
# NOTE(review): +User+, +Job+ and +matching_algorithm+ are not defined in this
# file; they presumably come from the application models and './matcher'.
class JobScraper
  include Capybara::DSL

  # Text of the pagination marker that leads to the next results page.
  NEXT_LABEL = "Next »"
  # A logo is only looked up for jobs whose score is above this value.
  LOGO_SCORE_THRESHOLD = 40

  # @param url [String] start page to visit before searching
  # @param userID [Object] id handed to +User.find+; the found user owns the created jobs
  # @param pages [#to_i] maximum number of result pages to process
  def initialize(url, userID, pages)
    Capybara.default_driver = :webkit
    Capybara.javascript_driver = :webkit
    # Capybara.default_driver = :selenium
    # Capybara.javascript_driver = :selenium
    Capybara::Webkit.configure do |config|
      config.allow_url("www.indeed.com")
      config.block_unknown_urls
    end
    @job_links = []
    @url = url
    @user = User.find(userID)
    @counter = 1 # number of the results page currently being processed
    @pages = pages.to_i
  end

  # Runs a complete scrape: visits the start url, submits the search form and
  # processes the result pages.
  #
  # @param skillset [String] value typed into the 'q' search field
  # @param region [String] value typed into the 'l' search field
  def scrape(skillset, region)
    @skillset = skillset
    @region = region
    visit @url
    sleep(1)
    perform_search
    search_jobs
  end

  # Processes the current results page: creates a Job for every qualifying
  # listing, then moves on to the next page while fewer than +@pages+ pages
  # have been handled. Recurses through #next_page.
  def search_jobs
    close_modal
    filter_jobs
    gather_requirements do |job_reqs|
      # This block runs inside the listing's own window, so +page+ refers to
      # the job listing rather than to the results page.
      match = matching_algorithm(job_reqs).round(2)
      company = page.first(".company").text.downcase
      job = Job.create(
        title: page.first(".jobtitle").text,
        description: job_reqs.join(" "),
        company: company,
        location: page.first(".location").text,
        post_date: page.first(".date").text,
        url: page.current_url,
        score: match,
        applied: false,
        user_id: @user.id
      )
      job.update(logo: Job.autocomplete(company)) if match > LOGO_SCORE_THRESHOLD
      job.update(hex_id: Job.hex(job.id.to_s))
    end
    @counter += 1
    next_page if @counter <= @pages
  end

  # Clicks through to the next results page and processes it. Does nothing
  # when the current page offers no way forward.
  def next_page
    continue = next_page_marker
    return unless continue

    # The marker sits two levels below the clickable pagination element.
    continue.find(:xpath, '../..').click
    sleep(1)
    page.find('.popover-x-span').click if page.has_selector?('.popover-x-span')
    sleep(1)
    search_jobs
  end

  # Opens every link collected by #filter_jobs in its own window and extracts
  # the listing's requirements.
  #
  # With a block, each non-nil requirements array is yielded while the listing
  # window is the current window. Without a block, the requirements are scored
  # and collected in a hash keyed by the link's href.
  #
  # @yieldparam reqs [Array<String>] requirement texts of one listing
  # @return [Hash, nil] href => score when no block is given, otherwise nil
  def gather_requirements
    @job_requirements = {}
    # visit the job listing page for a given job
    @job_links.each do |link|
      # get a handle on the new window so capybara can update the page
      job_listing_window = window_opened_by { link.click }
      sleep(3)
      begin
        # work inside the job listing page that was opened in the new window
        within_window job_listing_window do
          reqs = extract_requirements
          if reqs
            if block_given?
              yield reqs
            else
              @job_requirements[link['href']] = matching_algorithm(reqs)
            end
          end
        end
      ensure
        # Close the listing window; otherwise one window per job stays open
        # for the remainder of the run.
        job_listing_window.close
      end
    end
    @job_links = []
    @job_requirements unless block_given?
  end

  # Reads the requirements from the current job listing page.
  #
  # @return [Array<String>, nil] the bullet items of the job summary when it
  #   has a list, otherwise the summary text split on '.'; nil when the page
  #   has no job summary
  def extract_requirements
    if page.has_selector?("#job-content #job_summary ul li")
      page.all("#job-content #job_summary ul li").map(&:text)
    elsif page.has_selector?("#job-content #job_summary")
      page.find("#job-content #job_summary").text.split('.')
    end
  end

  # Collects the title links of the qualifying results on the current page
  # and appends them to +@job_links+.
  #
  # @return [Array] the accumulated job title link elements
  def filter_jobs
    @jobs = all("#resultsCol .row.result")
    # filter out sponsored jobs && accept "easily apply" jobs
    @easy_jobs = @jobs.select do |job|
      job.all('.sdn').empty? && !job.all('.iaP > span.iaLabel').empty?
    end
    # get ONLY the job title link
    @easy_jobs.each do |job|
      job.all("a.turnstileLink[data-tn-element='jobTitle']").each do |title_link|
        @job_links << title_link
      end
    end
    @job_links
  end

  # Dismisses the popover modal when one is present.
  def close_modal
    return unless page.has_selector?('#prime-popover-close-button')

    page.find('#prime-popover-close-button').click
  end

  # Fills in the Indeed search form with the skillset and region and submits it.
  def perform_search
    # For indeed
    fill_in 'q', with: @skillset
    fill_in 'l', with: @region
    find('#fj').click
    sleep(1)
  end

  # Picks the pagination marker that leads forward: the first '.np' element
  # when it carries the next-page label, otherwise the second one when there
  # is more than one.
  #
  # @return [Capybara::Node::Element, nil] nil when there is no next page
  def next_page_marker
    # Probe first: +find+ raises Capybara::ElementNotFound when no marker
    # exists, which is the normal situation on the last results page.
    return unless page.has_selector?('.np')

    markers = page.all('.np').to_a
    return if markers.empty?
    return markers.first if markers.first.text == NEXT_LABEL

    markers[1]
  end
end
# Command-line entry point. Arguments, in order: user id, skillset, region, pages.
user_id, skillset, region, pages = ARGV
JobScraper.new('http://www.indeed.com/', user_id, pages).scrape(skillset, region)