crawler.go
package collyresponsible

import (
	"context"
	"fmt"
	"net/http"
	"net/url"
	"strings"

	"github.com/gocolly/colly/v2"
	"github.com/temoto/robotstxt"
)
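// TestRobotsGroup reports whether userAgent may fetch url according to the
// parsed robots.txt data. It checks the full User-Agent string first, then
// each product token contained in it (with contact URLs and version numbers
// stripped), so a compound agent like
// "Mozilla/5.0 (compatible; MyBot/1.0; +https://example.com/bot)" must be
// permitted under every name it advertises.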
func TestRobotsGroup(robots *robotstxt.RobotsData, url, userAgent string) bool {
	status := robots.TestAgent(url, userAgent)
	if !status {
		return false
	}
	// Test every product token named in the User-Agent string
	parts := strings.Split(strings.TrimRight(userAgent, ")"), ";")
	for _, part := range parts {
		part = strings.TrimSpace(part)
		// Strip a "+https://" or "+http://" contact-URL prefix
		newPart := strings.TrimPrefix(part, "+")
		newPart = strings.TrimPrefix(newPart, "https://")
		newPart = strings.TrimPrefix(newPart, "http://")
		// Strip a "/1.0"-style version suffix
		newPart = strings.TrimSpace(strings.Split(newPart, "/")[0])
		if !robots.TestAgent(url, newPart) {
			return false
		}
	}
	return status
}
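// Crawl fetches profile.Website and follows same-host links from it,
// honoring robots.txt, rel="nofollow", an adaptive rate limiter, and the
// profile's URL hooks and tests. It stops when the crawl finishes or when
// profile.MaxRuntime elapses, whichever comes first.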
func Crawl(profile *CrawlerProfile) (err error) {
	parsed, err := url.Parse(profile.Website)
	if err != nil {
		return err
	}
	// Fetch and parse robots.txt through a shared rate limiter
	limiter := NewLimiter(2)
	robots, err := GetRobots(profile.Ctx, profile.Website, profile.UserAgent, limiter)
	if err != nil {
		return err
	}
	// Check that the user agent is allowed to visit the website at all
	if !TestRobotsGroup(robots, profile.Website, profile.UserAgent) {
		return fmt.Errorf("user agent %q is not allowed to visit %s", profile.UserAgent, profile.Website)
	}
	// Sleep after fetching robots.txt
	limiter.Sleep()
	// Track visited URLs so each page is fetched at most once
	visitMap := NewVisitMap()
	collectorOptions := []colly.CollectorOption{
		// colly.Async() does not work with this crawler, so it stays disabled
		// colly.Async(),
		colly.UserAgent(profile.UserAgent),
		colly.CheckHead(),
	}
	if len(profile.CollyOptions) > 0 {
		collectorOptions = append(collectorOptions, profile.CollyOptions...)
	}
	// Instantiate the default collector
	c := colly.NewCollector(collectorOptions...)
	// Fall back to an empty limit rule when the caller supplied none
	if profile.CollyLimits == nil {
		profile.CollyLimits = &colly.LimitRule{DomainGlob: "*"}
	}
	if err := c.Limit(profile.CollyLimits); err != nil {
		return err
	}
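	// profile.CollyOptions lets callers extend the collector beyond the
	// defaults above; a minimal sketch, using colly's MaxDepth option:
	//
	//	profile.CollyOptions = []colly.CollectorOption{
	//	    colly.MaxDepth(3),
	//	}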
	// Pass the URL of each request down to its response context, where
	// response hooks can read it via r.Ctx.Get("url")
	c.OnRequest(func(r *colly.Request) {
		r.Ctx.Put("url", r.URL.String())
	})
	// Run the caller's hooks on every response
	c.OnResponse(func(r *colly.Response) {
		for _, fn := range profile.ResponseHooks {
			fn(r)
		}
	})
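	// A response hook is any func(*colly.Response); a minimal sketch, with
	// hypothetical logging only:
	//
	//	profile.ResponseHooks = append(profile.ResponseHooks, func(r *colly.Response) {
	//	    fmt.Println("fetched", r.Ctx.Get("url"), "-", len(r.Body), "bytes")
	//	})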
	// On every <a> element with an href attribute, decide whether to follow it
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Honor rel="nofollow"
		if strings.ToLower(e.Attr("rel")) == NoFollow {
			fmt.Printf("%s: %s\n", NoFollow, link)
			return
		}
		absoluteLink := e.Request.AbsoluteURL(link)
		currentHost, err := url.Parse(absoluteLink)
		if err != nil {
			return
		}
		// Always skip already visited links
		if visitMap.IsVisited(absoluteLink) {
			return
		}
		// Skip non-HTTP(S) links
		if !(strings.HasPrefix(absoluteLink, "http://") || strings.HasPrefix(absoluteLink, "https://")) {
			return
		}
		// Pass the URL to hooks before checking whether the link is allowed
		for _, fn := range profile.URLHooks {
			if absoluteLink != "" {
				fn(absoluteLink)
			}
		}
		// Stay on the starting host
		if currentHost.Host != parsed.Host {
			return
		}
		// Check robots.txt; the tester expects the raw (relative) link,
		// not the absolute URL
		if !robots.TestAgent(link, profile.UserAgent) {
			fmt.Println("robots: ", link)
			return
		}
		// Apply the caller's URL tests
		for _, fn := range profile.URLTests {
			if !fn(absoluteLink) {
				return
			}
		}
		// Mark the link visited before recursing so it cannot be queued twice
		visitMap.Add(absoluteLink)
		limiter.Sleep()
		fmt.Println("Visiting", absoluteLink)
		c.Visit(absoluteLink)
	})
	// Adapt the rate limit based on response status
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with status:", r.StatusCode, "\nError:", err)
		if r.StatusCode == http.StatusTooManyRequests {
			limiter.Increase()
		}
		if r.StatusCode == http.StatusOK {
			limiter.Decrease()
		}
	})
	// Bound the whole crawl by profile.MaxRuntime
	runCtx, cancel := context.WithTimeout(context.Background(), profile.MaxRuntime)
	defer cancel()
	// Start the crawl in a goroutine; c.Wait() is only meaningful with
	// colly.Async(), so completion is signalled by cancelling the context
	go func() {
		defer cancel()
		if err := c.Visit(profile.Website); err != nil {
			fmt.Println("Visit error:", err)
		}
	}()
	// Return when the crawl finishes or the timeout fires
	<-runCtx.Done()
	return nil
}
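// Example usage, as a sketch only: every CrawlerProfile field shown here is
// referenced by Crawl above, but the field types are inferred from that use,
// and the type itself is defined elsewhere in this package.
//
//	profile := &collyresponsible.CrawlerProfile{
//	    Ctx:        context.Background(),
//	    Website:    "https://example.com",
//	    UserAgent:  "MyBot/1.0 (+https://example.com/bot)",
//	    MaxRuntime: 5 * time.Minute,
//	    URLTests: []func(string) bool{
//	        func(u string) bool { return !strings.Contains(u, "/login") },
//	    },
//	}
//	if err := collyresponsible.Crawl(profile); err != nil {
//	    log.Fatal(err)
//	}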