This repository has been archived by the owner on May 30, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
fetch.go
53 lines (45 loc) · 1.69 KB
/
fetch.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
package goscholar
import (
"errors"
"github.com/PuerkitoBio/goquery"
log "github.com/Sirupsen/logrus"
"net/http"
"strings"
)
// Fetch gets a Document from a given URL. For usage, see the example of Overview.
//
// It issues a GET request to url with the User-Agent header set to USER_AGENT,
// using http.DefaultClient, and parses the response into a goquery.Document.
//
// A non-nil error (and a nil Document) is returned when:
//   - the request cannot be built or sent,
//   - the response cannot be parsed into a Document,
//   - the first <h1> of the page contains "robot" or "sorry" (robot check page), or
//   - the URL recorded on the Document contains "sorry" (captcha page).
func Fetch(url string) (doc *goquery.Document, err error) {
	log.WithFields(log.Fields{"url": url}).Info("Fetch sends request")

	// set request
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.WithFields(log.Fields{"url": url, "err": err}).Error("Failed to generate new request")
		return nil, err
	}
	req.Header.Set("User-Agent", USER_AGENT)

	// send request and get response
	client := http.DefaultClient
	res, err := client.Do(req)
	if err != nil {
		log.WithFields(log.Fields{"url": url, "err": err}).Error("Failed to get response")
		return nil, err
	}

	// generate new Document
	// NOTE(review): per the goquery documentation, NewDocumentFromResponse closes
	// the response body itself, so no explicit Close is done here — confirm
	// against the vendored goquery version.
	doc, err = goquery.NewDocumentFromResponse(res)
	if err != nil {
		log.WithFields(log.Fields{"url": url, "err": err}).Error("Generating goquery.Documentation failed")
		return nil, err
	}
	// Log success only after the error check: doc is nil on failure, so touching
	// doc.Url before the check would panic.
	log.WithFields(log.Fields{"doc.url": doc.Url}).Info("goquery.Document is generated")

	// 1. check the "Please show you're not a robot" page. See #61
	// 2. check the "We're sorry..."
	if s := doc.Find("h1").First().Text(); strings.Contains(s, "robot") || strings.Contains(s, "sorry") {
		log.WithFields(log.Fields{"h1": s, "doc.Url": doc.Url}).Error("Robot check occurs")
		return nil, errors.New("Failed to fetch Document")
	}

	// check the "To continue, please type the characters below:". See #55
	if strings.Contains(doc.Url.String(), "sorry") {
		log.WithFields(log.Fields{"doc.Url": doc.Url}).Error("Request is rejected from Google")
		return nil, errors.New("Failed to fetch Document")
	}

	return doc, nil
}