This repository has been archived by the owner on May 30, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
parse.go
148 lines (124 loc) · 3.45 KB
/
parse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
package goscholar
import (
"errors"
"fmt"
"github.com/PuerkitoBio/goquery"
log "github.com/Sirupsen/logrus"
"github.com/sotetsuk/gobibtex"
"regexp"
"strings"
"time"
)
// ParseDocument runs ParseSelection on every selection in doc matched by
// whole_article_selector and sends a pointer to each successfully parsed
// Article on ch. Selections for which ParseSelection returns an error are
// logged and skipped. ch is always closed when ParseDocument returns.
func ParseDocument(ch chan *Article, doc *goquery.Document) {
	defer close(ch)

	articles := doc.Find(whole_article_selector)
	articles.Each(func(_ int, sel *goquery.Selection) {
		article, err := ParseSelection(sel)
		if err != nil {
			log.Error(err)
			return
		}
		ch <- article
	})
}
// ParseSelection builds an Article from a single selection.
//
// The title, year, footer fields (cluster id, citation count, version count,
// info id) and side-bar link are read from s through the package's parse
// helpers. When an info id was found, the BibTeX entry is fetched (after a
// fixed one second pause) and decoded to fill in the author, journal,
// booktitle, volume, number, pages and publisher fields.
//
// It returns a nil Article and a non-nil error when fetching or decoding the
// BibTeX entry fails, or when the resulting Article does not pass isValid
// (e.g., an author profile rather than an article).
func ParseSelection(s *goquery.Selection) (a *Article, err error) {
	a = &Article{}
	a.Title = parseH3(s)
	a.Year = parseGreenLine(s)
	a.ClusterId, a.NumCite, a.NumVer, a.InfoId = parseBottom(s)
	a.Link = parseSideBar(s)

	if a.InfoId != "" {
		time.Sleep(1.0 * time.Second) // TODO: make 1.0 parameter
		a.BibTeX, err = getBibTeX(generateBibTeXLink(a.InfoId))
		if err != nil {
			return nil, err
		}
	}

	if a.BibTeX != "" {
		bibmap, err := gobibtex.Decode(a.BibTeX)
		if err != nil {
			return nil, err
		}

		// Use checked type assertions: a missing key or a value of an
		// unexpected dynamic type leaves the field at its zero value instead
		// of panicking inside library code.
		if author, ok := bibmap["author"].([]string); ok {
			a.Author = author
		}
		if journal, ok := bibmap["journal"].(string); ok {
			a.Journal = journal
		}
		if booktitle, ok := bibmap["booktitle"].(string); ok {
			a.Booktitle = booktitle
		}
		if volume, ok := bibmap["volume"].(string); ok {
			a.Volume = volume
		}
		if number, ok := bibmap["number"].(string); ok {
			a.Number = number
		}
		if pages, ok := bibmap["pages"].(string); ok {
			a.Pages = pages
		}
		if publisher, ok := bibmap["publisher"].(string); ok {
			a.Publisher = publisher
		}
	}

	if !a.isValid() {
		// errors.New is kept (rather than fmt.Errorf) so the file's "errors"
		// import remains in use.
		return nil, errors.New(fmt.Sprintf("\"%v\" is not a valid article", a.Title.Name))
	}

	return a, nil
}
// bracketTagPairRe matches two consecutive bracketed alphanumeric tags
// followed by a single whitespace character (for example "[PDF][PDF] ").
// It is compiled once at package initialisation so parseH3 does not recompile
// the pattern on every call.
var bracketTagPairRe = regexp.MustCompile(`\[[a-zA-Z0-9]*\]\[[a-zA-Z0-9]*\]\s`)

// parseH3 extracts an article's title and, when available, its link.
//
// If the element matched by article_h3_selector carries an href attribute,
// that href becomes the title URL and the element's text becomes the title
// name. Otherwise the URL is left empty and the name is the text of the
// selection's h3 element with every bracketed tag pair (see bracketTagPairRe)
// removed.
func parseH3(s *goquery.Selection) (title *Title) {
	title = &Title{}

	h3 := s.Find(article_h3_selector)
	if url, exists := h3.Attr("href"); exists {
		title.Url = url
		title.Name = h3.Text()
		return title
	}

	// No linked title: fall back to the plain h3 text, stripping tag pairs.
	name := s.Find("h3").Text()
	title.Name = bracketTagPairRe.ReplaceAllString(name, "")
	return title
}
// parseGreenLine returns the result of parseYearText applied to the text of
// the element matched by article_green_line_selector, i.e. the article's
// published year.
func parseGreenLine(s *goquery.Selection) (year string) {
	greenLine := s.Find(article_green_line_selector)
	return parseYearText(greenLine.Text())
}
// parseBottom parses the line under the abstract. It visits every anchor
// inside the element matched by article_bottom_selector and dispatches on the
// anchor's href prefix:
//
//   - "/scholar?cites"     -> clusterId (from the href) and numCite (from the text)
//   - "/scholar?cluster"   -> numVer (from the text)
//   - "/scholar?q=related" -> infoId (from the href)
//
// Values for which no matching anchor exists are returned as empty strings.
func parseBottom(s *goquery.Selection) (clusterId, numCite, numVer, infoId string) {
	anchors := s.Find(article_bottom_selector).Find("a")

	anchors.Each(func(_ int, anchor *goquery.Selection) {
		// A missing href yields "", which matches none of the prefixes below.
		href, _ := anchor.Attr("href")
		label := anchor.Text()

		switch {
		case strings.HasPrefix(href, "/scholar?cites"):
			clusterId = parseClusterIdText(href)
			numCite = parseNumCiteText(label)
		case strings.HasPrefix(href, "/scholar?cluster"):
			numVer = parseNumVerText(label)
		case strings.HasPrefix(href, "/scholar?q=related"):
			infoId = parseInfoIdText(href)
		}
	})

	return clusterId, numCite, numVer, infoId
}
// parseSideBar parses the right side link. It always returns a non-nil Link;
// when the element matched by article_sidebar_selector has no href attribute
// the Link is returned empty. Otherwise the href becomes the Link URL, and the
// name and format come from parseLinkText applied to the text of the nested
// element matched by sidebar_text_selector.
func parseSideBar(s *goquery.Selection) (link *Link) {
	link = &Link{}

	anchor := s.Find(article_sidebar_selector)
	if href, ok := anchor.Attr("href"); ok {
		link.Url = href
		label := anchor.Find(sidebar_text_selector).Text()
		link.Name, link.Format = parseLinkText(label)
	}

	return link
}