Skip to content

Commit

Permalink
Implemented multi-language scraping
Browse files Browse the repository at this point in the history
Not fully tested; only checked for characters and free companies
  • Loading branch information
Weneg committed Mar 29, 2024
1 parent 79e0b71 commit 6ca2da0
Show file tree
Hide file tree
Showing 10 changed files with 113 additions and 55 deletions.
11 changes: 6 additions & 5 deletions cmd/example/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ import (
)

func main() {
logrus.SetReportCaller(true)
scraper := goxiv.GoXIV{}
char := scraper.ScrapeCharacter(10477093)
char := scraper.ScrapeCharacter(10477093, "jp")
b, err := json.Marshal(char)
if err != nil {
logrus.Error("Error: ", err)
Expand All @@ -20,7 +21,7 @@ func main() {
if err != nil {
logrus.Error("Could not write json ", err)
}
fc := scraper.ScrapeFreecompany(9232801448574584889)
fc := scraper.ScrapeFreecompany(9232801448574584889, "jp")
b, err = json.Marshal(fc)
if err != nil {
logrus.Error("Error: ", err)
Expand All @@ -29,7 +30,7 @@ func main() {
if err != nil {
logrus.Error("Could not write json ", err)
}
pvpteam := scraper.ScrapePvPTeam("50276fadbb2edce09708ed5171a93c2d05eaf701")
pvpteam := scraper.ScrapePvPTeam("50276fadbb2edce09708ed5171a93c2d05eaf701", "eu")
b, err = json.Marshal(pvpteam)
if err != nil {
logrus.Error("Error: ", err)
Expand All @@ -38,7 +39,7 @@ func main() {
if err != nil {
logrus.Error("Could not write json ", err)
}
worldlinkshell := scraper.ScrapeLinkshell("09fc154c707570cf2a3e12f48aff36ea2506e88c", true)
worldlinkshell := scraper.ScrapeLinkshell("09fc154c707570cf2a3e12f48aff36ea2506e88c", true, "eu")
b, err = json.Marshal(worldlinkshell)
if err != nil {
logrus.Error("Error: ", err)
Expand All @@ -47,7 +48,7 @@ func main() {
if err != nil {
logrus.Error("Could not write json ", err)
}
linkshell := scraper.ScrapeLinkshell("18858823439663593", false)
linkshell := scraper.ScrapeLinkshell("18858823439663593", false, "eu")
b, err = json.Marshal(linkshell)
if err != nil {
logrus.Error("Error: ", err)
Expand Down
2 changes: 1 addition & 1 deletion cmd/lambda/lambda.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ type Request struct {

func HandleRequest(ctx context.Context, id Request) (character.Character, error) {
scraper := goxiv.GoXIV{}
temp := scraper.ScrapeCharacter(id.ID)
temp := scraper.ScrapeCharacter(id.ID, "eu")
return temp, nil
}

Expand Down
42 changes: 28 additions & 14 deletions controller/characterfunctions.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func characterTitleHandler(data *character.Character) (string, func(e *colly.HTM

func characterServerDatacenterHandler(data *character.Character) (string, func(e *colly.HTMLElement)) {
return "p.frame__chara__world", func(e *colly.HTMLElement) {

var server model.Server
var datacenter model.Datacenter
datacenter.Name = Between(e.Text, "[", "]")
Expand Down Expand Up @@ -98,7 +99,7 @@ func characterBioHandler(data *character.Character) (string, func(e *colly.HTMLE
}

func characterTraitHandler(data *character.Character) (string, func(e *colly.HTMLElement)) {
return `p.character-block__title:contains("Race/Clan/Gender")`, func(e *colly.HTMLElement) {
return `p.character-block__title:contains("Race/Clan/Gender"),p.character-block__title:contains("Volk / Stamm / Geschlecht"),p.character-block__title:contains("Race / Ethnie / Sexe"),p.character-block__title:contains("種族/部族/性別")`, func(e *colly.HTMLElement) {
temp, _ := e.DOM.Siblings().Html()
if strings.Contains(temp, "♀") {
data.Sex = "♀"
Expand All @@ -113,7 +114,7 @@ func characterTraitHandler(data *character.Character) (string, func(e *colly.HTM
}

func characterCitystageHandler(data *character.Character) (string, func(e *colly.HTMLElement)) {
return `p.character-block__title:contains("City-state")`, func(e *colly.HTMLElement) {
return `p.character-block__title:contains("City-state"),p.character-block__title:contains("Stadtstaat"),p.character-block__title:contains("Cité de départ"),p.character-block__title:contains("開始都市")`, func(e *colly.HTMLElement) {
data.Citystate = e.DOM.Siblings().Text()
}
}
Expand All @@ -125,7 +126,7 @@ func characterNamedayHandler(data *character.Character) (string, func(e *colly.H

}
func characterGuardianHandler(data *character.Character) (string, func(e *colly.HTMLElement)) {
return `p.character-block__title:contains("Guardian")`, func(e *colly.HTMLElement) {
return `p.character-block__title:contains("Guardian"),p.character-block__title:contains("Schutzgott"),p.character-block__title:contains("Divinité"),p.character-block__title:contains("守護神")`, func(e *colly.HTMLElement) {
temp := e.DOM.SiblingsFiltered("p.character-block__name").Text()
data.Guardian = temp
}
Expand All @@ -151,7 +152,7 @@ func characterClassSpecialistHandler(data *character.Character) (string, func(e
} else {
work := BeforeLast(exp, " /")
if work != "--" {
tempexp, err := strconv.ParseInt(strings.ReplaceAll(work, ",", ""), 10, 64)
tempexp, err := strconv.ParseInt(strings.ReplaceAll(strings.ReplaceAll(strings.ReplaceAll(work, ",", ""), ".", ""), " ", ""), 10, 64)
if err != nil {
logrus.Error("Error while parsing EXP ", work, data.ID)
}
Expand Down Expand Up @@ -185,19 +186,22 @@ func characterClassHandler(data *character.Character) (string, func(e *colly.HTM
exp := e.DOM.SiblingsFiltered("div.character__job__exp").Text()
work := BeforeLast(exp, " /")
var class character.Class
if work == "--" && (strings.Contains(e.Text, "Blue Mage") || level == "90") {
if work == "--" && ((strings.Contains(e.Text, "Blue Mage") || strings.Contains(e.Text, "Blaumagier") || strings.Contains(e.Text, "Mage bleu") || strings.Contains(e.Text, "青魔道士")) || level == "90") {
class.Max = true
class.Name = e.Text
if strings.Contains(e.Text, "Blue Mage") {
class.Level = 70
class.Name = "Blue Mage"
if strings.Contains(e.Text, "Blue Mage") || strings.Contains(e.Text, "Blaumagier") || strings.Contains(e.Text, "Mage bleu") || strings.Contains(e.Text, "青魔道士") {
class.Level = 80
class.Name = BeforeFirst(e.Text, "(")
if class.Name == "" {
class.Name = BeforeFirst(e.Text, "[")
}
} else {
class.Level = 90
}
} else {
work := BeforeLast(exp, " /")
if work != "--" {
tempexp, err := strconv.ParseInt(strings.ReplaceAll(work, ",", ""), 10, 64)
tempexp, err := strconv.ParseInt(strings.ReplaceAll(strings.ReplaceAll(strings.ReplaceAll(work, ",", ""), ".", ""), " ", ""), 10, 64)
if err != nil {
logrus.Error("Error while parsing EXP ", work, data.ID)
}
Expand All @@ -211,8 +215,11 @@ func characterClassHandler(data *character.Character) (string, func(e *colly.HTM
logrus.Error("Error while parsing level")
}
class.Level = templevel
if strings.Contains(e.Text, "Blue Mage") {
class.Name = "Blue Mage"
if strings.Contains(e.Text, "Blue Mage") || strings.Contains(e.Text, "Blaumagier") || strings.Contains(e.Text, "Mage bleu") || strings.Contains(e.Text, "青魔道士") {
class.Name = BeforeFirst(e.Text, "(")
if class.Name == "" {
class.Name = BeforeFirst(e.Text, "[")
}
} else {
class.Name = e.Text
}
Expand All @@ -236,7 +243,7 @@ func characterBozjaHandler(data *character.Character) (string, func(e *colly.HTM
var temp character.Bozja
if strings.ReplaceAll(exp, ",", "") != "--" {

tempexp, err := strconv.ParseInt(strings.ReplaceAll(exp, ",", ""), 10, 64)
tempexp, err := strconv.ParseInt(strings.ReplaceAll(strings.ReplaceAll(exp, ",", ""), ".", ""), 10, 64)
if err != nil {
logrus.Error("Error while parsing EXP ", exp)
}
Expand All @@ -259,7 +266,7 @@ func characterEurekaHandler(data *character.Character) (string, func(e *colly.HT
var temp character.Eureka
if strings.ReplaceAll(exp, ",", "") != "--" {

tempexp, err := strconv.ParseInt(strings.ReplaceAll(exp, ",", ""), 10, 64)
tempexp, err := strconv.ParseInt(strings.ReplaceAll(strings.ReplaceAll(exp, ",", ""), ".", ""), 10, 64)
if err != nil {
logrus.Error("Error while parsing EXP ", exp)
}
Expand Down Expand Up @@ -318,7 +325,14 @@ func characterAchievementHandler(data *character.Character) (string, func(e *col
}
achievement.Unlocked = time.Unix(tempTime, 0)
achievement.ID = tempID
achievement.Name = After(BeforeLast(e.ChildText("p.entry__activity__txt"), "\""), "\"")
tempName := After(BeforeLast(e.ChildText("p.entry__activity__txt"), "\""), "\"")
if tempName == "" {
tempName = BeforeFirst(e.ChildText("p.entry__activity__txt"), " aus der Kategorie „")
}
if tempName == "" {
tempName = After(BeforeLast(e.ChildText("p.entry__activity__txt"), "」"), "「")
}
achievement.Name = tempName
data.Achievements = append(data.Achievements, achievement)
}

Expand Down
50 changes: 39 additions & 11 deletions controller/characterscraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ const (
CHARACTERENDPOINT = "/lodestone/character/"
)

func (c Controller) ScrapeCharacter(id int64) character.Character {
func (c Controller) ShallowScrapeCharacter(id int64) character.Character {
if id == 0 {
return character.Character{}
}
Expand All @@ -27,6 +27,26 @@ func (c Controller) ScrapeCharacter(id int64) character.Character {
colly.MaxDepth(2),
colly.Async(true),
colly.AllowURLRevisit(),
colly.TraceHTTP(),
)
var charactere character.Character
charactere.ID = id
logger.Info("Waiting for collector")
collector.Wait()
return charactere
}

func (c Controller) ScrapeCharacter(id int64, lang string) character.Character {
if id == 0 {
return character.Character{}
}
logger := logrus.WithField("character", id)
logrus.Infof("Scraping Character %v", id)
collector := colly.NewCollector(
colly.MaxDepth(2),
colly.Async(true),
colly.AllowURLRevisit(),
colly.TraceHTTP(),
)
collector.SetRequestTimeout(60 * time.Second)
if c.proxyfunc != nil {
Expand Down Expand Up @@ -84,8 +104,9 @@ func (c Controller) ScrapeCharacter(id int64) character.Character {
}
}
collector.OnRequest(func(r *colly.Request) {

logger.Debugf("Visiting %s", r.URL.String())
if !(strings.Contains(r.URL.String(), "friend") || strings.Contains(r.URL.String(), "achievement") || r.URL.String() == fmt.Sprintf("%v%v%d", URL, CHARACTERENDPOINT, id)) {
if !(strings.Contains(r.URL.String(), "friend") || strings.Contains(r.URL.String(), "achievement") || r.URL.String() == fmt.Sprintf("%v%v%d", fmt.Sprintf(URL, lang), CHARACTERENDPOINT, id)) {
r.Headers.Set("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 14_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1")
}
})
Expand All @@ -97,7 +118,7 @@ func (c Controller) ScrapeCharacter(id int64) character.Character {
Expires: expire,
}
temp = append(temp, &cookie)
err := collector.SetCookies(URL, temp)
err := collector.SetCookies(fmt.Sprintf(URL, lang), temp)
if err != nil {
return character.Character{}
}
Expand All @@ -107,17 +128,22 @@ func (c Controller) ScrapeCharacter(id int64) character.Character {
for _, f := range characterHandlers() {
collector.OnHTML(f(&charactere))
}
MAINURL := fmt.Sprintf("%v%v%d", URL, CHARACTERENDPOINT, id)
CLASSURL := fmt.Sprintf("%v%v%d/class_job", URL, CHARACTERENDPOINT, id)
MINIONURL := fmt.Sprintf("%v%v%d/minion", URL, CHARACTERENDPOINT, id)
MOUNTURL := fmt.Sprintf("%v%v%d/mount", URL, CHARACTERENDPOINT, id)
ACHIEVEMENTURL := fmt.Sprintf("%v%v%d/achievement", URL, CHARACTERENDPOINT, id)
FRIENDURL := fmt.Sprintf("%v%v%d/friend", URL, CHARACTERENDPOINT, id)
MAINURL := fmt.Sprintf("%v%v%d", fmt.Sprintf(URL, lang), CHARACTERENDPOINT, id)
CLASSURL := fmt.Sprintf("%v%v%d/class_job", fmt.Sprintf(URL, lang), CHARACTERENDPOINT, id)
MINIONURL := fmt.Sprintf("%v%v%d/minion", fmt.Sprintf(URL, lang), CHARACTERENDPOINT, id)
MOUNTURL := fmt.Sprintf("%v%v%d/mount", fmt.Sprintf(URL, lang), CHARACTERENDPOINT, id)
ACHIEVEMENTURL := fmt.Sprintf("%v%v%d/achievement", fmt.Sprintf(URL, lang), CHARACTERENDPOINT, id)
FRIENDURL := fmt.Sprintf("%v%v%d/friend", fmt.Sprintf(URL, lang), CHARACTERENDPOINT, id)
// Set error handler
achievements := false
friends := false
collector.OnHTML("li.btn__pager__current", func(e *colly.HTMLElement) {
if strings.Contains(e.Text, "Page 1 of") {
if (strings.Contains(e.Request.URL.String(), "achievement") && achievements) || (strings.Contains(e.Request.URL.String(), "friend") && friends) {
return
}
if strings.Contains(e.Text, "Page 1 of") || strings.Contains(e.Text, "Seite 1 (von") || strings.Contains(e.Text, "Page 1 ") || strings.Contains(e.Text, "1ページ") {
tempID, err := strconv.ParseInt(strings.ReplaceAll(After(strings.ReplaceAll(strings.ReplaceAll(e.Text, "ページ", ""), "/", " "), " "), ")", ""), 10, 0)

tempID, err := strconv.ParseInt(After(e.Text, " "), 10, 0)
if err != nil {
logrus.Error("Error while parsing ID ", tempID)
}
Expand All @@ -126,8 +152,10 @@ func (c Controller) ScrapeCharacter(id int64) character.Character {

if strings.Contains(e.Request.URL.String(), "achievement") {
url = fmt.Sprintf("%v/?page=", ACHIEVEMENTURL)
achievements = true
} else if strings.Contains(e.Request.URL.String(), "friend") {
url = fmt.Sprintf("%v/?page=", FRIENDURL)
friends = true
}
for i = 2; i <= tempID; i++ {
// time.Sleep(time.Duration(rand.Intn(3)) * time.Second)
Expand Down
25 changes: 13 additions & 12 deletions controller/freecompanyfunctions.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package controller

import (
"regexp"
"strconv"
"strings"
"time"
Expand All @@ -13,7 +14,7 @@ import (
)

const (
LEADERICON = "https://img.finalfantasyxiv.com/lds/h/Z/W5a6yeRyN2eYiaV-AGU7mJKEhs.png"
LEADERICON = "https://lds-img.finalfantasyxiv.com/h/Z/W5a6yeRyN2eYiaV-AGU7mJKEhs.png"
)

func freecompanyNameHandler(data *freecompany.FreeCompany) (string, func(e *colly.HTMLElement)) {
Expand All @@ -27,11 +28,14 @@ func freecompanyServerHandler(data *freecompany.FreeCompany) (string, func(e *co
return `p.entry__freecompany__gc:has(i)`, func(e *colly.HTMLElement) {
var server model.Server
var datacenter model.Datacenter
datacenter.Name = Between(e.Text, "(", ")")
server.Datacenter = datacenter
server.Name = strings.ReplaceAll(strings.ReplaceAll(BeforeLast(e.Text, "("), "\t", ""), "\n", "")

datacenter.Name = Between(e.Text, "[", "]")
server.Datacenter = datacenter
re := regexp.MustCompile(`[^a-zA-Z]+`)
temp := BeforeLast(e.Text, "[")
server.Name = re.ReplaceAllString(temp, "")
data.Server = &server

}

}
Expand Down Expand Up @@ -71,14 +75,12 @@ func freecompanyGrandcompanyReputationHandler(data *freecompany.FreeCompany) (st
}

func freecompanyRankHandler(data *freecompany.FreeCompany) (string, func(e *colly.HTMLElement)) {
return `h3.heading--lead:contains("Rank")`, func(e *colly.HTMLElement) {
if e.Text == "Rank" {
rank, err := strconv.ParseInt(e.DOM.NextFiltered("p.freecompany__text").Text(), 10, 64)
if err != nil {
logrus.Error("Error while parsing rank ")
}
data.Rank = rank
return `h3.heading--lead:contains("Rank"),h3.heading--lead:contains("Rang"),h3.heading--lead:contains("ランク")`, func(e *colly.HTMLElement) {
rank, err := strconv.ParseInt(e.DOM.NextFiltered("p.freecompany__text").Text(), 10, 64)
if err != nil {
logrus.Error("Error while parsing rank ")
}
data.Rank = rank

}

Expand Down Expand Up @@ -242,7 +244,6 @@ func freecompanyMemberHandler(data *freecompany.FreeCompany) (string, func(e *co
data.LeaderURL = e.Attr("href")
}
}

}
func freecompanyAcceptsHandler(data *freecompany.FreeCompany) (string, func(e *colly.HTMLElement)) {
return `p.freecompany__recruitment`, func(e *colly.HTMLElement) {
Expand Down
16 changes: 11 additions & 5 deletions controller/freecompanyscraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ import (
)

const (
URL = "http://eu.finalfantasyxiv.com"
URL = "http://%v.finalfantasyxiv.com"
FREECOMPANYENDPOINT = "/lodestone/freecompany/"
)

func (c Controller) ScrapeFreecompany(id uint64) freecompany.FreeCompany {
func (c Controller) ScrapeFreecompany(id uint64, lang string) freecompany.FreeCompany {
collector := colly.NewCollector(
colly.MaxDepth(2),
colly.AllowURLRevisit(),
Expand Down Expand Up @@ -74,8 +74,8 @@ func (c Controller) ScrapeFreecompany(id uint64) freecompany.FreeCompany {
for _, f := range freecompanyHandlers() {
collector.OnHTML(f(&company))
}
MAINURL := fmt.Sprintf("%v%v%d", URL, FREECOMPANYENDPOINT, id)
MEMBERURL := fmt.Sprintf("%v%v%d/member", URL, FREECOMPANYENDPOINT, id)
MAINURL := fmt.Sprintf("%v%v%d", fmt.Sprintf(URL, lang), FREECOMPANYENDPOINT, id)
MEMBERURL := fmt.Sprintf("%v%v%d/member", fmt.Sprintf(URL, lang), FREECOMPANYENDPOINT, id)
if c.parallel <= 0 {
err := collector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 3})
if err != nil {
Expand All @@ -87,8 +87,13 @@ func (c Controller) ScrapeFreecompany(id uint64) freecompany.FreeCompany {
logrus.Error("Setting limit failed:", err)
}
}
members := false

collector.OnHTML("li.btn__pager__current", func(e *colly.HTMLElement) {
tempID, err := strconv.ParseInt(After(e.Text, " "), 10, 0)
if members {
return
}
tempID, err := strconv.ParseInt(strings.ReplaceAll(After(After(strings.ReplaceAll(e.Text, "ページ", ""), "/"), " "), ")", ""), 10, 0)
if err != nil {
logrus.Error("Error while parsing ID ", tempID)
}
Expand All @@ -102,6 +107,7 @@ func (c Controller) ScrapeFreecompany(id uint64) freecompany.FreeCompany {
logrus.Println("Visiting failed:", err)
}
}
members = true
}
})

Expand Down
Loading

0 comments on commit 6ca2da0

Please sign in to comment.