Skip to content

Commit

Permalink
Merge pull request #7 from paulvollmer/filesource
Browse files Browse the repository at this point in the history
File as source
  • Loading branch information
paulvollmer authored Sep 6, 2019
2 parents e5e5321 + cd4cda3 commit e555c77
Show file tree
Hide file tree
Showing 7 changed files with 161 additions and 21 deletions.
1 change: 1 addition & 0 deletions .goreleaser.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ builds:
goos:
- darwin
- linux
- windows
goarch:
- amd64
- 386
11 changes: 7 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION=0.1.1
VERSION=0.2.0

all: lint test

Expand All @@ -11,15 +11,18 @@ lint:

test: build
@./htmltable2csv -v
@./htmltable2csv -url "https://www.w3schools.com/html/html_tables.asp" -selector "#customers > tbody > tr" -csv data.csv
@./htmltable2csv -source "./scraper/fixture/test1.html" -selector "table > tbody > tr" -csv data_file.csv
@./htmltable2csv -source "https://www.w3schools.com/html/html_tables.asp" -selector "#customers > tbody > tr" -csv data_url.csv

test-all:
@go test all
@go test ./...
@make test

release:
git tag -a v${VERSION} -m "Version ${VERSION}"
git push origin v${VERSION}
goreleaser
release-dry:
goreleaser --skip-publish --skip-validate --snapshot

.PHONY: all lint build test test-all release
.PHONY: all lint build test test-all release release-dry
8 changes: 4 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func usage() {

func main() {
flagVersion := flag.Bool("v", false, "Print the version and exit")
flagURL := flag.String("url", "", "The website url")
flagSource := flag.String("source", "", "The filepath or website url")
flagSelector := flag.String("selector", "", "The table css selector")
flagCSV := flag.String("csv", "", "The csv filename. if empty, print csv to stdout")
flag.Usage = usage
Expand All @@ -38,8 +38,8 @@ func main() {
os.Exit(0)
}

if *flagURL == "" {
fmt.Println("Flag -url cannot be empty")
if *flagSource == "" {
fmt.Println("Flag -source cannot be empty")
os.Exit(1)
}

Expand All @@ -50,7 +50,7 @@ func main() {

var err error
scraper := htmltable2csv.Scraper{}
scraper.URL = *flagURL
scraper.Source = *flagSource
scraper.Selector = *flagSelector
_, err = scraper.Scrape()
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "htmltable2csv",
"version": "0.1.0",
"version": "0.2.0",
"description": "htmltable2csv is a tool to parse an HTML table and store the data as CSV. The output can be written to a file or printed to stdout",
"scripts": {
"test": "make test"
Expand Down
22 changes: 22 additions & 0 deletions scraper/fixture/test1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<table>
<thead>
<tr>
<td>key</td>
<td>value</td>
</tr>
</thead>
<tbody>
<tr>
<td>foo</td>
<td>1</td>
</tr>
<tr>
<td>bar</td>
<td>2</td>
</tr>
<tr>
<td>baz</td>
<td>3</td>
</tr>
</tbody>
</table>
44 changes: 32 additions & 12 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,53 @@ import (
"fmt"
"io"
"net/http"
"net/url"
"os"

"github.com/PuerkitoBio/goquery"
)

// Scraper store the URL, Selector and collected Data
// Scraper stores the Source (a file path or website URL), the CSS Selector
// used to find the table, and the collected Data.
type Scraper struct {
URL string
Source string
Selector string
Data [][]string
}

// Scrape download and parse the table data
func (s *Scraper) Scrape() ([][]string, error) {
var data = make([][]string, 0)
res, err := http.Get(s.URL)
if err != nil {
return data, err
}
defer res.Body.Close()
if res.StatusCode != 200 {
return data, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
}
doc, err := goquery.NewDocumentFromReader(res.Body)

var doc goquery.Document

_, err := url.ParseRequestURI(s.Source)
if err != nil {
return data, err
f, err := os.Open(s.Source)
if err != nil {
return data, err
}
defer f.Close()
tmp, err := goquery.NewDocumentFromReader(f)
if err != nil {
return data, err
}
doc = *tmp
} else {
res, err := http.Get(s.Source)
if err != nil {
return data, err
}
defer res.Body.Close()
if res.StatusCode != 200 {
return data, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
}
tmp, err := goquery.NewDocumentFromReader(res.Body)
if err != nil {
return data, err
}
doc = *tmp
}

// Find the table
doc.Find(s.Selector).Each(func(i int, table *goquery.Selection) {
dataRow := make([]string, 0)
Expand Down
94 changes: 94 additions & 0 deletions scraper/scraper_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package htmltable2csv

import (
"net/http"
"net/http/httptest"
"testing"
)

// TestScraper checks that Scrape returns the expected table rows both when
// Source is a local fixture file and when it is a URL served by a local
// httptest server.
func TestScraper(t *testing.T) {
	t.Run("source file", func(t *testing.T) {
		scraper := Scraper{}
		scraper.Source = "./fixture/test1.html"
		scraper.Selector = "table > tbody > tr"
		data, err := scraper.Scrape()
		if err != nil {
			// Stop here: the returned rows are meaningless once Scrape failed,
			// so comparing them would only add noise (or panic).
			t.Fatal(err)
		}
		dataEqual(t, data)
	})

	t.Run("source url", func(t *testing.T) {
		// Start a local HTTP server that serves the same table as the fixture.
		server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
			_, err := rw.Write([]byte(`<table>
<thead>
<tr>
<td>key</td>
<td>value</td>
</tr>
</thead>
<tbody>
<tr>
<td>foo</td>
<td>1</td>
</tr>
<tr>
<td>bar</td>
<td>2</td>
</tr>
<tr>
<td>baz</td>
<td>3</td>
</tr>
</tbody>
</table>`))
			if err != nil {
				t.Errorf("writing response: %v", err)
			}
		}))
		defer server.Close()

		scraper := Scraper{}
		scraper.Source = server.URL
		scraper.Selector = "table > tbody > tr"
		data, err := scraper.Scrape()
		if err != nil {
			// Same reasoning as above: do not inspect data after a failure.
			t.Fatal(err)
		}
		dataEqual(t, data)
	})
}

// dataEqual asserts that data holds exactly the three key/value rows of the
// test table (foo/1, bar/2, baz/3) and reports every mismatch through t.
//
// Row and cell counts are checked before any element is indexed, so a short
// or empty result is reported as a test failure instead of triggering an
// index-out-of-range panic.
func dataEqual(t *testing.T, data [][]string) {
	t.Helper()

	want := [][]string{
		{"foo", "1"},
		{"bar", "2"},
		{"baz", "3"},
	}

	if len(data) != len(want) {
		t.Errorf("data not equal: got %d rows, want %d", len(data), len(want))
	}

	for i, wantRow := range want {
		if i >= len(data) {
			// Missing rows were already reported by the length check above.
			break
		}
		if len(data[i]) != len(wantRow) {
			t.Errorf("data[%d] not equal: got %d cells, want %d", i, len(data[i]), len(wantRow))
			// Cell indexes are not trustworthy for a row of the wrong width.
			continue
		}
		for j, wantCell := range wantRow {
			if data[i][j] != wantCell {
				t.Errorf("data[%d][%d] not equal: got %q, want %q", i, j, data[i][j], wantCell)
			}
		}
	}
}

0 comments on commit e555c77

Please sign in to comment.