Support subdomains (#91)
* Support subdomains. closes #89
marybonilla2231 authored Jun 9, 2024
1 parent d2b7d0e commit 78b2e50
Showing 7 changed files with 62 additions and 17 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -32,6 +32,7 @@ Crawls web pages and prints any link it can find.
- user-defined headers, same as curl: `-header "ONE: 1" -header "TWO: 2" -header @headers-file`
- tag filter - allow to specify tags to crawl for (single: `-tag a -tag form`, multiple: `-tag a,form`, or mixed)
- url ignore - allow to ignore urls with matched substrings from crawling (i.e.: `-ignore logout`)
- subdomains support - extend depth crawling to subdomains as well (e.g. `crawley -subdomains http://some-test.site` will also be able to crawl `http://www.some-test.site`)

# examples

@@ -94,6 +95,8 @@ possible flags with default values:
suppress info and error messages in stderr
-skip-ssl
skip ssl verification
-subdomains
support subdomains (e.g. if www.domain.com found, recurse over it)
-tag value
tags filter, single or comma-separated tag names
-timeout duration
3 changes: 3 additions & 0 deletions cmd/crawley/main.go
@@ -41,6 +41,7 @@ var (
fBrute, fNoHeads bool
fSkipSSL, fScanJS bool
fScanCSS, fScanALL bool
fSubdomains bool
fDirsPolicy, fProxyAuth string
fRobotsPolicy, fUA string
fDelay time.Duration
@@ -163,6 +164,7 @@ func parseFlags() (rv []crawler.Option, err error) {
crawler.WithIgnored(ignored.Values),
crawler.WithProxyAuth(fProxyAuth),
crawler.WithTimeout(fTimeout),
crawler.WithSubdomains(fSubdomains),
}

return rv, nil
@@ -184,6 +186,7 @@ func setupFlags() {

flag.BoolVar(&fScanALL, "all", false, "scan all known sources (js/css/...)")
flag.BoolVar(&fBrute, "brute", false, "scan html comments")
flag.BoolVar(&fSubdomains, "subdomains", false, "support subdomains (e.g. if www.domain.com found, recurse over it)")
flag.BoolVar(&fScanCSS, "css", false, "scan css for urls")
flag.BoolVar(&fNoHeads, "headless", false, "disable pre-flight HEAD requests")
flag.BoolVar(&fScanJS, "js", false, "scan js code for endpoints")
5 changes: 5 additions & 0 deletions internal/crawler/config.go
@@ -29,6 +29,7 @@ type config struct {
NoHEAD bool
ScanJS bool
ScanCSS bool
Subdomains bool
}

func (c *config) validate() {
@@ -59,5 +60,9 @@ func (c *config) String() (rv string) {
sb.WriteString(" +css")
}

if c.Subdomains {
sb.WriteString(" +subdomains")
}

return sb.String()
}
2 changes: 1 addition & 1 deletion internal/crawler/crawler.go
@@ -175,7 +175,7 @@ func (c *Crawler) tryEnqueue(base *url.URL, r *crawlResult) (yes bool) {
return
}

if !canCrawl(base, u, c.cfg.Depth) ||
if !canCrawl(base, u, c.cfg.Depth, c.cfg.Subdomains) ||
c.robots.Forbidden(u.Path) ||
(c.cfg.Dirs == DirsOnly && isResorce(u.Path)) {
return
7 changes: 7 additions & 0 deletions internal/crawler/options.go
@@ -127,3 +127,10 @@ func WithScanCSS(v bool) Option {
c.ScanCSS = v
}
}

// WithSubdomains enables subdomain scanning.
func WithSubdomains(v bool) Option {
return func(c *config) {
c.Subdomains = v
}
}
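
For context, `WithSubdomains` follows the same functional-options pattern as the other `With*` helpers: each option is a closure that mutates the unexported `config`. A minimal sketch of how such options are consumed — the `apply` helper here is illustrative, not the crawler's actual constructor plumbing:

```go
// Illustrative only: how functional options such as WithSubdomains
// are typically applied; the crawler's real constructor may differ.
func apply(opts ...Option) *config {
	c := &config{}

	for _, opt := range opts {
		opt(c) // each Option mutates the config in place
	}

	return c
}

// cfg := apply(WithSubdomains(true))
// cfg.Subdomains == true
```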
24 changes: 21 additions & 3 deletions internal/crawler/util.go
@@ -71,9 +71,27 @@ func prepareFilter(tags []string) links.TokenFilter {
}
}

func canCrawl(a, b *url.URL, d int) (yes bool) {
func canCrawl(a, b *url.URL, d int, subdomains bool) (yes bool) {
if a.Host != b.Host {
return
if subdomains {
domainA := strings.Split(a.Host, ".")
domainB := strings.Split(b.Host, ".")
if len(domainA) >= len(domainB) {
// The base domain must be shorter than the found domain
return
}
j := len(domainB) - 1
for i := len(domainA) - 1; i >= 0 && j >= 0; i-- {
// Traverse both domains from the end, checking that their trailing labels match
if domainA[i] != domainB[j] {
// trailing labels differ: not the same parent domain
return
}
j--
}
} else {
return
}
}

var apath, bpath string
@@ -104,7 +122,7 @@ func relativeDepth(base, sub string) (n int, ok bool) {
sn = path.Clean(sub)
)

if len(sn) <= len(bn) {
if len(sn) < len(bn) {
return
}

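
Taken together, the new host check reduces to a right-to-left comparison of dot-separated labels; the `relativeDepth` change from `<=` to `<` then lets a subdomain URL whose cleaned path equals the base path through. A self-contained sketch of the label logic — the name `isSubdomainOf` is invented for illustration:

```go
package main

import (
	"fmt"
	"strings"
)

// isSubdomainOf reports whether found is a strict subdomain of base,
// mirroring what canCrawl does when -subdomains is enabled: base must
// have fewer labels, and all of them must match the tail of found when
// both hosts are walked from the right.
func isSubdomainOf(found, base string) bool {
	baseLabels := strings.Split(base, ".")
	foundLabels := strings.Split(found, ".")

	// The base domain must be shorter than the found domain.
	if len(baseLabels) >= len(foundLabels) {
		return false
	}

	j := len(foundLabels) - 1

	for i := len(baseLabels) - 1; i >= 0; i-- {
		if baseLabels[i] != foundLabels[j] {
			return false // trailing labels diverge
		}

		j--
	}

	return true
}

func main() {
	fmt.Println(isSubdomainOf("www.some-test.site", "some-test.site")) // true
	fmt.Println(isSubdomainOf("some-test.site", "www.some-test.site")) // false: base has more labels
	fmt.Println(isSubdomainOf("a.other.site", "some-test.site"))       // false: labels diverge
}
```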
35 changes: 22 additions & 13 deletions internal/crawler/util_test.go
@@ -120,6 +120,7 @@ func TestCanCrawl(t *testing.T) {
b *url.URL
u *url.URL
d int
subdomains bool
}

base, _ := url.Parse("http://test/some/path")
@@ -128,31 +129,39 @@
url1, _ := url.Parse("http://test/some/path/even")
url2, _ := url.Parse("http://test/some/path/even/more")
url3, _ := url.Parse("http://test")
url4, _ := url.Parse("http://abc.test/some")
url5, _ := url.Parse("http://abc.test/some/path")
url6, _ := url.Parse("http://abc.test/some/path/even")

tests := []struct {
name string
args args
wantYes bool
}{
{"url0-1", args{b: base, u: url0, d: 1}, false},
{"url1-0", args{b: base, u: url1, d: 0}, false},
{"url1-1", args{b: base, u: url1, d: 1}, true},
{"url2-0", args{b: base, u: url2, d: 0}, false},
{"url2-1", args{b: base, u: url2, d: 1}, false},
{"url2-2", args{b: base, u: url2, d: 2}, true},
{"url2-3", args{b: base, u: url2, d: 3}, true},
{"badh-1", args{b: base, u: badh, d: 1}, false},
{"url2-0-1", args{b: base, u: url0, d: -1}, false},
{"url2-1-1", args{b: base, u: url1, d: -1}, true},
{"url2-2-1", args{b: base, u: url2, d: -1}, true},
{"url3-3", args{b: base, u: url3, d: 0}, false},
{"url0-1", args{b: base, u: url0, d: 1, subdomains: false}, false},
{"url1-0", args{b: base, u: url1, d: 0, subdomains: false}, false},
{"url1-1", args{b: base, u: url1, d: 1, subdomains: false}, true},
{"url2-0", args{b: base, u: url2, d: 0, subdomains: false}, false},
{"url2-1", args{b: base, u: url2, d: 1, subdomains: false}, false},
{"url2-2", args{b: base, u: url2, d: 2, subdomains: false}, true},
{"url2-3", args{b: base, u: url2, d: 3, subdomains: false}, true},
{"badh-1", args{b: base, u: badh, d: 1, subdomains: false}, false},
{"url2-0-1", args{b: base, u: url0, d: -1, subdomains: false}, false},
{"url2-1-1", args{b: base, u: url1, d: -1, subdomains: false}, true},
{"url2-2-1", args{b: base, u: url2, d: -1, subdomains: false}, true},
{"url3-3", args{b: base, u: url3, d: 0, subdomains: false}, false},
{"url4-1", args{b: base, u: url4, d: 1000, subdomains: true}, false},
{"url5-1", args{b: base, u: url5, d: -1, subdomains: true}, true},
{"url5-2", args{b: base, u: url5, d: -1, subdomains: false}, false},
{"url6-1", args{b: base, u: url6, d: 1, subdomains: true}, true},
{"url6-2", args{b: base, u: url6, d: 0, subdomains: true}, false},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()

if gotYes := canCrawl(tc.args.b, tc.args.u, tc.args.d); gotYes != tc.wantYes {
if gotYes := canCrawl(tc.args.b, tc.args.u, tc.args.d, tc.args.subdomains); gotYes != tc.wantYes {
t.Errorf("canCrawl() = %v, want %v", gotYes, tc.wantYes)
}
})
