From 78b2e5038e938a8fd478b73974173dfad5d13e64 Mon Sep 17 00:00:00 2001
From: marybonilla2231
Date: Sun, 9 Jun 2024 18:59:18 +0300
Subject: [PATCH] Support subdomains (#91)

* Support subdomains. Closes #89
---
 README.md                     |  3 +++
 cmd/crawley/main.go           |  3 +++
 internal/crawler/config.go    |  5 +++++
 internal/crawler/crawler.go   |  2 +-
 internal/crawler/options.go   |  7 +++++++
 internal/crawler/util.go      | 24 +++++++++++++++++++++---
 internal/crawler/util_test.go | 35 ++++++++++++++++++++++-------------
 7 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 94d126b..5a57900 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ Crawls web pages and prints any link it can find.
 - user-defined headers, same as curl: `-header "ONE: 1" -header "TWO: 2" -header @headers-file`
 - tag filter - allow to specify tags to crawl for (single: `-tag a -tag form`, multiple: `-tag a,form`, or mixed)
 - url ignore - allow to ignore urls with matched substrings from crawling (i.e.: `-ignore logout`)
+- subdomains support - allow depth crawling of subdomains as well (e.g. `crawley http://some-test.site` will be able to crawl `http://www.some-test.site`)
 
 # examples
 
@@ -94,6 +95,8 @@ possible flags with default values:
     suppress info and error messages in stderr
 -skip-ssl
     skip ssl verification
+-subdomains
+    support subdomains (e.g. if www.domain.com found, recurse over it)
 -tag value
     tags filter, single or comma-separated tag names
 -timeout duration
diff --git a/cmd/crawley/main.go b/cmd/crawley/main.go
index 75d4441..26ebf9a 100644
--- a/cmd/crawley/main.go
+++ b/cmd/crawley/main.go
@@ -41,6 +41,7 @@ var (
 	fBrute, fNoHeads        bool
 	fSkipSSL, fScanJS       bool
 	fScanCSS, fScanALL      bool
+	fSubdomains             bool
 	fDirsPolicy, fProxyAuth string
 	fRobotsPolicy, fUA      string
 	fDelay                  time.Duration
@@ -163,6 +164,7 @@ func parseFlags() (rv []crawler.Option, err error) {
 		crawler.WithIgnored(ignored.Values),
 		crawler.WithProxyAuth(fProxyAuth),
 		crawler.WithTimeout(fTimeout),
+		crawler.WithSubdomains(fSubdomains),
 	}
 
 	return rv, nil
@@ -184,6 +186,7 @@ func setupFlags() {
 	flag.BoolVar(&fScanALL, "all", false, "scan all known sources (js/css/...)")
 	flag.BoolVar(&fBrute, "brute", false, "scan html comments")
+	flag.BoolVar(&fSubdomains, "subdomains", false, "support subdomains (e.g. if www.domain.com found, recurse over it)")
 	flag.BoolVar(&fScanCSS, "css", false, "scan css for urls")
 	flag.BoolVar(&fNoHeads, "headless", false, "disable pre-flight HEAD requests")
 	flag.BoolVar(&fScanJS, "js", false, "scan js code for endpoints")
diff --git a/internal/crawler/config.go b/internal/crawler/config.go
index 573c53f..3c3793c 100644
--- a/internal/crawler/config.go
+++ b/internal/crawler/config.go
@@ -29,6 +29,7 @@ type config struct {
 	NoHEAD     bool
 	ScanJS     bool
 	ScanCSS    bool
+	Subdomains bool
 }
 
 func (c *config) validate() {
@@ -59,5 +60,9 @@ func (c *config) String() (rv string) {
 		sb.WriteString(" +css")
 	}
 
+	if c.Subdomains {
+		sb.WriteString(" +subdomains")
+	}
+
 	return sb.String()
 }
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index cbcb6b7..89e0a0a 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -175,7 +175,7 @@ func (c *Crawler) tryEnqueue(base *url.URL, r *crawlResult) (yes bool) {
 		return
 	}
 
-	if !canCrawl(base, u, c.cfg.Depth) ||
+	if !canCrawl(base, u, c.cfg.Depth, c.cfg.Subdomains) ||
 		c.robots.Forbidden(u.Path) ||
 		(c.cfg.Dirs == DirsOnly && isResorce(u.Path)) {
 		return
diff --git a/internal/crawler/options.go b/internal/crawler/options.go
index d337840..7106812 100644
--- a/internal/crawler/options.go
+++ b/internal/crawler/options.go
@@ -127,3 +127,10 @@ func WithScanCSS(v bool) Option {
 		c.ScanCSS = v
 	}
 }
+
+// WithSubdomains enables subdomain crawling.
+func WithSubdomains(v bool) Option {
+	return func(c *config) {
+		c.Subdomains = v
+	}
+}
diff --git a/internal/crawler/util.go b/internal/crawler/util.go
index 59dbb75..9926a1e 100644
--- a/internal/crawler/util.go
+++ b/internal/crawler/util.go
@@ -71,9 +71,27 @@ func prepareFilter(tags []string) links.TokenFilter {
 	}
 }
 
-func canCrawl(a, b *url.URL, d int) (yes bool) {
+func canCrawl(a, b *url.URL, d int, subdomains bool) (yes bool) {
 	if a.Host != b.Host {
-		return
+		if subdomains {
+			domainA := strings.Split(a.Host, ".")
+			domainB := strings.Split(b.Host, ".")
+			if len(domainA) >= len(domainB) {
+				// the base host must have fewer labels than the found one
+				return
+			}
+			j := len(domainB) - 1
+			for i := len(domainA) - 1; i >= 0 && j >= 0; i-- {
+				// walk both hosts label by label from the end, checking that they share the same suffix
+				if domainA[i] != domainB[j] {
+					// labels differ - not a subdomain of the base host
+					return
+				}
+				j--
+			}
+		} else {
+			return
+		}
 	}
 
 	var apath, bpath string
@@ -104,7 +122,7 @@ func relativeDepth(base, sub string) (n int, ok bool) {
 		sn = path.Clean(sub)
 	)
 
-	if len(sn) <= len(bn) {
+	if len(sn) < len(bn) {
 		return
 	}
 
diff --git a/internal/crawler/util_test.go b/internal/crawler/util_test.go
index 6c3e8c7..26b19e7 100644
--- a/internal/crawler/util_test.go
+++ b/internal/crawler/util_test.go
@@ -120,6 +120,7 @@ func TestCanCrawl(t *testing.T) {
 		b *url.URL
 		u *url.URL
 		d int
+		subdomains bool
 	}
 
 	base, _ := url.Parse("http://test/some/path")
@@ -128,31 +129,39 @@ func TestCanCrawl(t *testing.T) {
 	url1, _ := url.Parse("http://test/some/path/even")
 	url2, _ := url.Parse("http://test/some/path/even/more")
 	url3, _ := url.Parse("http://test")
+	url4, _ := url.Parse("http://abc.test/some")
+	url5, _ := url.Parse("http://abc.test/some/path")
+	url6, _ := url.Parse("http://abc.test/some/path/even")
 
 	tests := []struct {
 		name    string
 		args    args
 		wantYes bool
 	}{
-		{"url0-1", args{b: base, u: url0, d: 1}, false},
-		{"url1-0", args{b: base, u: url1, d: 0}, false},
-		{"url1-1", args{b: base, u: url1, d: 1}, true},
-		{"url2-0", args{b: base, u: url2, d: 0}, false},
-		{"url2-1", args{b: base, u: url2, d: 1}, false},
-		{"url2-2", args{b: base, u: url2, d: 2}, true},
-		{"url2-3", args{b: base, u: url2, d: 3}, true},
-		{"badh-1", args{b: base, u: badh, d: 1}, false},
-		{"url2-0-1", args{b: base, u: url0, d: -1}, false},
-		{"url2-1-1", args{b: base, u: url1, d: -1}, true},
-		{"url2-2-1", args{b: base, u: url2, d: -1}, true},
-		{"url3-3", args{b: base, u: url3, d: 0}, false},
+		{"url0-1", args{b: base, u: url0, d: 1, subdomains: false}, false},
+		{"url1-0", args{b: base, u: url1, d: 0, subdomains: false}, false},
+		{"url1-1", args{b: base, u: url1, d: 1, subdomains: false}, true},
+		{"url2-0", args{b: base, u: url2, d: 0, subdomains: false}, false},
+		{"url2-1", args{b: base, u: url2, d: 1, subdomains: false}, false},
+		{"url2-2", args{b: base, u: url2, d: 2, subdomains: false}, true},
+		{"url2-3", args{b: base, u: url2, d: 3, subdomains: false}, true},
+		{"badh-1", args{b: base, u: badh, d: 1, subdomains: false}, false},
+		{"url2-0-1", args{b: base, u: url0, d: -1, subdomains: false}, false},
+		{"url2-1-1", args{b: base, u: url1, d: -1, subdomains: false}, true},
+		{"url2-2-1", args{b: base, u: url2, d: -1, subdomains: false}, true},
+		{"url3-3", args{b: base, u: url3, d: 0, subdomains: false}, false},
+		{"url4-1", args{b: base, u: url4, d: 1000, subdomains: true}, false},
+		{"url5-1", args{b: base, u: url5, d: -1, subdomains: true}, true},
+		{"url5-2", args{b: base, u: url5, d: -1, subdomains: false}, false},
+		{"url6-1", args{b: base, u: url6, d: 1, subdomains: true}, true},
+		{"url6-2", args{b: base, u: url6, d: 0, subdomains: true}, false},
 	}
 
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
 			t.Parallel()
 
-			if gotYes := canCrawl(tc.args.b, tc.args.u, tc.args.d); gotYes != tc.wantYes {
+			if gotYes := canCrawl(tc.args.b, tc.args.u, tc.args.d, tc.args.subdomains); gotYes != tc.wantYes {
 				t.Errorf("canCrawl() = %v, want %v", gotYes, tc.wantYes)
 			}
 		})
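
Note on the subdomain check: the new branch in `canCrawl` compares host labels right-to-left instead of doing a plain string-suffix test, so a lookalike host such as `a.evil-test.site` is not treated as a subdomain of `test.site`. A minimal, self-contained sketch of the same comparison (the helper name `isSubdomain` is illustrative only and does not appear in this patch):

```go
package main

import (
	"fmt"
	"strings"
)

// isSubdomain reports whether host b is a strict subdomain of host a,
// mirroring the label walk added to canCrawl: a must have fewer
// dot-separated labels than b, and every label of a must match the
// corresponding trailing label of b.
func isSubdomain(a, b string) bool {
	la := strings.Split(a, ".")
	lb := strings.Split(b, ".")

	// the base host must be strictly shorter than the candidate
	if len(la) >= len(lb) {
		return false
	}

	// compare labels right-to-left
	for i := 1; i <= len(la); i++ {
		if la[len(la)-i] != lb[len(lb)-i] {
			return false
		}
	}

	return true
}

func main() {
	fmt.Println(isSubdomain("test.site", "www.test.site"))    // true
	fmt.Println(isSubdomain("test.site", "a.evil-test.site")) // false: "evil-test" != "test"
	fmt.Println(isSubdomain("www.test.site", "test.site"))    // false: base has more labels
}
```

With the patch applied, `crawley -subdomains http://some-test.site` should enqueue links on `http://www.some-test.site` as well, still subject to the usual depth and robots rules.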