From 78b2e5038e938a8fd478b73974173dfad5d13e64 Mon Sep 17 00:00:00 2001
From: marybonilla2231
Date: Sun, 9 Jun 2024 18:59:18 +0300
Subject: [PATCH] Support subdomains (#91)

* Support subdomains. Closes #89
---
 README.md                     |  3 +++
 cmd/crawley/main.go           |  3 +++
 internal/crawler/config.go    |  5 +++++
 internal/crawler/crawler.go   |  2 +-
 internal/crawler/options.go   |  7 +++++++
 internal/crawler/util.go      | 24 +++++++++++++++++++++---
 internal/crawler/util_test.go | 35 ++++++++++++++++++++++-------------
 7 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 94d126b..5a57900 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ Crawls web pages and prints any link it can find.
 - user-defined headers, same as curl: `-header "ONE: 1" -header "TWO: 2" -header @headers-file`
 - tag filter - allow to specify tags to crawl for (single: `-tag a -tag form`, multiple: `-tag a,form`, or mixed)
 - url ignore - allow to ignore urls with matched substrings from crawling (i.e.: `-ignore logout`)
+- subdomains support - allow depth crawling of subdomains as well (e.g. `crawley http://some-test.site` will be able to crawl `http://www.some-test.site`)
 
 # examples
 
@@ -94,6 +95,8 @@ possible flags with default values:
     suppress info and error messages in stderr
 -skip-ssl
     skip ssl verification
+-subdomains
+    support subdomains (e.g. if www.domain.com found, recurse over it)
 -tag value
     tags filter, single or comma-separated tag names
 -timeout duration
diff --git a/cmd/crawley/main.go b/cmd/crawley/main.go
index 75d4441..26ebf9a 100644
--- a/cmd/crawley/main.go
+++ b/cmd/crawley/main.go
@@ -41,6 +41,7 @@ var (
 	fBrute, fNoHeads        bool
 	fSkipSSL, fScanJS       bool
 	fScanCSS, fScanALL      bool
+	fSubdomains             bool
 	fDirsPolicy, fProxyAuth string
 	fRobotsPolicy, fUA      string
 	fDelay                  time.Duration
@@ -163,6 +164,7 @@ func parseFlags() (rv []crawler.Option, err error) {
 		crawler.WithIgnored(ignored.Values),
 		crawler.WithProxyAuth(fProxyAuth),
 		crawler.WithTimeout(fTimeout),
+		crawler.WithSubdomains(fSubdomains),
 	}
 
 	return rv, nil
@@ -184,6 +186,7 @@ func setupFlags() {
 	flag.BoolVar(&fScanALL, "all", false, "scan all known sources (js/css/...)")
 	flag.BoolVar(&fBrute, "brute", false, "scan html comments")
+	flag.BoolVar(&fSubdomains, "subdomains", false, "support subdomains (e.g. if www.domain.com found, recurse over it)")
 	flag.BoolVar(&fScanCSS, "css", false, "scan css for urls")
 	flag.BoolVar(&fNoHeads, "headless", false, "disable pre-flight HEAD requests")
 	flag.BoolVar(&fScanJS, "js", false, "scan js code for endpoints")
diff --git a/internal/crawler/config.go b/internal/crawler/config.go
index 573c53f..3c3793c 100644
--- a/internal/crawler/config.go
+++ b/internal/crawler/config.go
@@ -29,6 +29,7 @@ type config struct {
 	NoHEAD     bool
 	ScanJS     bool
 	ScanCSS    bool
+	Subdomains bool
 }
 
 func (c *config) validate() {
@@ -59,5 +60,9 @@ func (c *config) String() (rv string) {
 		sb.WriteString(" +css")
 	}
 
+	if c.Subdomains {
+		sb.WriteString(" +subdomains")
+	}
+
 	return sb.String()
 }
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index cbcb6b7..89e0a0a 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -175,7 +175,7 @@ func (c *Crawler) tryEnqueue(base *url.URL, r *crawlResult) (yes bool) {
 		return
 	}
 
-	if !canCrawl(base, u, c.cfg.Depth) ||
+	if !canCrawl(base, u, c.cfg.Depth, c.cfg.Subdomains) ||
 		c.robots.Forbidden(u.Path) ||
 		(c.cfg.Dirs == DirsOnly && isResorce(u.Path)) {
 		return
diff --git a/internal/crawler/options.go b/internal/crawler/options.go
index d337840..7106812 100644
--- a/internal/crawler/options.go
+++ b/internal/crawler/options.go
@@ -127,3 +127,10 @@ func WithScanCSS(v bool) Option {
 		c.ScanCSS = v
 	}
 }
+
+// WithSubdomains enables subdomain crawling.
+func WithSubdomains(v bool) Option {
+	return func(c *config) {
+		c.Subdomains = v
+	}
+}
diff --git a/internal/crawler/util.go b/internal/crawler/util.go
index 59dbb75..9926a1e 100644
--- a/internal/crawler/util.go
+++ b/internal/crawler/util.go
@@ -71,9 +71,27 @@ func prepareFilter(tags []string) links.TokenFilter {
 	}
 }
 
-func canCrawl(a, b *url.URL, d int) (yes bool) {
+func canCrawl(a, b *url.URL, d int, subdomains bool) (yes bool) {
 	if a.Host != b.Host {
-		return
+		if subdomains {
+			domainA := strings.Split(a.Host, ".")
+			domainB := strings.Split(b.Host, ".")
+			if len(domainA) >= len(domainB) {
+				// the base host must have fewer labels than the found one
+				return
+			}
+			j := len(domainB) - 1
+			for i := len(domainA) - 1; i >= 0 && j >= 0; i-- {
+				// walk both hosts label by label from the end, checking that they share the same suffix
+				if domainA[i] != domainB[j] {
+					// labels differ - not a subdomain of the base host
+					return
+				}
+				j--
+			}
+		} else {
+			return
+		}
 	}
 
 	var apath, bpath string
@@ -104,7 +122,7 @@ func relativeDepth(base, sub string) (n int, ok bool) {
 		sn = path.Clean(sub)
 	)
 
-	if len(sn) <= len(bn) {
+	if len(sn) < len(bn) {
 		return
 	}
 
diff --git a/internal/crawler/util_test.go b/internal/crawler/util_test.go
index 6c3e8c7..26b19e7 100644
--- a/internal/crawler/util_test.go
+++ b/internal/crawler/util_test.go
@@ -120,6 +120,7 @@ func TestCanCrawl(t *testing.T) {
 		b *url.URL
 		u *url.URL
 		d int
+		subdomains bool
 	}
 
 	base, _ := url.Parse("http://test/some/path")
@@ -128,31 +129,39 @@ func TestCanCrawl(t *testing.T) {
 	url1, _ := url.Parse("http://test/some/path/even")
 	url2, _ := url.Parse("http://test/some/path/even/more")
 	url3, _ := url.Parse("http://test")
+	url4, _ := url.Parse("http://abc.test/some")
+	url5, _ := url.Parse("http://abc.test/some/path")
+	url6, _ := url.Parse("http://abc.test/some/path/even")
 
 	tests := []struct {
 		name    string
 		args    args
 		wantYes bool
 	}{
-		{"url0-1", args{b: base, u: url0, d: 1}, false},
-		{"url1-0", args{b: base, u: url1, d: 0}, false},
-		{"url1-1", args{b: base, u: url1, d: 1}, true},
-		{"url2-0", args{b: base, u: url2, d: 0}, false},
-		{"url2-1", args{b: base, u: url2, d: 1}, false},
-		{"url2-2", args{b: base, u: url2, d: 2}, true},
-		{"url2-3", args{b: base, u: url2, d: 3}, true},
-		{"badh-1", args{b: base, u: badh, d: 1}, false},
-		{"url2-0-1", args{b: base, u: url0, d: -1}, false},
-		{"url2-1-1", args{b: base, u: url1, d: -1}, true},
-		{"url2-2-1", args{b: base, u: url2, d: -1}, true},
-		{"url3-3", args{b: base, u: url3, d: 0}, false},
+		{"url0-1", args{b: base, u: url0, d: 1, subdomains: false}, false},
+		{"url1-0", args{b: base, u: url1, d: 0, subdomains: false}, false},
+		{"url1-1", args{b: base, u: url1, d: 1, subdomains: false}, true},
+		{"url2-0", args{b: base, u: url2, d: 0, subdomains: false}, false},
+		{"url2-1", args{b: base, u: url2, d: 1, subdomains: false}, false},
+		{"url2-2", args{b: base, u: url2, d: 2, subdomains: false}, true},
+		{"url2-3", args{b: base, u: url2, d: 3, subdomains: false}, true},
+		{"badh-1", args{b: base, u: badh, d: 1, subdomains: false}, false},
+		{"url2-0-1", args{b: base, u: url0, d: -1, subdomains: false}, false},
+		{"url2-1-1", args{b: base, u: url1, d: -1, subdomains: false}, true},
+		{"url2-2-1", args{b: base, u: url2, d: -1, subdomains: false}, true},
+		{"url3-3", args{b: base, u: url3, d: 0, subdomains: false}, false},
+		{"url4-1", args{b: base, u: url4, d: 1000, subdomains: true}, false},
+		{"url5-1", args{b: base, u: url5, d: -1, subdomains: true}, true},
+		{"url5-2", args{b: base, u: url5, d: -1, subdomains: false}, false},
+		{"url6-1", args{b: base, u: url6, d: 1, subdomains: true}, true},
+		{"url6-2", args{b: base, u: url6, d: 0, subdomains: true}, false},
 	}
 
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
 			t.Parallel()
 
-			if gotYes := canCrawl(tc.args.b, tc.args.u, tc.args.d); gotYes != tc.wantYes {
+			if gotYes := canCrawl(tc.args.b, tc.args.u, tc.args.d, tc.args.subdomains); gotYes != tc.wantYes {
 				t.Errorf("canCrawl() = %v, want %v", gotYes, tc.wantYes)
 			}
 		})
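
Note on the subdomain check: the new branch in `canCrawl` compares host labels right-to-left instead of doing a plain string-suffix test, so a lookalike host such as `a.evil-test.site` is not treated as a subdomain of `test.site`. A minimal, self-contained sketch of the same comparison (the helper name `isSubdomain` is illustrative only and does not appear in this patch):

```go
package main

import (
	"fmt"
	"strings"
)

// isSubdomain reports whether host b is a strict subdomain of host a,
// mirroring the label walk added to canCrawl: a must have fewer
// dot-separated labels than b, and every label of a must match the
// corresponding trailing label of b.
func isSubdomain(a, b string) bool {
	la := strings.Split(a, ".")
	lb := strings.Split(b, ".")

	// the base host must be strictly shorter than the candidate
	if len(la) >= len(lb) {
		return false
	}

	// compare labels right-to-left
	for i := 1; i <= len(la); i++ {
		if la[len(la)-i] != lb[len(lb)-i] {
			return false
		}
	}

	return true
}

func main() {
	fmt.Println(isSubdomain("test.site", "www.test.site"))    // true
	fmt.Println(isSubdomain("test.site", "a.evil-test.site")) // false: "evil-test" != "test"
	fmt.Println(isSubdomain("www.test.site", "test.site"))    // false: base has more labels
}
```

With the patch applied, `crawley -subdomains http://some-test.site` should enqueue links on `http://www.some-test.site` as well, still subject to the usual depth and robots rules.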