From d73b52dfe51bebc60cf115e7752d43d1d73a1a83 Mon Sep 17 00:00:00 2001 From: bluemindset Date: Thu, 12 Dec 2024 23:09:43 +0200 Subject: [PATCH] Support ignoring URL params #90 --- README.md | 2 ++ cmd/crawley/main.go | 10 ++++++++-- internal/crawler/config.go | 29 +++++++++++++++++------------ internal/crawler/options.go | 7 +++++++ internal/crawler/util.go | 10 ++++++++++ 5 files changed, 44 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 5a57900..fcde1da 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,8 @@ possible flags with default values: show version -workers int number of workers (default - number of CPU cores) +-ignore-query + ignore query parameters in URL ``` # flags autocompletion diff --git a/cmd/crawley/main.go b/cmd/crawley/main.go index 670a90f..7d0c2fa 100644 --- a/cmd/crawley/main.go +++ b/cmd/crawley/main.go @@ -42,6 +42,7 @@ var ( fSkipSSL, fScanJS bool fScanCSS, fScanALL bool fSubdomains bool + fIgnoreQuery bool fDirsPolicy, fProxyAuth string fRobotsPolicy, fUA string fDelay time.Duration @@ -165,6 +166,7 @@ func parseFlags() (rv []crawler.Option, err error) { crawler.WithProxyAuth(fProxyAuth), crawler.WithTimeout(fTimeout), crawler.WithSubdomains(fSubdomains), + crawler.WithIgnoreQueryParams(fIgnoreQuery), } return rv, nil @@ -193,6 +195,7 @@ func setupFlags() { flag.BoolVar(&fSkipSSL, "skip-ssl", false, "skip ssl verification") flag.BoolVar(&fSilent, "silent", false, "suppress info and error messages in stderr") flag.BoolVar(&fVersion, "version", false, "show version") + flag.BoolVar(&fIgnoreQuery, "ignore-query", false, "ignore query parameters in URL comparison") flag.StringVar(&fDirsPolicy, "dirs", crawler.DefaultDirsPolicy, "policy for non-resource urls: show / hide / only") @@ -236,8 +239,11 @@ func main() { if fSilent { log.SetOutput(io.Discard) } - - if err := crawl(flag.Arg(0), opts...); err != nil { + uri := flag.Arg(0) + if fIgnoreQuery { + uri = crawler.NormalizeURL(uri) + } + if err := crawl(uri, 
// NormalizeURL returns rawURL with its query string removed.
//
// The input is parsed with url.Parse and re-serialized with URL.String, so
// every other component (scheme, host, path, fragment) is kept as the parser
// renders it. If rawURL cannot be parsed it is returned unchanged, which makes
// the function a best-effort normalizer rather than a validator.
func NormalizeURL(rawURL string) string {
	u, err := url.Parse(rawURL)
	if err != nil {
		return rawURL
	}

	u.RawQuery = ""
	// url.Parse records a bare trailing "?" (e.g. "http://host/path?") in
	// ForceQuery, and URL.String re-emits the "?" whenever that flag is set.
	// Clear it too, otherwise "path?" and "path" normalize to different strings.
	u.ForceQuery = false

	return u.String()
}