From d5f8b28d98fe29aaf6b24fccb0331f21b1b9ceca Mon Sep 17 00:00:00 2001
From: SimonC-Audigent
Date: Tue, 17 Sep 2024 10:06:20 +0100
Subject: [PATCH 1/2] add explicit disallow feature

---
 README.md      |  6 ++++--
 Robots.js      | 52 ++++++++++++++++++++++++++++++++++++++++++++------
 index.d.ts     |  2 +-
 test/Robots.js | 38 ++++++++++++++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index fc723b8..47c3f32 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ The parser currently supports:
 - User-agent:
 - Allow:
-- Disallow:
+- Disallow (with explicit mode support):
 - Sitemap:
 - Crawl-delay:
 - Host:
@@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
 robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
 robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
 robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
+robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false
 robots.getCrawlDelay('Sams-Bot/1.0'); // 1
 robots.getSitemaps(); // ['http://example.com/sitemap.xml']
 robots.getPreferredHost(); // example.com
@@ -54,11 +55,12 @@ Returns true if crawling the specified URL is allowed for the specified user-age
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
-### isDisallowed(url, [ua])
+### isDisallowed(url, [ua], [explicit])
 
 **boolean or undefined**
 
 Returns true if crawling the specified URL is not allowed for the specified user-agent.
+In explicit mode, user-agent wildcards are discarded.
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
diff --git a/Robots.js b/Robots.js
index f0a8e9e..db26a93 100644
--- a/Robots.js
+++ b/Robots.js
@@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
 	this._preferredHost = url;
 };
 
-Robots.prototype._getRule = function (url, ua) {
+Robots.prototype._getRule = function (url, ua, explicit) {
 	var parsedUrl = parseUrl(url) || {};
 	var userAgent = formatUserAgent(ua || '*');
 
@@ -374,7 +374,12 @@
 		return;
 	}
 
-	var rules = this._rules[userAgent] || this._rules['*'] || [];
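+	// In explicit mode, only a group that explicitly names this user agent
+	// applies; the wildcard ('*') group is ignored.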
+	var rules = this._rules[userAgent];
+	if (!explicit) {
+		rules = rules || this._rules['*'];
+	}
+	rules = rules || [];
+
 	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
 	var rule = findRule(path, rules);
 
@@ -422,16 +427,51 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
 };
 
 /**
- * Returns the opposite of isAllowed()
- *
+ * In standard mode, this returns the opposite of isAllowed().
+ * In explicit mode, it returns:
+ * - true if the user agent is explicitly disallowed (wildcard groups are not included),
+ * - and false otherwise.
+ * It throws an error if no user agent is specified in explicit mode.
  * @param {string} url
  * @param {string} ua
+ * @param {boolean} [explicit]
  * @return {boolean}
  */
-Robots.prototype.isDisallowed = function (url, ua) {
-	return !this.isAllowed(url, ua);
+Robots.prototype.isDisallowed = function (url, ua, explicit) {
+	if ((explicit === true) && (ua === undefined)) {
+		throw new Error('User Agent must be specified in explicit mode');
+	}
+
+	var rule = this._getRule(url, ua, explicit);
+	if (typeof rule === 'undefined') {
+		return true;
+	}
+	return !(!rule || rule.allow);
 };
 
+Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
+	var parsedUrl = parseUrl(url) || {};
+	var userAgent = formatUserAgent(ua);
+
+	// The base URL must match otherwise this robots.txt is not valid for it.
+	if (
+		parsedUrl.protocol !== this._url.protocol ||
+		parsedUrl.hostname !== this._url.hostname ||
+		parsedUrl.port !== this._url.port
+	) {
+		return;
+	}
+
+	var rules = this._rules[userAgent] || [];
+	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
+	var rule = findRule(path, rules);
+
+	if (typeof rule === 'undefined') {
+		return;
+	}
+
+	return !(!rule || rule.allow);
+};
+
 /**
  * Gets the crawl delay if there is one.
  *
diff --git a/index.d.ts b/index.d.ts
index 5446898..852ddec 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -2,7 +2,7 @@ declare module 'robots-parser';
 
 interface Robot {
 	isAllowed(url: string, ua?: string): boolean | undefined;
-	isDisallowed(url: string, ua?: string): boolean | undefined;
+	isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined;
 	getMatchingLineNumber(url: string, ua?: string): number;
 	getCrawlDelay(ua?: string): number | undefined;
 	getSitemaps(): string[];
diff --git a/test/Robots.js b/test/Robots.js
index 666d9b8..6f979ba 100644
--- a/test/Robots.js
+++ b/test/Robots.js
@@ -861,4 +861,42 @@ describe('Robots', function () {
 
 		testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
 	});
+
+	it('should not be disallowed when a wildcard is used in explicit mode', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isDisallowed(url, userAgent, true)).to.equal(false);
+	});
+
+	it('should be disallowed when the user agent matches a rule in explicit mode', function () {
+		var contents = [
+			'User-agent: SomeBot',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isDisallowed(url, userAgent, true)).to.equal(true);
+	});
+
+	it('should throw an error when the user agent is not set in explicit mode', function () {
+		var contents = [
+			'User-agent: SomeBot',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw('User Agent must be specified in explicit mode');
+	});
 });

From 6f402965e80d56dfe85d82066e9d92f7a41f05ea Mon Sep 17 00:00:00 2001
From: SimonC-Audigent
Date: Mon, 23 Sep 2024 12:26:57 +0100
Subject: [PATCH 2/2] refactor code to use isExplicitlyDisallowed

---
 README.md      | 12 ++++++++---
 Robots.js      | 54 +++++++++++++++++---------------------------------
 index.d.ts     |  3 ++-
 test/Robots.js | 16 ++------------
 4 files changed, 31 insertions(+), 54 deletions(-)

diff --git a/README.md b/README.md
index 47c3f32..6665594 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
 robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
 robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
 robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
-robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false
+robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false
 robots.getCrawlDelay('Sams-Bot/1.0'); // 1
 robots.getSitemaps(); // ['http://example.com/sitemap.xml']
 robots.getPreferredHost(); // example.com
@@ -55,15 +55,21 @@ Returns true if crawling the specified URL is allowed for the specified user-age
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
-### isDisallowed(url, [ua], [explicit])
+### isDisallowed(url, [ua])
 
 **boolean or undefined**
 
 Returns true if crawling the specified URL is not allowed for the specified user-agent.
-In explicit mode, user-agent wildcards are discarded.
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
+### isExplicitlyDisallowed(url, ua)
+
+**boolean or undefined**
+
+Returns true if crawling the specified URL is explicitly disallowed for the specified user-agent (user-agent wildcards are discarded).
+
+This will return `undefined` if the URL isn't valid for this robots.txt.
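+
+For example, given the robots.txt from the usage example above, which only contains a `User-agent: *` group:
+
+```js
+robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
+robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false (only the wildcard group matches)
+```
+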
 ### getMatchingLineNumber(url, [ua])
 
 **number or undefined**
 
diff --git a/Robots.js b/Robots.js
index db26a93..9fb7cf5 100644
--- a/Robots.js
+++ b/Robots.js
@@ -397,7 +397,7 @@ Robots.prototype._getRule = function (url, ua, explicit) {
  * @return {boolean?}
  */
 Robots.prototype.isAllowed = function (url, ua) {
-	var rule = this._getRule(url, ua);
+	var rule = this._getRule(url, ua, false);
 
 	if (typeof rule === 'undefined') {
 		return;
@@ -421,54 +421,36 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
-	var rule = this._getRule(url, ua);
+	var rule = this._getRule(url, ua, false);
 
 	return rule ? rule.lineNumber : -1;
 };
 
 /**
- * In standard mode, this returns the opposite of isAllowed().
- * In explicit mode, it returns:
- * - true if the user agent is explicitly disallowed (wildcard groups are not included),
- * - and false otherwise.
- * It throws an error if no user agent is specified in explicit mode.
+ * Returns the opposite of isAllowed()
+ *
  * @param {string} url
- * @param {string} ua
- * @param {boolean} [explicit]
+ * @param {string?} ua
  * @return {boolean}
  */
-Robots.prototype.isDisallowed = function (url, ua, explicit) {
-	if ((explicit === true) && (ua === undefined)) {
-		throw new Error('User Agent must be specified in explicit mode');
-	}
-
-	var rule = this._getRule(url, ua, explicit);
-	if (typeof rule === 'undefined') {
-		return true;
-	}
-	return !(!rule || rule.allow);
+Robots.prototype.isDisallowed = function (url, ua) {
+	return !this.isAllowed(url, ua);
 };
 
+/**
+ * Returns true if crawling the specified URL is explicitly disallowed
+ * for the specified user agent (user-agent wildcards are discarded).
+ *
+ * This will return undefined if the URL isn't valid for this robots.txt.
+ *
+ * @param {string} url
+ * @param {string} ua
+ * @return {boolean?}
+ */
 Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
-	var parsedUrl = parseUrl(url) || {};
-	var userAgent = formatUserAgent(ua);
-
-	// The base URL must match otherwise this robots.txt is not valid for it.
-	if (
-		parsedUrl.protocol !== this._url.protocol ||
-		parsedUrl.hostname !== this._url.hostname ||
-		parsedUrl.port !== this._url.port
-	) {
-		return;
-	}
-
-	var rules = this._rules[userAgent] || [];
-	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
-	var rule = findRule(path, rules);
-
+	var rule = this._getRule(url, ua, true);
 	if (typeof rule === 'undefined') {
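+		// _getRule() returns undefined only when the URL isn't valid for this robots.txt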
 		return;
 	}
-
 	return !(!rule || rule.allow);
 };
 
 /**
  * Gets the crawl delay if there is one.
  *
diff --git a/index.d.ts b/index.d.ts
index 852ddec..0cf4313 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -2,7 +2,8 @@ declare module 'robots-parser';
 
 interface Robot {
 	isAllowed(url: string, ua?: string): boolean | undefined;
-	isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined;
+	isDisallowed(url: string, ua?: string): boolean | undefined;
+	isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
 	getMatchingLineNumber(url: string, ua?: string): number;
 	getCrawlDelay(ua?: string): number | undefined;
 	getSitemaps(): string[];
diff --git a/test/Robots.js b/test/Robots.js
index 6f979ba..f1575ae 100644
--- a/test/Robots.js
+++ b/test/Robots.js
@@ -872,7 +872,7 @@ describe('Robots', function () {
 		var userAgent = 'SomeBot';
 		var robots = robotsParser(url, contents);
 
-		expect(robots.isDisallowed(url, userAgent, true)).to.equal(false);
+		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
 	});
 
 	it('should be disallowed when the user agent matches a rule in explicit mode', function () {
@@ -885,18 +885,6 @@ describe('Robots', function () {
 		var contents = [
 			'User-agent: SomeBot',
 			'Disallow: /',
 		].join('\n');
 
 		var url = 'https://www.example.com/hello';
 		var userAgent = 'SomeBot';
 		var robots = robotsParser(url, contents);
 
-		expect(robots.isDisallowed(url, userAgent, true)).to.equal(true);
+		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
 	});
-
-	it('should throw an error when the user agent is not set in explicit mode', function () {
-		var contents = [
-			'User-agent: SomeBot',
-			'Disallow: /',
-		].join('\n');
-
-		var url = 'https://www.example.com/hello';
-		var robots = robotsParser(url, contents);
-
-		expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw('User Agent must be specified in explicit mode');
-	});
 });