Commit 6f40296

refactor code to use isExplictlyDisallowed

SimonC-Audigent committed Sep 23, 2024
1 parent d5f8b28 commit 6f40296

Showing 4 changed files with 31 additions and 54 deletions.
12 changes: 9 additions & 3 deletions README.md

```diff
@@ -41,7 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
 robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
 robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
 robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
-robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false
+robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false
 robots.getCrawlDelay('Sams-Bot/1.0'); // 1
 robots.getSitemaps(); // ['http://example.com/sitemap.xml']
 robots.getPreferredHost(); // example.com
@@ -55,15 +55,21 @@
 Returns true if crawling the specified URL is allowed for the specified user-agent.
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
-### isDisallowed(url, [ua], [explicit])
+### isDisallowed(url, [ua])
 
 **boolean or undefined**
 
 Returns true if crawling the specified URL is not allowed for the specified user-agent.
-In explicit mode, user agents wildcards are discarded.
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
+### isExplicitlyDisallowed(url, ua)
+
+**boolean or undefined**
+
+Returns true if the URL is explicitly disallowed for the specified user agent (user-agent wildcards are discarded).
+
+This will return `undefined` if the URL is not valid for this robots.txt file.
 ### getMatchingLineNumber(url, [ua])
 
 **number or undefined**
```
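To make the behavioral difference concrete, here is a minimal usage sketch based on the README documentation above; the robots.txt rules and bot name are illustrative, not taken from this commit.

```js
// Minimal sketch contrasting isDisallowed with isExplicitlyDisallowed.
// Assumes the API documented above; rules and bot name are illustrative.
var robotsParser = require('robots-parser');

var robots = robotsParser('http://www.example.com/robots.txt', [
    'User-agent: *',
    'Disallow: /private/'
].join('\n'));

// The wildcard rule applies to every agent, so crawling is disallowed.
robots.isDisallowed('http://www.example.com/private/page.html', 'Sams-Bot/1.0'); // true

// No rule names Sams-Bot explicitly, and explicit mode discards
// wildcard rules, so the URL is not *explicitly* disallowed for it.
robots.isExplicitlyDisallowed('http://www.example.com/private/page.html', 'Sams-Bot/1.0'); // false
```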
54 changes: 18 additions & 36 deletions Robots.js

```diff
@@ -397,7 +397,7 @@ Robots.prototype._getRule = function (url, ua, explicit) {
  * @return {boolean?}
  */
 Robots.prototype.isAllowed = function (url, ua) {
-    var rule = this._getRule(url, ua);
+    var rule = this._getRule(url, ua, false);
 
     if (typeof rule === 'undefined') {
         return;
@@ -421,54 +421,36 @@ Robots.prototype.isAllowed = function (url, ua) {
  * @return {number?}
  */
 Robots.prototype.getMatchingLineNumber = function (url, ua) {
-    var rule = this._getRule(url, ua);
+    var rule = this._getRule(url, ua, false);
 
     return rule ? rule.lineNumber : -1;
 };
 
 /**
- * In standard mode, it returns the opposite of is allowed().
- * In explicit mode, it will return:
- * - true if the the agent is explicitly disallowed (wildcard non included),
- * - throws an error if the user agent is not specified,
- * - and false otherwise.
+ * Returns the opposite of isAllowed()
  *
  * @param {string} url
- * @param {string} ua
+ * @param {string?} ua
  * @return {boolean}
  */
-Robots.prototype.isDisallowed = function (url, ua, explicit) {
-    if ((explicit === true) && (ua === undefined)) {
-        throw new Error("User Agent must be specified in explicit mode")
-    }
-
-    var rule = this._getRule(url, ua, explicit);
-    if (typeof rule === 'undefined') {
-        return true;
-    }
-    return !(!rule || rule.allow);
+Robots.prototype.isDisallowed = function (url, ua) {
+    return !this.isAllowed(url, ua);
 };
 
 /**
  * Returns true if explicitly disallowed
  * for the specified user agent (User Agent wildcards are discarded).
  *
  * This will return undefined if the URL is not valid for this robots.txt file.
  * @param {string} url
  * @param {string} ua
  * @return {boolean?}
  */
 Robots.prototype.isExplicitlyDisallowed = function(url, ua) {
-    var parsedUrl = parseUrl(url) || {};
-    var userAgent = formatUserAgent(ua);
-
-    // The base URL must match otherwise this robots.txt is not valid for it.
-    if (
-        parsedUrl.protocol !== this._url.protocol ||
-        parsedUrl.hostname !== this._url.hostname ||
-        parsedUrl.port !== this._url.port
-    ) {
-        return;
-    }
-
-    var rules = this._rules[userAgent] || [];
-    var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
-    var rule = findRule(path, rules);
-
+    var rule = this._getRule(url, ua, true);
     if (typeof rule === 'undefined') {
-        return;
+        return true;
     }
 
     return !(!rule || rule.allow);
 }
```
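All three public methods now delegate to a single `_getRule(url, ua, explicit)` helper. Its body is not part of this diff; the following is a hedged sketch of what such a helper plausibly looks like, pieced together from the removed `isExplicitlyDisallowed` body and the `_getRule` signature visible in the hunk header, so treat every detail as an assumption rather than the committed code.

```js
// Hypothetical reconstruction of the shared _getRule(url, ua, explicit)
// helper — its body is not shown in this diff. Reconstructed from the
// removed isExplicitlyDisallowed body above; details are assumptions.
Robots.prototype._getRule = function (url, ua, explicit) {
    var parsedUrl = parseUrl(url) || {};
    // Assumption: a missing ua falls back to the wildcard agent.
    var userAgent = formatUserAgent(ua || '*');

    // The base URL must match, otherwise this robots.txt is not valid for it.
    if (
        parsedUrl.protocol !== this._url.protocol ||
        parsedUrl.hostname !== this._url.hostname ||
        parsedUrl.port !== this._url.port
    ) {
        return;
    }

    var rules = this._rules[userAgent];
    if (!explicit) {
        // Standard mode: fall back to wildcard (User-agent: *) rules when
        // no rules name this agent. Explicit mode skips the fallback,
        // which is how "user-agent wildcards are discarded" is implemented.
        rules = rules || this._rules['*'];
    }
    rules = rules || [];

    var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
    return findRule(path, rules);
};
```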
3 changes: 2 additions & 1 deletion index.d.ts

```diff
@@ -2,7 +2,8 @@ declare module 'robots-parser';
 
 interface Robot {
     isAllowed(url: string, ua?: string): boolean | undefined;
-    isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined;
+    isDisallowed(url: string, ua?: string): boolean | undefined;
+    isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
     getMatchingLineNumber(url: string, ua?: string): number;
     getCrawlDelay(ua?: string): number | undefined;
     getSitemaps(): string[];
```
16 changes: 2 additions & 14 deletions test/Robots.js

```diff
@@ -872,7 +872,7 @@ describe('Robots', function () {
         var userAgent = 'SomeBot';
         var robots = robotsParser(url, contents);
 
-        expect(robots.isDisallowed(url, userAgent, true)).to.equal(false)
+        expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false)
     })
 
     it('should be disallowed when user agent equal robots rule in explicit mode', function () {
@@ -885,18 +885,6 @@
         var userAgent = 'SomeBot';
         var robots = robotsParser(url, contents);
 
-        expect(robots.isDisallowed(url, userAgent, true)).to.equal(true)
+        expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true)
     })
-
-    it('should throw an error when user agent is not set in explicit mode', function () {
-        var contents = [
-            'User-agent: SomeBot',
-            'Disallow: /',
-        ].join('\n')
-
-        var url = 'https://www.example.com/hello'
-        var robots = robotsParser(url, contents);
-
-        expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw("User Agent must be specified in explicit mode")
-    })
 });
```
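One consequence of the simplified `isDisallowed` is worth pinning down: it is now literally `!isAllowed(url, ua)`, and `isAllowed` returns `undefined` for URLs this robots.txt is not valid for, so `!undefined` makes `isDisallowed` report `true` for them (matching the old behavior). A regression-test sketch in the style of the suite above — not part of this commit; the URLs and bot name are illustrative:

```js
// Sketch only — not part of this commit. Follows the mocha/chai style of
// the suite above; URLs and bot name are illustrative.
it('should report disallowed for URLs outside this robots.txt origin', function () {
    var contents = [
        'User-agent: *',
        'Disallow: /private/',
    ].join('\n')

    var robots = robotsParser('http://www.example.com/robots.txt', contents);

    // isAllowed returns undefined for a different host...
    expect(robots.isAllowed('http://other.example.org/page.html', 'SomeBot')).to.equal(undefined)
    // ...so !undefined makes isDisallowed return true.
    expect(robots.isDisallowed('http://other.example.org/page.html', 'SomeBot')).to.equal(true)
})
```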
