Skip to content

Commit

Permalink
refactor code to use isExplicitlyDisallowed
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonC-Audigent committed Sep 23, 2024
1 parent d5f8b28 commit a704b51
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 54 deletions.
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false
robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false
robots.getCrawlDelay('Sams-Bot/1.0'); // 1
robots.getSitemaps(); // ['http://example.com/sitemap.xml']
robots.getPreferredHost(); // example.com
Expand All @@ -55,15 +55,21 @@ Returns true if crawling the specified URL is allowed for the specified user-age

This will return `undefined` if the URL isn't valid for this robots.txt.

### isDisallowed(url, [ua], [explicit])
### isDisallowed(url, [ua])

**boolean or undefined**

Returns true if crawling the specified URL is not allowed for the specified user-agent.
In explicit mode, user agents wildcards are discarded.

Note: this is the negation of `isAllowed()`, so a URL that isn't valid for this robots.txt yields `true` rather than `undefined`.

### isExplicitlyDisallowed(url, ua)

**boolean or undefined**

Returns true if the URL is explicitly disallowed for the specified user agent (User Agent wildcards are discarded), false otherwise.

This will return `undefined` if the URL isn't valid for this robots.txt.

### getMatchingLineNumber(url, [ua])

**number or undefined**
Expand Down
55 changes: 19 additions & 36 deletions Robots.js
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ Robots.prototype._getRule = function (url, ua, explicit) {
* @return {boolean?}
*/
Robots.prototype.isAllowed = function (url, ua) {
var rule = this._getRule(url, ua);
var rule = this._getRule(url, ua, false);

if (typeof rule === 'undefined') {
return;
Expand All @@ -421,54 +421,37 @@ Robots.prototype.isAllowed = function (url, ua) {
* @return {number?}
*/
Robots.prototype.getMatchingLineNumber = function (url, ua) {
var rule = this._getRule(url, ua);
var rule = this._getRule(url, ua, false);

return rule ? rule.lineNumber : -1;
};

/**
* In standard mode, it returns the opposite of is allowed().
* In explicit mode, it will return:
* - true if the the agent is explicitly disallowed (wildcard non included),
* - throws an error if the user agent is not specified,
* - and false otherwise.
* Returns the opposite of isAllowed()
*
* @param {string} url
* @param {string} ua
* @param {string?} ua
* @return {boolean}
*/
Robots.prototype.isDisallowed = function (url, ua, explicit) {
if ((explicit === true) && (ua === undefined)) {
throw new Error("User Agent must be specified in explicit mode")
}

var rule = this._getRule(url, ua, explicit);
if (typeof rule === 'undefined') {
return true;
}
return !(!rule || rule.allow);
Robots.prototype.isDisallowed = function (url, ua) {
return !this.isAllowed(url, ua);
};

Robots.prototype.isExplicitlyDisallowed = function(url, ua) {
var parsedUrl = parseUrl(url) || {};
var userAgent = formatUserAgent(ua);

// The base URL must match otherwise this robots.txt is not valid for it.
if (
parsedUrl.protocol !== this._url.protocol ||
parsedUrl.hostname !== this._url.hostname ||
parsedUrl.port !== this._url.port
) {
return;
}

var rules = this._rules[userAgent] || [];
var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
var rule = findRule(path, rules);
/**
* Returns true if the URL is explicitly disallowed
* for the specified user agent (User Agent wildcards are discarded),
* false otherwise.
* This will return undefined if the URL is not valid for this robots.txt file.
* @param {string} url
* @param {string} ua
* @return {boolean?}
*/
Robots.prototype.isExplicitlyDisallowed = function(url, ua) {
var rule = this._getRule(url, ua, true);
if (typeof rule === 'undefined') {
return;
return undefined;
}

return !(!rule || rule.allow);
}

Expand Down
3 changes: 2 additions & 1 deletion index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ declare module 'robots-parser';

interface Robot {
isAllowed(url: string, ua?: string): boolean | undefined;
isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined;
isDisallowed(url: string, ua?: string): boolean | undefined;
isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
getMatchingLineNumber(url: string, ua?: string): number;
getCrawlDelay(ua?: string): number | undefined;
getSitemaps(): string[];
Expand Down
16 changes: 2 additions & 14 deletions test/Robots.js
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,7 @@ describe('Robots', function () {
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isDisallowed(url, userAgent, true)).to.equal(false)
expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false)
})

it('should be disallowed when user agent equal robots rule in explicit mode', function () {
Expand All @@ -885,18 +885,6 @@ describe('Robots', function () {
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isDisallowed(url, userAgent, true)).to.equal(true)
expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true)
})

it('should throw an error when user agent is not set in explicit mode', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n')

var url = 'https://www.example.com/hello'
var robots = robotsParser(url, contents);

expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw("User Agent must be specified in explicit mode")
})
});

0 comments on commit a704b51

Please sign in to comment.