Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add explicit disallow feature #36

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The parser currently supports:

- User-agent:
- Allow:
- Disallow:
- Disallow (with explicit mode support):
- Sitemap:
- Crawl-delay:
- Host:
Expand Down Expand Up @@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false
robots.getCrawlDelay('Sams-Bot/1.0'); // 1
robots.getSitemaps(); // ['http://example.com/sitemap.xml']
robots.getPreferredHost(); // example.com
Expand All @@ -54,11 +55,12 @@ Returns true if crawling the specified URL is allowed for the specified user-age

This will return `undefined` if the URL isn't valid for this robots.txt.

### isDisallowed(url, [ua])
### isDisallowed(url, [ua], [explicit])

**boolean or undefined**

Returns true if crawling the specified URL is not allowed for the specified user-agent.
In explicit mode, wildcard (`*`) user-agent rules are ignored: only rules that explicitly name the given user agent are matched.

This will return `undefined` if the URL isn't valid for this robots.txt.

Expand Down
52 changes: 46 additions & 6 deletions Robots.js
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
this._preferredHost = url;
};

Robots.prototype._getRule = function (url, ua) {
Robots.prototype._getRule = function (url, ua, explicit) {
samclarke marked this conversation as resolved.
Show resolved Hide resolved
var parsedUrl = parseUrl(url) || {};
var userAgent = formatUserAgent(ua || '*');

Expand All @@ -374,7 +374,12 @@ Robots.prototype._getRule = function (url, ua) {
return;
}

var rules = this._rules[userAgent] || this._rules['*'] || [];
var rules = this._rules[userAgent];
if (!explicit) {
rules = rules || this._rules['*']
}
rules = rules || []

var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
var rule = findRule(path, rules);

Expand Down Expand Up @@ -422,16 +427,51 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
};

/**
* Returns the opposite of isAllowed()
*
 * In standard mode, it returns the opposite of isAllowed().
 * In explicit mode, it returns:
 * - true if the user agent is explicitly disallowed (wildcard rules are not considered),
 * - false otherwise,
 * and it throws an error if the user agent is not specified.
 * @param {string} url
 * @param {string} ua
 * @param {boolean} [explicit] - when true, only rules that explicitly name the user agent are matched
 * @return {boolean}
*/
Robots.prototype.isDisallowed = function (url, ua) {
return !this.isAllowed(url, ua);
Robots.prototype.isDisallowed = function (url, ua, explicit) {
	// Explicit mode matches only rule groups naming this exact user agent
	// (the '*' wildcard group is skipped), so a concrete agent is mandatory.
	if (explicit === true && ua === undefined) {
		throw new Error('User Agent must be specified in explicit mode');
	}

	var rule = this._getRule(url, ua, explicit);

	// An undefined rule means the URL is not valid for this robots.txt
	// (base URL mismatch). Preserve the historical !isAllowed() behaviour
	// of treating an invalid URL as disallowed.
	if (typeof rule === 'undefined') {
		return true;
	}

	// No matching rule, or an Allow rule, means crawling is permitted.
	return !(!rule || rule.allow);
};

/**
 * Returns true if crawling the specified URL is disallowed by a rule group
 * that explicitly names the given user agent. The '*' wildcard group is
 * deliberately not consulted.
 *
 * Returns undefined if the URL is not valid for this robots.txt file.
 *
 * @param {string} url
 * @param {string} ua
 * @return {boolean|undefined}
 */
Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
	var parsedUrl = parseUrl(url) || {};
	var userAgent = formatUserAgent(ua);

	// The base URL must match otherwise this robots.txt is not valid for it.
	if (
		parsedUrl.protocol !== this._url.protocol ||
		parsedUrl.hostname !== this._url.hostname ||
		parsedUrl.port !== this._url.port
	) {
		return;
	}

	// No fallback to this._rules['*'] here — explicit groups only.
	var rules = this._rules[userAgent] || [];
	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
	var rule = findRule(path, rules);

	if (typeof rule === 'undefined') {
		return;
	}

	return !(!rule || rule.allow);
};

/**
* Gets the crawl delay if there is one.
*
Expand Down
2 changes: 1 addition & 1 deletion index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ declare module 'robots-parser';

interface Robot {
isAllowed(url: string, ua?: string): boolean | undefined;
isDisallowed(url: string, ua?: string): boolean | undefined;
isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined;
getMatchingLineNumber(url: string, ua?: string): number;
getCrawlDelay(ua?: string): number | undefined;
getSitemaps(): string[];
Expand Down
38 changes: 38 additions & 0 deletions test/Robots.js
Original file line number Diff line number Diff line change
Expand Up @@ -861,4 +861,42 @@ describe('Robots', function () {

testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
});

// Wildcard groups must be ignored in explicit mode, so a '*' Disallow
// rule does not count as an explicit disallow for a named agent.
it('should not be disallowed when wildcard is used in explicit mode', function () {
	var contents = [
		'User-agent: *',
		'Disallow: /',
	].join('\n');

	var url = 'https://www.example.com/hello';
	var userAgent = 'SomeBot';
	var robots = robotsParser(url, contents);

	expect(robots.isDisallowed(url, userAgent, true)).to.equal(false);
});

// A rule group that names the agent exactly must still disallow it in
// explicit mode.
it('should be disallowed when user agent equal robots rule in explicit mode', function () {
	var contents = [
		'User-agent: SomeBot',
		'Disallow: /',
	].join('\n');

	var url = 'https://www.example.com/hello';
	var userAgent = 'SomeBot';
	var robots = robotsParser(url, contents);

	expect(robots.isDisallowed(url, userAgent, true)).to.equal(true);
});

// Explicit mode is meaningless without a concrete user agent, so the
// call must throw rather than silently match the wildcard group.
it('should throw an error when user agent is not set in explicit mode', function () {
	var contents = [
		'User-agent: SomeBot',
		'Disallow: /',
	].join('\n');

	var url = 'https://www.example.com/hello';
	var robots = robotsParser(url, contents);

	expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw('User Agent must be specified in explicit mode');
});
});
Loading