Add explicit disallow feature #36

Open
wants to merge 2 commits into base: master
10 changes: 9 additions & 1 deletion README.md
@@ -6,7 +6,7 @@ The parser currently supports:

- User-agent:
- Allow:
- Disallow:
- Disallow (with explicit mode support):
- Sitemap:
- Crawl-delay:
- Host:
@@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
robots.isExplicitlyDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // false
robots.getCrawlDelay('Sams-Bot/1.0'); // 1
robots.getSitemaps(); // ['http://example.com/sitemap.xml']
robots.getPreferredHost(); // example.com
@@ -62,6 +63,13 @@ Returns true if crawling the specified URL is not allowed for the specified user

This will return `undefined` if the URL isn't valid for this robots.txt.

### isExplicitlyDisallowed(url, ua)

**boolean or undefined**

Returns true if the URL is explicitly disallowed for the specified user agent (User Agent wildcards are discarded).

This will return `undefined` if the URL isn't valid for this robots.txt.
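
A minimal usage sketch of how this differs from `isDisallowed`, assuming a hypothetical robots.txt that only contains a wildcard group (not the file from the example above):

```js
var robotsParser = require('robots-parser');

// Hypothetical robots.txt: only the wildcard group disallows /secret/.
var robots = robotsParser('http://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /secret/'
].join('\n'));

// The wildcard group applies to every bot, so crawling is disallowed...
robots.isDisallowed('http://www.example.com/secret/page.html', 'Sams-Bot/1.0'); // true

// ...but no group explicitly names Sams-Bot, so this returns false.
robots.isExplicitlyDisallowed('http://www.example.com/secret/page.html', 'Sams-Bot/1.0'); // false
```
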
### getMatchingLineNumber(url, [ua])

**number or undefined**
32 changes: 27 additions & 5 deletions Robots.js
@@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
this._preferredHost = url;
};

Robots.prototype._getRule = function (url, ua) {
Robots.prototype._getRule = function (url, ua, explicit) {
var parsedUrl = parseUrl(url) || {};
var userAgent = formatUserAgent(ua || '*');

@@ -374,7 +374,12 @@ Robots.prototype._getRule = function (url, ua) {
return;
}

var rules = this._rules[userAgent] || this._rules['*'] || [];
var rules = this._rules[userAgent];
// In explicit mode, do not fall back to the wildcard (*) group:
// only rules from a group that names this user agent are considered.
if (!explicit) {
rules = rules || this._rules['*'];
}
rules = rules || [];

var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
var rule = findRule(path, rules);

@@ -392,7 +397,7 @@
* @return {boolean?}
*/
Robots.prototype.isAllowed = function (url, ua) {
var rule = this._getRule(url, ua);
var rule = this._getRule(url, ua, false);

if (typeof rule === 'undefined') {
return;
@@ -416,7 +421,7 @@
* @return {number?}
*/
Robots.prototype.getMatchingLineNumber = function (url, ua) {
var rule = this._getRule(url, ua);
var rule = this._getRule(url, ua, false);

return rule ? rule.lineNumber : -1;
};
@@ -425,13 +430,30 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
* Returns the opposite of isAllowed()
*
* @param {string} url
* @param {string} ua
* @param {string?} ua
* @return {boolean}
*/
Robots.prototype.isDisallowed = function (url, ua) {
return !this.isAllowed(url, ua);
};

/**
* Returns true if the URL is explicitly disallowed
* for the specified user agent (User Agent wildcards are discarded).
*
* This will return undefined if the URL is not valid for this robots.txt file.
*
* @param {string} url
* @param {string} ua
* @return {boolean?}
*/
Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
var rule = this._getRule(url, ua, true);

if (typeof rule === 'undefined') {
return;
}

return !(!rule || rule.allow);
};

/**
* Gets the crawl delay if there is one.
*
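For reference, a standalone sketch of the lookup behaviour the new `explicit` flag introduces (hypothetical names, not the library's internals): the wildcard group is only used as a fallback when explicit mode is off.

```js
// Hypothetical rule map keyed by lower-cased user agent, mirroring this._rules.
function selectRules(rulesByAgent, userAgent, explicit) {
	var rules = rulesByAgent[userAgent];

	// Explicit mode never falls back to the wildcard (*) group, so a
	// wildcard-only robots.txt yields no matching rules for a named bot.
	if (!explicit) {
		rules = rules || rulesByAgent['*'];
	}

	return rules || [];
}

var ruleMap = { '*': [{ pattern: '/', allow: false }] };

selectRules(ruleMap, 'somebot', false); // [{ pattern: '/', allow: false }]
selectRules(ruleMap, 'somebot', true);  // []
```
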
1 change: 1 addition & 0 deletions index.d.ts
@@ -3,6 +3,7 @@ declare module 'robots-parser';
interface Robot {
isAllowed(url: string, ua?: string): boolean | undefined;
isDisallowed(url: string, ua?: string): boolean | undefined;
isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
getMatchingLineNumber(url: string, ua?: string): number;
getCrawlDelay(ua?: string): number | undefined;
getSitemaps(): string[];
26 changes: 26 additions & 0 deletions test/Robots.js
@@ -861,4 +861,30 @@ describe('Robots', function () {

testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
});

it('should not be disallowed when wildcard is used in explicit mode', function () {
var contents = [
'User-agent: *',
'Disallow: /',
].join('\n');

var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false);
});

it('should be disallowed when the user agent equals the robots rule in explicit mode', function () {
var contents = [
'User-agent: SomeBot',
'Disallow: /',
].join('\n');

var url = 'https://www.example.com/hello';
var userAgent = 'SomeBot';
var robots = robotsParser(url, contents);

expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true);
});
});