Skip to content

Commit

Permalink
Made url matching better! Addresses #4
Browse files Browse the repository at this point in the history
The URL matcher can now take a URL in any of the following formats:

* https://domain.com
* http://domain.com
* http://www.domain.com
* http://www.sub.domain.com
* http://sub.domain.com
* http://sub.domain.com/

(and a few more) and match it to the key

'domain.com'

It does this by first removing the protocol part of the url.

It then takes out the www. part if it exists

It proceeds to remove the ending slash

It then checks whether the URL as reduced so far can be found in the dictionary.
This is done so that we can match a specific subdomain instead of always
matching the full root domain. An example would be a company blog that
resides on a blogging service. The blogging service itself shouldn't be
penalised, but <domain>.bloggingservice.com should.
  • Loading branch information
kiriappeee committed Sep 23, 2015
1 parent be22cf8 commit 74d1e59
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 11 deletions.
25 changes: 24 additions & 1 deletion core/matcher.js
Original file line number Diff line number Diff line change
/**
 * Reduces an http(s) URL to the bare host name used as a blacklist key.
 *
 * The scheme, any "user:pass@" prefix, the port, the path/query/fragment,
 * a trailing root dot and a single leading "www." label are removed, and
 * the result is lower-cased because host names are case-insensitive.
 *
 * @param {string} url - The URL to reduce.
 * @returns {string} The normalised host, or '' when the URL does not start
 *     with "http://" or "https://".
 */
function extractHostForBlackList(url){
    // Anchor the scheme check: merely containing "http" somewhere is not enough.
    var scheme = /^https?:\/\//i.exec(url);
    if (scheme === null){
        return '';
    }
    var remainder = url.slice(scheme[0].length);
    // The authority ends at the first '/', '?' or '#', whichever comes first.
    var authorityEnd = remainder.search(/[\/?#]/);
    var authority = authorityEnd === -1 ? remainder : remainder.slice(0, authorityEnd);
    var host = authority
        .slice(authority.lastIndexOf('@') + 1) // drop userinfo, if any
        .replace(/:\d*$/, '')                  // drop the port, if any
        .replace(/\.$/, '')                    // drop the trailing root dot
        .toLowerCase();
    return host.indexOf('www.') === 0 ? host.slice(4) : host;
}

/**
 * Tells whether `host` is an own, truthy entry of the blacklist map.
 *
 * Own-property lookup keeps hosts such as "constructor" or "toString" from
 * matching members inherited from Object.prototype.
 *
 * @param {Object} urls - Map of host name to blacklist entry.
 * @param {string} host - Normalised host name to look up.
 * @returns {boolean} True when the host has a truthy own entry.
 */
function isBlackListedHost(urls, host){
    return Object.prototype.hasOwnProperty.call(urls, host) && Boolean(urls[host]);
}

/**
 * Checks whether a URL belongs to a host listed in the blacklist dictionary.
 *
 * The URL is reduced to its host and looked up as-is first; if that fails,
 * the left-most label is dropped repeatedly and every remaining parent
 * domain with at least two labels is tried. "a.b.domain.com" is therefore
 * checked as "a.b.domain.com", "b.domain.com" and "domain.com", which lets a
 * listed subdomain match without penalising its parent and also matches
 * keys under multi-label suffixes such as "example.co.uk".
 *
 * @param {string} urlToMatch - The URL to test; anything that is not an
 *     http(s) URL string yields false.
 * @param {{urls: Object}} dictionary - Blacklist whose `urls` property maps
 *     bare host names (no scheme, no "www.") to entries.
 * @returns {boolean} True when the host or one of its parent domains is listed.
 */
function doesUrlMatchBlackList(urlToMatch, dictionary){
    if (typeof urlToMatch !== 'string'){
        return false;
    }
    var candidate = extractHostForBlackList(urlToMatch);
    if (candidate === ''){
        return false;
    }
    var urls = dictionary.urls;
    while (true){
        if (isBlackListedHost(urls, candidate)){
            return true;
        }
        var firstDot = candidate.indexOf('.');
        var parent = firstDot === -1 ? '' : candidate.slice(firstDot + 1);
        // Stop before a bare top-level label ("com") is ever looked up.
        if (parent.indexOf('.') === -1){
            return false;
        }
        candidate = parent;
    }
}

// Expose the matcher on the CommonJS exports object so other modules can require it.
module.exports.doesUrlMatchBlackList = doesUrlMatchBlackList;
29 changes: 19 additions & 10 deletions core/spec/MatcherSpec.js
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
describe("Matcher", function(){
matcher = require('../matcher.js');
var dictionary = {urls:
{"https://www.heroku.com/": {
"isDirect": false,
"parent": "https://salesforce.com/",
"source": "",
"description": "",
{
"heroku.com": {
"isDirect": false,
"parent": "https://salesforce.com/",
"source": "",
"description": "",
},
"https://www.salesforce.com/": {
"isDirect": true,
"parent": "",
"source": "",
"description": "",
"salesforce.com": {
"isDirect": true,
"parent": "",
"source": "",
"description": "",
}
}
};
Expand All @@ -23,4 +24,12 @@ describe("Matcher", function(){
expect(matcher.doesUrlMatchBlackList('https://www.heroku.com/', dictionary)).toBe(true);
expect(matcher.doesUrlMatchBlackList('https://www.salesforce.com/', dictionary)).toBe(true);
});
it("should return true without www", function(){
expect(matcher.doesUrlMatchBlackList('https://heroku.com/', dictionary)).toBe(true);
expect(matcher.doesUrlMatchBlackList('https://salesforce.com/', dictionary)).toBe(true);
});
it("should return true for subdomains", function(){
expect(matcher.doesUrlMatchBlackList('https://sub.heroku.com/', dictionary)).toBe(true);
expect(matcher.doesUrlMatchBlackList('http://www.sub.salesforce.com/', dictionary)).toBe(true);
});
});

0 comments on commit 74d1e59

Please sign in to comment.