From 74d1e595e96ba6f8a0f1d7379111f70d3afae857 Mon Sep 17 00:00:00 2001 From: Adnan Issadeen Date: Wed, 23 Sep 2015 17:39:45 +0530 Subject: [PATCH] Made url matching better! Addresses #4 The url matcher can now take a url in any of the formats * https://domain.com * http://domain.com * http://www.domain.com * http://www.sub.domain.com * http://sub.domain.com * http://sub.domain.com/ and a few more, and match it to the key 'domain.com'. It does this by first removing the protocol part of the url. It then takes out the 'www.' part if it exists. It proceeds to remove the ending slash (along with any path that follows it). It then checks if it can find the url thus far inside the dictionary. This is done so that we can match a subdomain only, instead of always matching the full root domain. If that lookup fails, it strips the leading subdomains until only the root domain remains and checks the dictionary again. An example would be a company blog that resides on a blogging service. The blogging service shouldn't be penalised, but company.bloggingservice.com should. --- core/matcher.js | 25 ++++++++++++++++++++++++- core/spec/MatcherSpec.js | 29 +++++++++++++++++++---------- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/core/matcher.js b/core/matcher.js index 702436f..8a6bedd 100644 --- a/core/matcher.js +++ b/core/matcher.js @@ -2,10 +2,33 @@ function doesUrlMatchBlackList(urlToMatch, dictionary){ if(urlToMatch.indexOf('http') == -1){ return false; } + urlToMatch = urlToMatch.slice(urlToMatch.indexOf('://')+3); + if (urlToMatch.indexOf('www.') === 0){ + urlToMatch = urlToMatch.slice(urlToMatch.indexOf('www.') + 4); + } + if (urlToMatch.indexOf('/') !== -1){ + urlToMatch = urlToMatch.slice(0, urlToMatch.indexOf('/')); + } + console.log(urlToMatch); + if(dictionary.urls[urlToMatch]){ + return true; + } + while ( true ){ + if(urlToMatch.indexOf('.') === -1){ + break; + } + if (urlToMatch.indexOf('.') !== urlToMatch.lastIndexOf('.')){ + urlToMatch = urlToMatch.slice(urlToMatch.indexOf('.')+1); + } + if(urlToMatch.indexOf('.') === urlToMatch.lastIndexOf('.')){ + break; + } + } + console.log(urlToMatch); if(dictionary.urls[urlToMatch]){ 
return true; } - + return false; } module.exports.doesUrlMatchBlackList = doesUrlMatchBlackList; diff --git a/core/spec/MatcherSpec.js b/core/spec/MatcherSpec.js index d97af46..05a80e7 100644 --- a/core/spec/MatcherSpec.js +++ b/core/spec/MatcherSpec.js @@ -1,17 +1,18 @@ describe("Matcher", function(){ matcher = require('../matcher.js'); var dictionary = {urls: - {"https://www.heroku.com/": { - "isDirect": false, - "parent": "https://salesforce.com/", - "source": "", - "description": "", + { + "heroku.com": { + "isDirect": false, + "parent": "https://salesforce.com/", + "source": "", + "description": "", }, - "https://www.salesforce.com/": { - "isDirect": true, - "parent": "", - "source": "", - "description": "", + "salesforce.com": { + "isDirect": true, + "parent": "", + "source": "", + "description": "", } } }; @@ -23,4 +24,12 @@ describe("Matcher", function(){ expect(matcher.doesUrlMatchBlackList('https://www.heroku.com/', dictionary)).toBe(true); expect(matcher.doesUrlMatchBlackList('https://www.salesforce.com/', dictionary)).toBe(true); }); + it("should return true without www", function(){ + expect(matcher.doesUrlMatchBlackList('https://heroku.com/', dictionary)).toBe(true); + expect(matcher.doesUrlMatchBlackList('https://salesforce.com/', dictionary)).toBe(true); + }); + it("should return true for subdomains", function(){ + expect(matcher.doesUrlMatchBlackList('https://sub.heroku.com/', dictionary)).toBe(true); + expect(matcher.doesUrlMatchBlackList('http://www.sub.salesforce.com/', dictionary)).toBe(true); + }); });