Made url matching better! Addresses #4

The URL matcher can now take a URL in any of the formats * https://domain.com * http://domain.com * http://www.domain.com * http://www.sub.domain.com * http://sub.domain.com * http://sub.domain.com/ (and a few more) and match it to the key 'domain.com'. It does this by first removing the protocol part of the URL. It then takes out the leading 'www.' part if it exists. It proceeds to remove the ending slash, along with anything that follows it. It then checks whether it can find the URL thus far inside the dictionary. This is done so that we can match a subdomain only, instead of always matching the full root domain. If that lookup finds nothing, the leading subdomain labels are stripped until only the root domain remains, and that is looked up instead. An example would be a company blog that resides on a blogging service: the blogging service shouldn't be penalised, but <domain>.bloggingservice.com should.
spartakode · Sep 23, 2015 · 74d1e59 · 74d1e59
1 parent be22cf8
commit 74d1e59
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 11 deletions.
diff --git a/core/matcher.js b/core/matcher.js
@@ -2,10 +2,33 @@ function doesUrlMatchBlackList(urlToMatch, dictionary){
     if(urlToMatch.indexOf('http') == -1){
         return false;
     }
+    urlToMatch = urlToMatch.slice(urlToMatch.indexOf('://')+3);
+    if (urlToMatch.indexOf('www.') === 0){
+        urlToMatch = urlToMatch.slice(urlToMatch.indexOf('www.') + 4);
+    }
+    if (urlToMatch.indexOf('/') !== -1){
+        urlToMatch = urlToMatch.slice(0, urlToMatch.indexOf('/'));
+    }
+    console.log(urlToMatch);
+    if(dictionary.urls[urlToMatch]){
+        return true;
+    }
+    while ( true ){
+        if(urlToMatch.indexOf('.') === -1){
+            break;
+        }
+        if (urlToMatch.indexOf('.') !== urlToMatch.lastIndexOf('.')){
+            urlToMatch = urlToMatch.slice(urlToMatch.indexOf('.')+1);
+        }
+        if(urlToMatch.indexOf('.') === urlToMatch.lastIndexOf('.')){
+            break;
+        }
+    }
+    console.log(urlToMatch);
     if(dictionary.urls[urlToMatch]){
         return true;
     }
-
+    return false;
 }
 
 module.exports.doesUrlMatchBlackList = doesUrlMatchBlackList;
diff --git a/core/spec/MatcherSpec.js b/core/spec/MatcherSpec.js
@@ -1,17 +1,18 @@
 describe("Matcher", function(){
     matcher = require('../matcher.js');
     var dictionary = {urls:
-        {"https://www.heroku.com/": {
-            "isDirect": false,
-            "parent": "https://salesforce.com/",
-            "source": "",
-            "description": "",
+        {
+            "heroku.com": {
+                "isDirect": false,
+                "parent": "https://salesforce.com/",
+                "source": "",
+                "description": "",
             },
-        "https://www.salesforce.com/": {
-            "isDirect": true,
-            "parent": "",
-            "source": "",
-            "description": "",
+            "salesforce.com": {
+                "isDirect": true,
+                "parent": "",
+                "source": "",
+                "description": "",
             }
         }
     };
@@ -23,4 +24,12 @@ describe("Matcher", function(){
         expect(matcher.doesUrlMatchBlackList('https://www.heroku.com/', dictionary)).toBe(true);
         expect(matcher.doesUrlMatchBlackList('https://www.salesforce.com/', dictionary)).toBe(true);
     });
+    it("should return true without www", function(){
+        expect(matcher.doesUrlMatchBlackList('https://heroku.com/', dictionary)).toBe(true);
+        expect(matcher.doesUrlMatchBlackList('https://salesforce.com/', dictionary)).toBe(true);
+    });
+    it("should return true for subdomains", function(){
+        expect(matcher.doesUrlMatchBlackList('https://sub.heroku.com/', dictionary)).toBe(true);
+        expect(matcher.doesUrlMatchBlackList('http://www.sub.salesforce.com/', dictionary)).toBe(true);
+    });
 });