From 30fdb4ca3846bc0392e72f4e4cf0fe2830e1a39c Mon Sep 17 00:00:00 2001 From: Jason Holland Date: Tue, 22 May 2018 14:57:49 +0900 Subject: [PATCH] Fixed parent/child directory parsing and unit tests --- .../HTTP/MacroscopeHttpUrlUtils.cs | 92 +++++++++++------- .../MacroscopeTasks/MacroscopeJobMaster.cs | 96 ++++--------------- .../t/TestMacroscopeJobMaster.cs | 2 - 3 files changed, 75 insertions(+), 115 deletions(-) diff --git a/SEOMacroscopeSeriesOne/src/MacroscopeStandards/HTTP/MacroscopeHttpUrlUtils.cs b/SEOMacroscopeSeriesOne/src/MacroscopeStandards/HTTP/MacroscopeHttpUrlUtils.cs index e8c44af..172612c 100644 --- a/SEOMacroscopeSeriesOne/src/MacroscopeStandards/HTTP/MacroscopeHttpUrlUtils.cs +++ b/SEOMacroscopeSeriesOne/src/MacroscopeStandards/HTTP/MacroscopeHttpUrlUtils.cs @@ -628,13 +628,63 @@ public static int FindUrlDepth ( string Url ) } + /**************************************************************************/ + + public static string DetermineStartingDirectory ( string Url ) + { + + Uri StartUri = null; + string Path = "/"; + string StartUriPort = ""; + string StartingUrl = null; + try + { + StartUri = new Uri( Url ); + if( StartUri.Port > 0 ) + { + StartUriPort = string.Format( ":{0}", StartUri.Port ); + } + Path = StartUri.AbsolutePath; + } + catch( UriFormatException ex ) + { + DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ), true ); + } + catch( Exception ex ) + { + DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ), true ); + } + if( StartUri != null ) + { + + Path = Regex.Replace( Path, "/[^/]*$", "/", RegexOptions.IgnoreCase ); + + if( Path.Length == 0 ) + { + Path = "/"; + } + + StartingUrl = string.Join( + "", + StartUri.Scheme, + "://", + StartUri.Host, + StartUriPort, + Path + ); + + } + + return ( StartingUrl ); + + } /**************************************************************************/ @@ -673,6 +723,7 @@ public static bool IsWithinParentDirectory ( string StartUrl, string Url ) || ( CurrentUri.Scheme.ToLower() == "https" ) ) { + string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: StartUrl ); string Path = CurrentUri.AbsolutePath; string CurrentUriString; int ParentStartingDirectoryLength; @@ -694,12 +745,12 @@ public static bool IsWithinParentDirectory ( string StartUrl, string Url ) Path ); - ParentStartingDirectoryLength = StartUrl.Length; + ParentStartingDirectoryLength = StartingUrl.Length; CurrentUriStringLength = CurrentUriString.Length; if( ParentStartingDirectoryLength >= CurrentUriStringLength ) { - if( StartUrl.StartsWith( CurrentUriString, StringComparison.Ordinal ) ) + if( StartingUrl.StartsWith( CurrentUriString, StringComparison.Ordinal ) ) { IsWithin = true; } @@ -751,6 +802,7 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url ) || ( CurrentUri.Scheme.ToLower() == "https" ) ) { + string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: StartUrl ); string Path = CurrentUri.AbsolutePath; string CurrentUriString; int ChildStartingDirectoryLength; @@ -772,13 +824,13 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url ) Path ); - ChildStartingDirectoryLength = StartUrl.Length; + ChildStartingDirectoryLength = StartingUrl.Length; CurrentUriStringLength = CurrentUriString.Length; if( CurrentUriStringLength >= ChildStartingDirectoryLength ) { - if( CurrentUriString.StartsWith( StartUrl, StringComparison.Ordinal ) ) + if( CurrentUriString.StartsWith( StartingUrl, StringComparison.Ordinal ) ) { IsWithin = true; } @@ -792,37 +844,7 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url ) return ( IsWithin ); } - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + /**************************************************************************/ public static string CleanUrlCss ( string CssProperty ) diff --git a/SEOMacroscopeSeriesOne/src/MacroscopeTasks/MacroscopeJobMaster.cs b/SEOMacroscopeSeriesOne/src/MacroscopeTasks/MacroscopeJobMaster.cs index 027b8d6..5b34e9b 100644 --- a/SEOMacroscopeSeriesOne/src/MacroscopeTasks/MacroscopeJobMaster.cs +++ b/SEOMacroscopeSeriesOne/src/MacroscopeTasks/MacroscopeJobMaster.cs @@ -372,19 +372,17 @@ public MacroscopeDataExtractorXpaths GetDataExtractorXpaths () public bool Execute () { - this.DebugMsg( string.Format( "Start URL: {0}", this.StartUrl ) ); + this.DebugMsg( string.Format( "Start URL: {0}", this.GetStartUrl() ) ); //this.LogEntry( string.Format( "Executing with Start URL: {0}", this.StartUrl ) ); - this.StartUrl = MacroscopeHttpUrlUtils.SanitizeUrl( Url: this.StartUrl ); + this.SetStartUrl( Url: MacroscopeHttpUrlUtils.SanitizeUrl( Url: this.GetStartUrl() ) ); - this.DocCollection.SetStartUrl( Url: this.StartUrl ); - - this.DetermineStartingDirectory(); + this.DocCollection.SetStartUrl( Url: this.GetStartUrl() ); this.SetThreadsStop( Stopped: false ); - this.AllowedHosts.AddFromUrl( Url: this.StartUrl ); + this.AllowedHosts.AddFromUrl( Url: this.GetStartUrl() ); if( !this.PeekUrlQueue() ) { @@ -392,7 +390,7 @@ public bool Execute () { // Add robots.txt URL to queue if( MacroscopePreferencesManager.GetFollowRobotsProtocol() ) { - string RobotsUrl = MacroscopeRobots.GenerateRobotUrl( Url: this.StartUrl ); + string RobotsUrl = MacroscopeRobots.GenerateRobotUrl( Url: this.GetStartUrl() ); if( !string.IsNullOrEmpty( RobotsUrl ) ) { this.AddUrlQueueItem( Url: RobotsUrl ); @@ -406,7 +404,7 @@ public bool Execute () MacroscopeSitemapPaths SitemapPaths = new MacroscopeSitemapPaths(); foreach( string SitemapPath in SitemapPaths.IterateSitemapPaths() ) { - string SitemapUrl = MacroscopeSitemapPaths.GenerateSitemapUrl( Url: this.StartUrl, SitemapPath: SitemapPath ); + string SitemapUrl = MacroscopeSitemapPaths.GenerateSitemapUrl( Url: this.GetStartUrl(), SitemapPath: SitemapPath ); if( !string.IsNullOrEmpty( SitemapUrl ) ) { this.AddUrlQueueItem( Url: SitemapUrl ); @@ -418,7 +416,7 @@ public bool Execute () { // Add humans.txt URL to queue if( MacroscopePreferencesManager.GetProbeHumansText() ) { - string HumansUrl = MacroscopeHumans.GenerateHumansUrl( Url: this.StartUrl ); + string HumansUrl = MacroscopeHumans.GenerateHumansUrl( Url: this.GetStartUrl() ); if( !string.IsNullOrEmpty( HumansUrl ) ) { this.AddUrlQueueItem( Url: HumansUrl ); @@ -426,9 +424,9 @@ public bool Execute () } } - this.IncludeExcludeUrls.AddExplicitIncludeUrl( Url: this.StartUrl ); + this.IncludeExcludeUrls.AddExplicitIncludeUrl( Url: this.GetStartUrl() ); - this.AddUrlQueueItem( Url: this.StartUrl ); + this.AddUrlQueueItem( Url: this.GetStartUrl() ); foreach( MacroscopeDocument msDoc in this.GetDocCollection().IterateDocuments() ) { @@ -437,9 +435,9 @@ public bool Execute () } - this.ProbeRobotsFile( Url: this.StartUrl ); + this.ProbeRobotsFile( Url: this.GetStartUrl() ); - this.SetCrawlDelay( Url: this.StartUrl ); + this.SetCrawlDelay( Url: this.GetStartUrl() ); this.SpawnWorkers(); @@ -450,7 +448,7 @@ public bool Execute () this.TaskController.ICallbackScanComplete(); } - this.AddUpdateDisplayQueue( Url: this.StartUrl ); + this.AddUpdateDisplayQueue( Url: this.GetStartUrl() ); return ( true ); @@ -1089,6 +1087,7 @@ private void ResetLink ( MacroscopeDocument msDoc ) public string SetStartUrl ( string Url ) { this.StartUrl = Url; + this.DetermineStartingDirectory(); return ( this.StartUrl ); } @@ -1103,7 +1102,7 @@ public string GetStartUrl () public string GetStartUriHostAndPort () { - Uri StartUri = new Uri( this.StartUrl ); + Uri StartUri = new Uri( this.GetStartUrl() ); string StartUriHostAndPort = null; if( StartUri != null ) { @@ -1148,70 +1147,11 @@ private void IncPagesFound () /** Crawl Parent / Child Directories **************************************/ - public void DetermineStartingDirectory () + private void DetermineStartingDirectory () { - - Uri StartUri = null; - string Path = "/"; - string StartUriPort = ""; - - try - { - - StartUri = new Uri( this.GetStartUrl() ); - - if( StartUri.Port > 0 ) - { - StartUriPort = string.Format( ":{0}", StartUri.Port ); - } - - Path = StartUri.AbsolutePath; - - } - catch( UriFormatException ex ) - { - this.DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ) ); - } - catch( Exception ex ) - { - this.DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ) ); - } - - - if( StartUri != null ) - { - - Path = Regex.Replace( Path, "/[^/]*$", "/", RegexOptions.IgnoreCase ); - - if( Path.Length == 0 ) - { - Path = "/"; - } - - this.SetParentStartingDirectory( - Url: string.Join( - "", - StartUri.Scheme, - "://", - StartUri.Host, - StartUriPort, - Path - ) - ); - - this.SetChildStartingDirectory( - Url: string.Join( - "", - StartUri.Scheme, - "://", - StartUri.Host, - StartUriPort, - Path - ) - ); - - } - + string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: this.GetStartUrl() ); + this.SetParentStartingDirectory( Url: StartingUrl ); + this.SetChildStartingDirectory( Url: StartingUrl ); } /** -------------------------------------------------------------------- **/ diff --git a/SEOMacroscopeSeriesOne/src/MacroscopeTasks/t/TestMacroscopeJobMaster.cs b/SEOMacroscopeSeriesOne/src/MacroscopeTasks/t/TestMacroscopeJobMaster.cs index 603ec6a..2d534a4 100644 --- a/SEOMacroscopeSeriesOne/src/MacroscopeTasks/t/TestMacroscopeJobMaster.cs +++ b/SEOMacroscopeSeriesOne/src/MacroscopeTasks/t/TestMacroscopeJobMaster.cs @@ -51,8 +51,6 @@ public void TestJobMasterStartUrl () JobMaster.SetStartUrl( Url: StartUrl ); - JobMaster.DetermineStartingDirectory(); - Assert.AreEqual( StartUrl, JobMaster.GetStartUrl(), string.Format( "FAIL: {0}", StartUrl ) ); }