Skip to content

Commit

Permalink
Fixed parent/child directory parsing and unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
nazuke committed May 22, 2018
1 parent 4ea8ebb commit 30fdb4c
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 115 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -628,13 +628,63 @@ public static int FindUrlDepth ( string Url )

}

/**************************************************************************/

/// <summary>
/// Reduces a URL to the URL of the directory that contains it: scheme, host,
/// optional port suffix, and the path with everything after its last slash removed.
/// </summary>
/// <param name="Url">The URL to reduce; it must be parseable by <see cref="Uri"/>.</param>
/// <returns>
/// The directory URL, or null when <paramref name="Url"/> could not be turned
/// into a <see cref="Uri"/> (the failure is reported through DebugMsg).
/// </returns>
public static string DetermineStartingDirectory ( string Url )
{

  Uri ParsedUri = null;
  string DirectoryPath = "/";
  string PortSuffix = "";

  try
  {

    ParsedUri = new Uri( Url );

    // NOTE(review): Uri.Port is documented to report the scheme's default port
    // when the URL has no explicit one, so a suffix such as ":80" is likely
    // always appended - confirm callers build their comparison URL the same way.
    if( ParsedUri.Port > 0 )
    {
      PortSuffix = string.Format( ":{0}", ParsedUri.Port );
    }

    DirectoryPath = ParsedUri.AbsolutePath;

  }
  catch( Exception ex )
  {
    // Any failure (malformed URL, null argument, ...) is logged, not rethrown.
    DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ), true );
  }

  // Construction failed: there is nothing to build a directory URL from.
  if( ParsedUri == null )
  {
    return ( null );
  }

  // Drop the trailing segment after the final slash, keeping the slash itself.
  DirectoryPath = Regex.Replace( DirectoryPath, "/[^/]*$", "/", RegexOptions.IgnoreCase );

  if( DirectoryPath.Length == 0 )
  {
    DirectoryPath = "/";
  }

  return ( ParsedUri.Scheme + "://" + ParsedUri.Host + PortSuffix + DirectoryPath );

}

/**************************************************************************/

Expand Down Expand Up @@ -673,6 +723,7 @@ public static bool IsWithinParentDirectory ( string StartUrl, string Url )
|| ( CurrentUri.Scheme.ToLower() == "https" ) )
{

string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: StartUrl );
string Path = CurrentUri.AbsolutePath;
string CurrentUriString;
int ParentStartingDirectoryLength;
Expand All @@ -694,12 +745,12 @@ public static bool IsWithinParentDirectory ( string StartUrl, string Url )
Path
);

ParentStartingDirectoryLength = StartUrl.Length;
ParentStartingDirectoryLength = StartingUrl.Length;
CurrentUriStringLength = CurrentUriString.Length;

if( ParentStartingDirectoryLength >= CurrentUriStringLength )
{
if( StartUrl.StartsWith( CurrentUriString, StringComparison.Ordinal ) )
if( StartingUrl.StartsWith( CurrentUriString, StringComparison.Ordinal ) )
{
IsWithin = true;
}
Expand Down Expand Up @@ -751,6 +802,7 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url )
|| ( CurrentUri.Scheme.ToLower() == "https" ) )
{

string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: StartUrl );
string Path = CurrentUri.AbsolutePath;
string CurrentUriString;
int ChildStartingDirectoryLength;
Expand All @@ -772,13 +824,13 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url )
Path
);

ChildStartingDirectoryLength = StartUrl.Length;
ChildStartingDirectoryLength = StartingUrl.Length;
CurrentUriStringLength = CurrentUriString.Length;

if( CurrentUriStringLength >= ChildStartingDirectoryLength )
{

if( CurrentUriString.StartsWith( StartUrl, StringComparison.Ordinal ) )
if( CurrentUriString.StartsWith( StartingUrl, StringComparison.Ordinal ) )
{
IsWithin = true;
}
Expand All @@ -792,37 +844,7 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url )
return ( IsWithin );

}
































/**************************************************************************/

public static string CleanUrlCss ( string CssProperty )
Expand Down
96 changes: 18 additions & 78 deletions SEOMacroscopeSeriesOne/src/MacroscopeTasks/MacroscopeJobMaster.cs
Original file line number Diff line number Diff line change
Expand Up @@ -372,27 +372,25 @@ public MacroscopeDataExtractorXpaths GetDataExtractorXpaths ()
public bool Execute ()
{

this.DebugMsg( string.Format( "Start URL: {0}", this.StartUrl ) );
this.DebugMsg( string.Format( "Start URL: {0}", this.GetStartUrl() ) );

//this.LogEntry( string.Format( "Executing with Start URL: {0}", this.StartUrl ) );

this.StartUrl = MacroscopeHttpUrlUtils.SanitizeUrl( Url: this.StartUrl );
this.SetStartUrl( Url: MacroscopeHttpUrlUtils.SanitizeUrl( Url: this.GetStartUrl() ) );

this.DocCollection.SetStartUrl( Url: this.StartUrl );

this.DetermineStartingDirectory();
this.DocCollection.SetStartUrl( Url: this.GetStartUrl() );

this.SetThreadsStop( Stopped: false );

this.AllowedHosts.AddFromUrl( Url: this.StartUrl );
this.AllowedHosts.AddFromUrl( Url: this.GetStartUrl() );

if( !this.PeekUrlQueue() )
{

{ // Add robots.txt URL to queue
if( MacroscopePreferencesManager.GetFollowRobotsProtocol() )
{
string RobotsUrl = MacroscopeRobots.GenerateRobotUrl( Url: this.StartUrl );
string RobotsUrl = MacroscopeRobots.GenerateRobotUrl( Url: this.GetStartUrl() );
if( !string.IsNullOrEmpty( RobotsUrl ) )
{
this.AddUrlQueueItem( Url: RobotsUrl );
Expand All @@ -406,7 +404,7 @@ public bool Execute ()
MacroscopeSitemapPaths SitemapPaths = new MacroscopeSitemapPaths();
foreach( string SitemapPath in SitemapPaths.IterateSitemapPaths() )
{
string SitemapUrl = MacroscopeSitemapPaths.GenerateSitemapUrl( Url: this.StartUrl, SitemapPath: SitemapPath );
string SitemapUrl = MacroscopeSitemapPaths.GenerateSitemapUrl( Url: this.GetStartUrl(), SitemapPath: SitemapPath );
if( !string.IsNullOrEmpty( SitemapUrl ) )
{
this.AddUrlQueueItem( Url: SitemapUrl );
Expand All @@ -418,17 +416,17 @@ public bool Execute ()
{ // Add humans.txt URL to queue
if( MacroscopePreferencesManager.GetProbeHumansText() )
{
string HumansUrl = MacroscopeHumans.GenerateHumansUrl( Url: this.StartUrl );
string HumansUrl = MacroscopeHumans.GenerateHumansUrl( Url: this.GetStartUrl() );
if( !string.IsNullOrEmpty( HumansUrl ) )
{
this.AddUrlQueueItem( Url: HumansUrl );
}
}
}

this.IncludeExcludeUrls.AddExplicitIncludeUrl( Url: this.StartUrl );
this.IncludeExcludeUrls.AddExplicitIncludeUrl( Url: this.GetStartUrl() );

this.AddUrlQueueItem( Url: this.StartUrl );
this.AddUrlQueueItem( Url: this.GetStartUrl() );

foreach( MacroscopeDocument msDoc in this.GetDocCollection().IterateDocuments() )
{
Expand All @@ -437,9 +435,9 @@ public bool Execute ()

}

this.ProbeRobotsFile( Url: this.StartUrl );
this.ProbeRobotsFile( Url: this.GetStartUrl() );

this.SetCrawlDelay( Url: this.StartUrl );
this.SetCrawlDelay( Url: this.GetStartUrl() );

this.SpawnWorkers();

Expand All @@ -450,7 +448,7 @@ public bool Execute ()
this.TaskController.ICallbackScanComplete();
}

this.AddUpdateDisplayQueue( Url: this.StartUrl );
this.AddUpdateDisplayQueue( Url: this.GetStartUrl() );

return ( true );

Expand Down Expand Up @@ -1089,6 +1087,7 @@ private void ResetLink ( MacroscopeDocument msDoc )
/// <summary>
/// Stores the start URL and then recomputes the starting directories derived from it.
/// </summary>
/// <param name="Url">The URL to store in the StartUrl field.</param>
/// <returns>The StartUrl field as it stands after the directories have been recomputed.</returns>
public string SetStartUrl ( string Url )
{
this.StartUrl = Url;
// Must follow the assignment: DetermineStartingDirectory() works from GetStartUrl()
// (presumably an accessor for the field set above - confirm).
this.DetermineStartingDirectory();
return ( this.StartUrl );
}

Expand All @@ -1103,7 +1102,7 @@ public string GetStartUrl ()

public string GetStartUriHostAndPort ()
{
Uri StartUri = new Uri( this.StartUrl );
Uri StartUri = new Uri( this.GetStartUrl() );
string StartUriHostAndPort = null;
if( StartUri != null )
{
Expand Down Expand Up @@ -1148,70 +1147,11 @@ private void IncPagesFound ()

/** Crawl Parent / Child Directories **************************************/

public void DetermineStartingDirectory ()
private void DetermineStartingDirectory ()
{

Uri StartUri = null;
string Path = "/";
string StartUriPort = "";

try
{

StartUri = new Uri( this.GetStartUrl() );

if( StartUri.Port > 0 )
{
StartUriPort = string.Format( ":{0}", StartUri.Port );
}

Path = StartUri.AbsolutePath;

}
catch( UriFormatException ex )
{
this.DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ) );
}
catch( Exception ex )
{
this.DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ) );
}


if( StartUri != null )
{

Path = Regex.Replace( Path, "/[^/]*$", "/", RegexOptions.IgnoreCase );

if( Path.Length == 0 )
{
Path = "/";
}

this.SetParentStartingDirectory(
Url: string.Join(
"",
StartUri.Scheme,
"://",
StartUri.Host,
StartUriPort,
Path
)
);

this.SetChildStartingDirectory(
Url: string.Join(
"",
StartUri.Scheme,
"://",
StartUri.Host,
StartUriPort,
Path
)
);

}

string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: this.GetStartUrl() );
this.SetParentStartingDirectory( Url: StartingUrl );
this.SetChildStartingDirectory( Url: StartingUrl );
}

/** -------------------------------------------------------------------- **/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ public void TestJobMasterStartUrl ()

JobMaster.SetStartUrl( Url: StartUrl );

JobMaster.DetermineStartingDirectory();

Assert.AreEqual( StartUrl, JobMaster.GetStartUrl(), string.Format( "FAIL: {0}", StartUrl ) );

}
Expand Down

0 comments on commit 30fdb4c

Please sign in to comment.