
Commit

Remove abot and use HttpClient and AngleSharp directly
close #40 Fix 뽐뿌 (Ppomppu) crawling error
elky84 committed Jul 5, 2024
1 parent 206cf59 commit b971f43
Showing 15 changed files with 86 additions and 209 deletions.
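
For orientation before the per-file diffs: a condensed sketch of the fetch-and-parse flow this commit switches to, adapted from the CrawlerBase changes below. The class name, URL, and CSS selector are placeholders, not part of the commit.

using System;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using AngleSharp;

internal static class FetchAndParseSketch
{
    public static async Task RunAsync()
    {
        // Needed for non-UTF-8 (e.g. EUC-KR) pages; previously registered in Cli/Program.cs.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

        // A plain HttpClient request replaces Abot's PoliteWebCrawler/PageRequester pipeline.
        using var client = new HttpClient { Timeout = TimeSpan.FromSeconds(30) };
        client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0");

        var response = await client.GetAsync("https://www.example.com/board?page=1"); // placeholder URL
        if (response.StatusCode != HttpStatusCode.OK)
            return;

        var html = await response.Content.ReadAsStringAsync();

        // AngleSharp parses the downloaded HTML into an IDocument for querying.
        var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
        var document = await context.OpenAsync(req => req.Content(html));

        foreach (var row in document.QuerySelectorAll("tbody tr")) // placeholder selector
            Console.WriteLine(row.TextContent.Trim());
    }
}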
3 changes: 0 additions & 3 deletions Cli/Program.cs
@@ -11,9 +11,6 @@ internal static class Program
{
private static async Task Main(string[] args)
{
var provider = CodePagesEncodingProvider.Instance;
Encoding.RegisterProvider(provider);

Log.Logger = new LoggerConfiguration()
.MinimumLevel.Warning()
.WriteTo.Console()
5 changes: 2 additions & 3 deletions README.md
@@ -1,5 +1,5 @@
[![Website](https://img.shields.io/website-up-down-green-red/http/shields.io.svg?label=elky-essay)](https://elky84.github.io)
![Made with](https://img.shields.io/badge/made%20with-.NET7-brightgreen.svg)
![Made with](https://img.shields.io/badge/made%20with-.NET8-brightgreen.svg)
![Made with](https://img.shields.io/badge/made%20with-JavaScript-blue.svg)
![Made with](https://img.shields.io/badge/made%20with-MongoDB-red.svg)

@@ -16,8 +16,7 @@

# web-crawler

* Written based on .NET 7 and ASP NET CORE 3.
* [abot2](https://github.com/sjdirect/abot) was used as the web crawler.
* Written based on .NET 8 and ASP NET CORE.
* Crawl targets must be registered as a Source. [Example](https://github.com/elky84/web-crawler/blob/master/Http/source.http)
* The boards registered as Sources are sites and boards that were verified to crawl correctly through testing, but their markup may have changed, so additional exception handling may be required.
* Notifications support Discord and Slack. A Notification record is mapped to a Source, and when new data appears for that Source, a notification is sent.
11 changes: 4 additions & 7 deletions WebCrawler/Crawler/ClienCrawler.cs
@@ -4,24 +4,21 @@
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using AngleSharp.Dom;
using WebCrawler.Models;

namespace WebCrawler.Crawler
{
public class ClienCrawler : CrawlerBase
public class ClienCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source)
: CrawlerBase(onCrawlDataDelegate, mongoDb, $"https://www.clien.net/service/board/{source.BoardId}", source)
{
public ClienCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) :
base(onCrawlDataDelegate, mongoDb, $"https://www.clien.net/service/board/{source.BoardId}", source)
{
}

protected override string UrlComposite(int page)
{
// pages start at 0
return $"{UrlBase}?od=T32&po={page - 1}";
}

protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
protected override void OnPageCrawl(IDocument document)
{
var tdContent = document.QuerySelectorAll("div")
.Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("list_item") && x.ClassName.Contains("symph_row"))
114 changes: 27 additions & 87 deletions WebCrawler/Crawler/CrawlerBase.cs
@@ -1,7 +1,4 @@
using Abot2.Core;
using Abot2.Crawler;
using Abot2.Poco;
using AngleSharp.Html.Parser;
using EzAspDotNet.Util;
using EzMongoDb.Util;
using MongoDB.Driver;
@@ -11,6 +8,7 @@
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
@@ -32,13 +30,9 @@ public abstract class CrawlerBase
private readonly MongoDbUtil<CrawlingData> _mongoDbCrawlingData;

private CrawlDataDelegate OnCrawlDataDelegate { get; set; }

private HtmlParser _parser = new();


private int _executing;

protected ConcurrentBag<CrawlingData> ConcurrentBag { get; } = [];


protected CrawlerBase(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, string urlBase, Source source)
{
if (mongoDb != null)
@@ -50,125 +44,71 @@ protected CrawlerBase(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mong
UrlBase = urlBase;
Source = source;
}

protected virtual CrawlConfiguration Config()
{
return new CrawlConfiguration
{
MaxConcurrentThreads = 10,
MaxPagesToCrawl = 1,
MaxPagesToCrawlPerDomain = 5,
MinRetryDelayInMilliseconds = 1000,
MinCrawlDelayPerDomainMilliSeconds = 5000,
IsForcedLinkParsingEnabled = true,
HttpProtocolVersion = HttpProtocolVersion.Version11,
UserAgentString = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
};
}

protected virtual PoliteWebCrawler Create()
{
var crawlerInstance = new PoliteWebCrawler(Config(),
null,
null,
null,
new PageRequester(Config(), new WebContentExtractor(), CreateHttpClient()),
null,
null,
null,
null);
crawlerInstance.PageCrawlStarting += ProcessPageCrawlStarting;
crawlerInstance.PageCrawlCompleted += ProcessPageCrawlCompleted;
crawlerInstance.PageCrawlDisallowed += PageCrawlDisallowed;
crawlerInstance.PageLinksCrawlDisallowed += PageLinksCrawlDisallowed;
return crawlerInstance;
}


private static HttpClient CreateHttpClient()
{
var client = new HttpClient();
client.DefaultRequestHeaders.Add("Accept-Charset", "utf-8");
client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");

client.Timeout = TimeSpan.FromSeconds(30);
client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));
return client;
}

public virtual async Task RunAsync()
{
var crawlerInstance = Create();

for (var page = Source.PageMin; page <= Source.PageMax; ++page)
{
await ExecuteAsync(crawlerInstance, page);
await ExecuteAsync(page);
Thread.Sleep(Source.Interval);
}
}

protected virtual bool CanTwice() => true;

protected async Task<bool> ExecuteAsync(PoliteWebCrawler crawler, int page)
protected async Task<bool> ExecuteAsync(int page)
{
if (!CanTwice() && 0 != Interlocked.Exchange(ref _executing, 1)) return false;
var builder = new UriBuilder(UrlComposite(page));
var crawlResult = await crawler.CrawlAsync(builder.Uri);
await Crawling(builder.Uri);
Interlocked.Exchange(ref _executing, 0);
return crawlResult.ErrorOccurred;

return true;
}

protected abstract string UrlComposite(int page);

private void ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
ConcurrentBag.Clear();
var pageToCrawl = e.PageToCrawl;
Log.Logger.Debug("About to crawl link {UriAbsoluteUri} which was found on page {ParentUriAbsoluteUri}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
}

private void ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
private async Task Crawling(Uri uri)
{
var crawledPage = e.CrawledPage;
if (crawledPage.HttpRequestException != null)
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

using var client = CreateHttpClient();
var response = await client.GetAsync(uri.AbsoluteUri);
if (response.StatusCode != HttpStatusCode.OK)
{
Log.Logger.Error("Crawl of page failed. <Url:{UriAbsoluteUri}> <Exception:{Message}>", crawledPage.Uri?.AbsoluteUri, crawledPage.HttpRequestException?.Message);
Log.Logger.Error("response failed. <Url:{UriAbsoluteUri}> <StatusCode:{StatusCode}>", uri.AbsoluteUri, response.StatusCode);
return;
}

if (crawledPage.HttpResponseMessage?.StatusCode != HttpStatusCode.OK)

var utf8String = await response.Content.ReadAsStringAsync();
if (string.IsNullOrEmpty(utf8String))
{
Log.Logger.Error("Crawl of page failed. <Url:{UriAbsoluteUri}> <StatusCode:{StatusCode}>", crawledPage.Uri?.AbsoluteUri, crawledPage.HttpResponseMessage?.StatusCode);
Log.Logger.Error("response content is null. <Url:{UriAbsoluteUri}>", uri.AbsoluteUri);
return;
}

if (string.IsNullOrEmpty(crawledPage.Content?.Text))
{
Log.Logger.Error("Crawl of page failed. <Url:{UriAbsoluteUri}> <Content:{Content}>", crawledPage.Uri?.AbsoluteUri, crawledPage.HttpResponseMessage?.Content);
return;
}
var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader());
var document = await context.OpenAsync(req => req.Content(utf8String));

Log.Logger.Debug("Crawl of page succeeded {UriAbsoluteUri}", crawledPage.Uri?.AbsoluteUri);

OnPageCrawl(crawledPage.AngleSharpHtmlDocument);
OnPageCrawl(document);
}

protected abstract void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document);
protected abstract void OnPageCrawl(AngleSharp.Dom.IDocument document);

protected virtual string UrlCompositeHref(string href)
{
return UrlBase.CutAndComposite("/", 0, 3, href);
}

private static void PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
var crawledPage = e.CrawledPage;
Log.Logger.Error("Did not crawl the links on page {UriAbsoluteUri} due to {EDisallowedReason}", crawledPage.Uri.AbsoluteUri, e.DisallowedReason);
}

private static void PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
var pageToCrawl = e.PageToCrawl;
Log.Logger.Error("Did not crawl page {UriAbsoluteUri} due to {EDisallowedReason}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
}

protected async Task<CrawlingData> OnCrawlData(CrawlingData crawlingData)
{
// strip trailing whitespace after the text
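
As a usage note (not part of this commit): under the new CrawlerBase contract, a derived crawler only composes page URLs and reads the parsed AngleSharp document; fetching, encoding registration, and parsing stay in the base class. A hypothetical example with a placeholder URL and selectors:

using AngleSharp.Dom;
using MongoDB.Driver;
using Serilog;
using WebCrawler.Models;

namespace WebCrawler.Crawler
{
    // Hypothetical crawler illustrating the new abstract members; not in the repository.
    public class ExampleCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source)
        : CrawlerBase(onCrawlDataDelegate, mongoDb, "https://example.com/board", source)
    {
        protected override string UrlComposite(int page)
        {
            return $"{UrlBase}?page={page}";
        }

        protected override void OnPageCrawl(IDocument document)
        {
            // Placeholder selectors; real crawlers query the board-specific markup.
            foreach (var row in document.QuerySelectorAll("tbody tr"))
            {
                var title = row.QuerySelector("td.title")?.TextContent.Trim();
                Log.Logger.Debug("Row title: {Title}", title);
            }
        }
    }
}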
49 changes: 11 additions & 38 deletions WebCrawler/Crawler/FmkoreaCrawler.cs
@@ -1,51 +1,24 @@
using Abot2.Crawler;
using Abot2.Poco;
using EzAspDotNet.Util;
using MongoDB.Driver;
using Serilog;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using AngleSharp.Dom;
using WebCrawler.Models;

namespace WebCrawler.Crawler
{
public class FmkoreaCrawler : CrawlerBase
public class FmkoreaCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source)
: CrawlerBase(onCrawlDataDelegate, mongoDb, $"https://www.fmkorea.com/index.php", source)
{
private static readonly Queue<PoliteWebCrawler> CrawlerQueue = new();

public FmkoreaCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) :
base(onCrawlDataDelegate, mongoDb, $"https://www.fmkorea.com/index.php", source)
{
foreach (var _ in Enumerable.Range(0, 5))
CrawlerQueue.Enqueue(base.Create());
}

protected override CrawlConfiguration Config()
{
var config = base.Config();
config.MaxPagesToCrawlPerDomain = 1;
config.MaxRobotsDotTextCrawlDelayInSeconds = 60;
config.MaxConcurrentThreads = 1;
config.MinRetryDelayInMilliseconds = 60000;
config.MinCrawlDelayPerDomainMilliSeconds = 60000;
return config;
}

protected override PoliteWebCrawler Create()
{
var crawler = CrawlerQueue.Dequeue();
CrawlerQueue.Enqueue(crawler);
return crawler;
}

protected override string UrlComposite(int page)
{
return $"{UrlBase}?mid={Source.BoardId}&page={page}";
}

protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)
protected override void OnPageCrawl(IDocument document)
{
var thContent = document.QuerySelectorAll("thead tr th").Select(x => x.TextContent.Trim()).ToArray();
if (thContent.Any())
@@ -60,7 +33,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document)

protected override bool CanTwice() => false;

private void OnPageCrawlTable(AngleSharp.Html.Dom.IHtmlDocument document, string[] thContent)
private void OnPageCrawlTable(IDocument document, string[] thContent)
{
var tdAll = document.QuerySelectorAll("tbody tr td")
.Where(x => !string.IsNullOrEmpty(x.ClassName) && !x.ClassName.Contains("notice"));
@@ -88,7 +61,7 @@ private void OnPageCrawlTable(AngleSharp.Html.Dom.IHtmlDocument document, string
var href = UrlCompositeHref(tdHref[n]);
ConcurrentBag.Add(OnCrawlData(new CrawlingData
_ = OnCrawlData(new CrawlingData
{
Type = Source.Type,
BoardId = Source.BoardId,
Expand All @@ -101,11 +74,11 @@ private void OnPageCrawlTable(AngleSharp.Html.Dom.IHtmlDocument document, string
DateTime = date,
Href = href,
SourceId = Source.Id
}).Result);
});
});
}

private void OnPageCrawlList(AngleSharp.Html.Dom.IHtmlDocument document)
private void OnPageCrawlList(IDocument document)
{
var tdContent = document.QuerySelectorAll("ul li div")
.Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("li"))
@@ -161,7 +134,7 @@ private void OnPageCrawlList(AngleSharp.Html.Dom.IHtmlDocument document)
var href = UrlCompositeHref(hrefs[0]);
ConcurrentBag.Add(OnCrawlData(new CrawlingData
_ = OnCrawlData(new CrawlingData
{
Type = Source.Type,
BoardId = Source.BoardId,
Expand All @@ -173,7 +146,7 @@ private void OnPageCrawlList(AngleSharp.Html.Dom.IHtmlDocument document)
DateTime = date,
Href = href,
SourceId = Source.Id
}).Result);
});
});
}
}