From b971f43956a280f9e9dc02d2ac62b0050972ee37 Mon Sep 17 00:00:00 2001 From: elky Date: Fri, 5 Jul 2024 23:11:13 +0900 Subject: [PATCH] =?UTF-8?q?abot=EC=9D=84=20=EC=A0=9C=EA=B1=B0=ED=95=98?= =?UTF-8?q?=EA=B3=A0,=20=EC=A7=81=EC=A0=91=20HttpClient=EC=99=80=20AngleSh?= =?UTF-8?q?arp=EB=A5=BC=20=EC=93=B0=EB=8F=84=EB=A1=9D=20=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close #40 뽐뿌 크롤링 오류 수정 --- Cli/Program.cs | 3 - README.md | 5 +- WebCrawler/Crawler/ClienCrawler.cs | 11 +- WebCrawler/Crawler/CrawlerBase.cs | 114 +++++-------------- WebCrawler/Crawler/FmkoreaCrawler.cs | 49 ++------ WebCrawler/Crawler/HumorUnivCrawler.cs | 32 +----- WebCrawler/Crawler/InvenNewsCrawler.cs | 7 +- WebCrawler/Crawler/ItcmCrawler.cs | 3 +- WebCrawler/Crawler/PpomppuCrawler.cs | 29 ++--- WebCrawler/Crawler/RuliwebCrawler.cs | 6 +- WebCrawler/Crawler/SlrclubCrawler.cs | 13 +-- WebCrawler/Crawler/SlrclubPageInfoCrawler.cs | 9 +- WebCrawler/Crawler/TodayhumorCrawler.cs | 7 +- WebCrawler/WebCrawler.csproj | 2 +- web-crawler.sln.DotSettings | 5 - 15 files changed, 86 insertions(+), 209 deletions(-) delete mode 100644 web-crawler.sln.DotSettings diff --git a/Cli/Program.cs b/Cli/Program.cs index 36ac54f..5042e69 100644 --- a/Cli/Program.cs +++ b/Cli/Program.cs @@ -11,9 +11,6 @@ internal static class Program { private static async Task Main(string[] args) { - var provider = CodePagesEncodingProvider.Instance; - Encoding.RegisterProvider(provider); - Log.Logger = new LoggerConfiguration() .MinimumLevel.Warning() .WriteTo.Console() diff --git a/README.md b/README.md index 96c735a..5933a57 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [![Website](https://img.shields.io/website-up-down-green-red/http/shields.io.svg?label=elky-essay)](https://elky84.github.io) -![Made with](https://img.shields.io/badge/made%20with-.NET7-brightgreen.svg) +![Made with](https://img.shields.io/badge/made%20with-.NET8-brightgreen.svg) ![Made with](https://img.shields.io/badge/made%20with-JavaScript-blue.svg) ![Made with](https://img.shields.io/badge/made%20with-MongoDB-red.svg) @@ -16,8 +16,7 @@ # web-crawler -* .NET 7, ASP NET CORE 3를 기반으로 작성되었습니다. -* 웹 크롤러로는 [abot2](https://github.com/sjdirect/abot) 를 사용했습니다. +* .NET 8, ASP NET CORE 를 기반으로 작성되었습니다. * 크롤링 대상은 Source로 등록되어야 합니다. [예시](https://github.com/elky84/web-crawler/blob/master/Http/source.http) * Source로 등록된 게시판들은 테스트를 거쳐 크롤링 됨을 확인한 사이트와 게시판 들이지만, 규격이 달라져 추가적인 예외처리가 필요할 수 있습니다. * 알림은 Discord, Slack을 지원합니다. Notification 데이터를, Source와 매핑 시켜서 해당 Source에 새 데이터가 갱신되면 알림이 날라오게 되어있습니다. diff --git a/WebCrawler/Crawler/ClienCrawler.cs b/WebCrawler/Crawler/ClienCrawler.cs index 714edd4..4f95bff 100644 --- a/WebCrawler/Crawler/ClienCrawler.cs +++ b/WebCrawler/Crawler/ClienCrawler.cs @@ -4,24 +4,21 @@ using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; +using AngleSharp.Dom; using WebCrawler.Models; namespace WebCrawler.Crawler { - public class ClienCrawler : CrawlerBase + public class ClienCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) + : CrawlerBase(onCrawlDataDelegate, mongoDb, $"https://www.clien.net/service/board/{source.BoardId}", source) { - public ClienCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) : - base(onCrawlDataDelegate, mongoDb, $"https://www.clien.net/service/board/{source.BoardId}", source) - { - } - protected override string UrlComposite(int page) { // 페이지가 0부터 시작함 return $"{UrlBase}?od=T32&po={page - 1}"; } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var tdContent = document.QuerySelectorAll("div") .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("list_item") && x.ClassName.Contains("symph_row")) diff --git a/WebCrawler/Crawler/CrawlerBase.cs b/WebCrawler/Crawler/CrawlerBase.cs index b1aa9d9..347f50f 100644 --- a/WebCrawler/Crawler/CrawlerBase.cs +++ b/WebCrawler/Crawler/CrawlerBase.cs @@ -1,7 +1,4 @@ -using Abot2.Core; -using Abot2.Crawler; -using Abot2.Poco; -using AngleSharp.Html.Parser; +using AngleSharp.Html.Parser; using EzAspDotNet.Util; using EzMongoDb.Util; using MongoDB.Driver; @@ -11,6 +8,7 @@ using System.Linq; using System.Net; using System.Net.Http; +using System.Net.Http.Headers; using System.Runtime.InteropServices; using System.Text; using System.Threading; @@ -32,13 +30,9 @@ public abstract class CrawlerBase private readonly MongoDbUtil _mongoDbCrawlingData; private CrawlDataDelegate OnCrawlDataDelegate { get; set; } - - private HtmlParser _parser = new(); - + private int _executing; - - protected ConcurrentBag ConcurrentBag { get; } = []; - + protected CrawlerBase(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, string urlBase, Source source) { if (mongoDb != null) @@ -50,125 +44,71 @@ protected CrawlerBase(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mong UrlBase = urlBase; Source = source; } - - protected virtual CrawlConfiguration Config() - { - return new CrawlConfiguration - { - MaxConcurrentThreads = 10, - MaxPagesToCrawl = 1, - MaxPagesToCrawlPerDomain = 5, - MinRetryDelayInMilliseconds = 1000, - MinCrawlDelayPerDomainMilliSeconds = 5000, - IsForcedLinkParsingEnabled = true, - HttpProtocolVersion = HttpProtocolVersion.Version11, - UserAgentString = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - }; - } - - protected virtual PoliteWebCrawler Create() - { - var crawlerInstance = new PoliteWebCrawler(Config(), - null, - null, - null, - new PageRequester(Config(), new WebContentExtractor(), CreateHttpClient()), - null, - null, - null, - null); - crawlerInstance.PageCrawlStarting += ProcessPageCrawlStarting; - crawlerInstance.PageCrawlCompleted += ProcessPageCrawlCompleted; - crawlerInstance.PageCrawlDisallowed += PageCrawlDisallowed; - crawlerInstance.PageLinksCrawlDisallowed += PageLinksCrawlDisallowed; - return crawlerInstance; - } - private static HttpClient CreateHttpClient() { var client = new HttpClient(); - client.DefaultRequestHeaders.Add("Accept-Charset", "utf-8"); + client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); + + client.Timeout = TimeSpan.FromSeconds(30); + client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json")); return client; } public virtual async Task RunAsync() { - var crawlerInstance = Create(); - for (var page = Source.PageMin; page <= Source.PageMax; ++page) { - await ExecuteAsync(crawlerInstance, page); + await ExecuteAsync(page); Thread.Sleep(Source.Interval); } } protected virtual bool CanTwice() => true; - protected async Task ExecuteAsync(PoliteWebCrawler crawler, int page) + protected async Task ExecuteAsync(int page) { if (!CanTwice() && 0 != Interlocked.Exchange(ref _executing, 1)) return false; var builder = new UriBuilder(UrlComposite(page)); - var crawlResult = await crawler.CrawlAsync(builder.Uri); + await Crawling(builder.Uri); Interlocked.Exchange(ref _executing, 0); - return crawlResult.ErrorOccurred; - + return true; } protected abstract string UrlComposite(int page); - private void ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e) - { - ConcurrentBag.Clear(); - var pageToCrawl = e.PageToCrawl; - Log.Logger.Debug("About to crawl link {UriAbsoluteUri} which was found on page {ParentUriAbsoluteUri}", pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri); - } - - private void ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e) + private async Task Crawling(Uri uri) { - var crawledPage = e.CrawledPage; - if (crawledPage.HttpRequestException != null) + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + + using var client = CreateHttpClient(); + var response = await client.GetAsync(uri.AbsoluteUri); + if (response.StatusCode != HttpStatusCode.OK) { - Log.Logger.Error("Crawl of page failed. ", crawledPage.Uri?.AbsoluteUri, crawledPage.HttpRequestException?.Message); + Log.Logger.Error("response failed. ", uri.AbsoluteUri, response.StatusCode); return; } - - if (crawledPage.HttpResponseMessage?.StatusCode != HttpStatusCode.OK) + + var utf8String = await response.Content.ReadAsStringAsync(); + if (string.IsNullOrEmpty(utf8String)) { - Log.Logger.Error("Crawl of page failed. ", crawledPage.Uri?.AbsoluteUri, crawledPage.HttpResponseMessage?.StatusCode); + Log.Logger.Error("response content is null. ", uri.AbsoluteUri); return; } - if (string.IsNullOrEmpty(crawledPage.Content?.Text)) - { - Log.Logger.Error("Crawl of page failed. ", crawledPage.Uri?.AbsoluteUri, crawledPage.HttpResponseMessage?.Content); - return; - } + var context = BrowsingContext.New(Configuration.Default.WithDefaultLoader()); + var document = await context.OpenAsync(req => req.Content(utf8String)); - Log.Logger.Debug("Crawl of page succeeded {UriAbsoluteUri}", crawledPage.Uri?.AbsoluteUri); - - OnPageCrawl(crawledPage.AngleSharpHtmlDocument); + OnPageCrawl(document); } - protected abstract void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document); + protected abstract void OnPageCrawl(AngleSharp.Dom.IDocument document); protected virtual string UrlCompositeHref(string href) { return UrlBase.CutAndComposite("/", 0, 3, href); } - private static void PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e) - { - var crawledPage = e.CrawledPage; - Log.Logger.Error("Did not crawl the links on page {UriAbsoluteUri} due to {EDisallowedReason}", crawledPage.Uri.AbsoluteUri, e.DisallowedReason); - } - - private static void PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e) - { - var pageToCrawl = e.PageToCrawl; - Log.Logger.Error("Did not crawl page {UriAbsoluteUri} due to {EDisallowedReason}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason); - } - protected async Task OnCrawlData(CrawlingData crawlingData) { // 글자 뒤의 공백 날리기 diff --git a/WebCrawler/Crawler/FmkoreaCrawler.cs b/WebCrawler/Crawler/FmkoreaCrawler.cs index 664edd5..6c067d6 100644 --- a/WebCrawler/Crawler/FmkoreaCrawler.cs +++ b/WebCrawler/Crawler/FmkoreaCrawler.cs @@ -1,51 +1,24 @@ -using Abot2.Crawler; -using Abot2.Poco; -using EzAspDotNet.Util; +using EzAspDotNet.Util; using MongoDB.Driver; using Serilog; using System; using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; +using AngleSharp.Dom; using WebCrawler.Models; namespace WebCrawler.Crawler { - public class FmkoreaCrawler : CrawlerBase + public class FmkoreaCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) + : CrawlerBase(onCrawlDataDelegate, mongoDb, $"https://www.fmkorea.com/index.php", source) { - private static readonly Queue CrawlerQueue = new(); - - public FmkoreaCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) : - base(onCrawlDataDelegate, mongoDb, $"https://www.fmkorea.com/index.php", source) - { - foreach (var _ in Enumerable.Range(0, 5)) - CrawlerQueue.Enqueue(base.Create()); - } - - protected override CrawlConfiguration Config() - { - var config = base.Config(); - config.MaxPagesToCrawlPerDomain = 1; - config.MaxRobotsDotTextCrawlDelayInSeconds = 60; - config.MaxConcurrentThreads = 1; - config.MinRetryDelayInMilliseconds = 60000; - config.MinCrawlDelayPerDomainMilliSeconds = 60000; - return config; - } - - protected override PoliteWebCrawler Create() - { - var crawler = CrawlerQueue.Dequeue(); - CrawlerQueue.Enqueue(crawler); - return crawler; - } - protected override string UrlComposite(int page) { return $"{UrlBase}?mid={Source.BoardId}&page={page}"; } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var thContent = document.QuerySelectorAll("thead tr th").Select(x => x.TextContent.Trim()).ToArray(); if (thContent.Any()) @@ -60,7 +33,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) protected override bool CanTwice() => false; - private void OnPageCrawlTable(AngleSharp.Html.Dom.IHtmlDocument document, string[] thContent) + private void OnPageCrawlTable(IDocument document, string[] thContent) { var tdAll = document.QuerySelectorAll("tbody tr td") .Where(x => !string.IsNullOrEmpty(x.ClassName) && !x.ClassName.Contains("notice")); @@ -88,7 +61,7 @@ private void OnPageCrawlTable(AngleSharp.Html.Dom.IHtmlDocument document, string var href = UrlCompositeHref(tdHref[n]); - ConcurrentBag.Add(OnCrawlData(new CrawlingData + _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, @@ -101,11 +74,11 @@ private void OnPageCrawlTable(AngleSharp.Html.Dom.IHtmlDocument document, string DateTime = date, Href = href, SourceId = Source.Id - }).Result); + }); }); } - private void OnPageCrawlList(AngleSharp.Html.Dom.IHtmlDocument document) + private void OnPageCrawlList(IDocument document) { var tdContent = document.QuerySelectorAll("ul li div") .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("li")) @@ -161,7 +134,7 @@ private void OnPageCrawlList(AngleSharp.Html.Dom.IHtmlDocument document) var href = UrlCompositeHref(hrefs[0]); - ConcurrentBag.Add(OnCrawlData(new CrawlingData + _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, @@ -173,7 +146,7 @@ private void OnPageCrawlList(AngleSharp.Html.Dom.IHtmlDocument document) DateTime = date, Href = href, SourceId = Source.Id - }).Result); + }); }); } } diff --git a/WebCrawler/Crawler/HumorUnivCrawler.cs b/WebCrawler/Crawler/HumorUnivCrawler.cs index 9c1c02f..d51a428 100644 --- a/WebCrawler/Crawler/HumorUnivCrawler.cs +++ b/WebCrawler/Crawler/HumorUnivCrawler.cs @@ -1,34 +1,21 @@ -using Abot2.Crawler; -using AngleSharp; +using AngleSharp; using EzAspDotNet.Util; using MongoDB.Driver; using Serilog; using System; -using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; +using AngleSharp.Dom; using WebCrawler.Models; namespace WebCrawler.Crawler { - public class HumorUnivCrawler : CrawlerBase + public class HumorUnivCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) + : CrawlerBase(onCrawlDataDelegate, mongoDb, $"http://web.humoruniv.com/board/humor/list.html?table=", source) { - private static readonly Queue CrawlerQueue = new(); - - public HumorUnivCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) : - base(onCrawlDataDelegate, mongoDb, $"http://web.humoruniv.com/board/humor/list.html?table=", source) - { - foreach (var _ in Enumerable.Range(0, 5)) - CrawlerQueue.Enqueue(base.Create()); - } - protected override string UrlComposite(int page) { - if (page <= 1) - { - return $"{UrlBase}{Source.BoardId}"; - } - return $"{UrlBase}{Source.BoardId}&pg={page - 1}"; + return page <= 1 ? $"{UrlBase}{Source.BoardId}" : $"{UrlBase}{Source.BoardId}&pg={page - 1}"; } protected override string UrlCompositeHref(string href) @@ -36,14 +23,7 @@ protected override string UrlCompositeHref(string href) return UrlBase.CutAndComposite("/", 0, 5, "/" + href); } - protected override PoliteWebCrawler Create() - { - var crawler = CrawlerQueue.Dequeue(); - CrawlerQueue.Enqueue(crawler); - return crawler; - } - - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { if (document.Head.QuerySelector("meta")?.GetAttribute("http-equiv") == "refresh") { diff --git a/WebCrawler/Crawler/InvenNewsCrawler.cs b/WebCrawler/Crawler/InvenNewsCrawler.cs index ee51484..8800b71 100644 --- a/WebCrawler/Crawler/InvenNewsCrawler.cs +++ b/WebCrawler/Crawler/InvenNewsCrawler.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; +using AngleSharp.Dom; using WebCrawler.Models; namespace WebCrawler.Crawler @@ -16,7 +17,7 @@ protected override string UrlComposite(int page) return $"{UrlBase}?{Source.BoardId}&page={page}"; } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var tdContents = document.QuerySelectorAll("tbody tr") .Select(x => @@ -51,7 +52,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) var href = hrefs[0]; - ConcurrentBag.Add(OnCrawlData(new CrawlingData + _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, @@ -63,7 +64,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) DateTime = date, Href = href, SourceId = Source.Id - }).Result); + }); }); } } diff --git a/WebCrawler/Crawler/ItcmCrawler.cs b/WebCrawler/Crawler/ItcmCrawler.cs index a06ca6c..cf86db4 100644 --- a/WebCrawler/Crawler/ItcmCrawler.cs +++ b/WebCrawler/Crawler/ItcmCrawler.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; +using AngleSharp.Dom; using WebCrawler.Models; namespace WebCrawler.Crawler @@ -16,7 +17,7 @@ protected override string UrlComposite(int page) return $"{UrlBase}{Source.BoardId}&page={page}"; } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var tdContents = document.QuerySelectorAll("tbody tr") .Where(x => string.IsNullOrEmpty(x.ClassName) || x.ClassName != "notice") diff --git a/WebCrawler/Crawler/PpomppuCrawler.cs b/WebCrawler/Crawler/PpomppuCrawler.cs index 59761e5..558e2c1 100644 --- a/WebCrawler/Crawler/PpomppuCrawler.cs +++ b/WebCrawler/Crawler/PpomppuCrawler.cs @@ -11,13 +11,9 @@ namespace WebCrawler.Crawler { - public class PpomppuCrawler : CrawlerBase + public class PpomppuCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) + : CrawlerBase(onCrawlDataDelegate, mongoDb, $"https://www.ppomppu.co.kr/zboard/zboard.php", source) { - public PpomppuCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) : - base(onCrawlDataDelegate, mongoDb, $"https://www.ppomppu.co.kr/zboard/zboard.php", source) - { - } - protected override string UrlComposite(int page) { return $"{UrlBase}?id={Source.BoardId}&page={page}"; @@ -28,7 +24,7 @@ protected override string UrlCompositeHref(string href) return UrlBase.CutAndComposite("/", 0, 4, href); } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var cultureInfo = (CultureInfo)Thread.CurrentThread.CurrentCulture.Clone(); var calendar = cultureInfo.Calendar; @@ -36,14 +32,14 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) cultureInfo.DateTimeFormat.Calendar = calendar; var thContent = document.QuerySelectorAll("tbody tr") - .Where(x => x.Id == "headNotice") - .Select(x => x.QuerySelectorAll("td").Where(x => x.ClassName == "list_tspace")) + .Where(x => x.Id == "headNotice" || x.ClassName == "title_bg") + .Select(x => x.QuerySelectorAll("span, font")) .SelectMany(x => x.Select(y => y.TextContent.Trim())) .ToArray(); - + var tdContent = document.QuerySelectorAll("tbody tr") - .Where(x => x.ClassName == "baseList") - .Select(x => x.QuerySelectorAll("td").Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.Contains("list_vspace"))) + .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.StartsWith("baseList")) + .Select(x => x.QuerySelectorAll("td").Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.StartsWith("baseList-space"))) .SelectMany(x => x.Select(y => { var text = y.TextContent.Trim(); @@ -61,10 +57,9 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) .ToArray(); var tdHref = document.QuerySelectorAll("tbody tr") - .Where(x => x.ClassName == "baseList") + .Where(x => !string.IsNullOrEmpty(x.ClassName) && x.ClassName.StartsWith("baseList")) .Select(x => x.QuerySelectorAll("td a")) - .SelectMany(x => x.Where(y => y.QuerySelector("font") != null) - .Select(y => y.GetAttribute("href"))) + .SelectMany(x => x.Select(y => y.GetAttribute("href"))) .Where(x => x != "#") .ToArray(); @@ -95,7 +90,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) var href = UrlCompositeHref("/" + tdHref[n]); - ConcurrentBag.Add(OnCrawlData(new CrawlingData + _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, @@ -108,7 +103,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) DateTime = date, Href = href, SourceId = Source.Id - }).Result); + }); }); } } diff --git a/WebCrawler/Crawler/RuliwebCrawler.cs b/WebCrawler/Crawler/RuliwebCrawler.cs index 05a3c58..8f49cea 100644 --- a/WebCrawler/Crawler/RuliwebCrawler.cs +++ b/WebCrawler/Crawler/RuliwebCrawler.cs @@ -20,7 +20,7 @@ protected override string UrlComposite(int page) return $"{UrlBase}?page={page}"; } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var cultureInfo = (CultureInfo)Thread.CurrentThread.CurrentCulture.Clone(); var calendar = cultureInfo.Calendar; @@ -79,7 +79,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) var href = tdHref[n].GetAttribute("href"); - ConcurrentBag.Add(OnCrawlData(new CrawlingData + _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, @@ -92,7 +92,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) DateTime = date.GetValueOrDefault(DateTime.Now), Href = href, SourceId = Source.Id - }).Result); + }); }); } else diff --git a/WebCrawler/Crawler/SlrclubCrawler.cs b/WebCrawler/Crawler/SlrclubCrawler.cs index e06eea9..969087c 100644 --- a/WebCrawler/Crawler/SlrclubCrawler.cs +++ b/WebCrawler/Crawler/SlrclubCrawler.cs @@ -5,6 +5,7 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; +using AngleSharp.Dom; using WebCrawler.Models; namespace WebCrawler.Crawler @@ -21,8 +22,6 @@ protected override string UrlComposite(int page) public override async Task RunAsync() { - var crawlerInstance = Create(); - var pageInfoCrawler = new SlrclubPageInfoCrawler(null, null, Source); await pageInfoCrawler.RunAsync(); @@ -39,12 +38,12 @@ public override async Task RunAsync() for (var page = Source.PageMin; page <= Source.PageMax; ++page) { - await ExecuteAsync(crawlerInstance, page); + await ExecuteAsync(page); Thread.Sleep(Source.Interval); } } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var thContent = document.QuerySelectorAll("thead tr th") .Select(x => x.TextContent.Trim()).ToArray(); @@ -58,7 +57,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) .Select(x => x.QuerySelector("a").GetAttribute("href")) .ToArray(); - if (!thContent.Any() || !tdContent.Any()) + if (thContent.Length == 0 || tdContent.Length == 0) { Log.Error("Parsing Failed DOM. Not has thContent or tdContent {UrlComposite}", UrlComposite(1)); return; @@ -76,7 +75,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) var href = UrlCompositeHref(tdHref[n]); - ConcurrentBag.Add(OnCrawlData(new CrawlingData + _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, @@ -89,7 +88,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) DateTime = date, Href = href, SourceId = Source.Id - }).Result); + }); }); } } diff --git a/WebCrawler/Crawler/SlrclubPageInfoCrawler.cs b/WebCrawler/Crawler/SlrclubPageInfoCrawler.cs index 8da732a..3c159e0 100644 --- a/WebCrawler/Crawler/SlrclubPageInfoCrawler.cs +++ b/WebCrawler/Crawler/SlrclubPageInfoCrawler.cs @@ -2,6 +2,7 @@ using MongoDB.Driver; using System.Linq; using System.Threading.Tasks; +using AngleSharp.Dom; using WebCrawler.Models; namespace WebCrawler.Crawler @@ -9,7 +10,7 @@ namespace WebCrawler.Crawler public class SlrclubPageInfoCrawler(CrawlDataDelegate onCrawlDataDelegate, IMongoDatabase mongoDb, Source source) : CrawlerBase(onCrawlDataDelegate, mongoDb, $"http://www.slrclub.com/bbs/zboard.php", source) { - public static int? LatestPage { get; set; } + public static int? LatestPage { get; private set; } protected override string UrlComposite(int page) { @@ -18,13 +19,11 @@ protected override string UrlComposite(int page) public override async Task RunAsync() { - var crawlerInstance = Create(); - // 전체 페이지를 알아오기 위한 SlrClub용 우회이므로, 그냥 1페이지를 호출한다. - await ExecuteAsync(crawlerInstance, 1); + await ExecuteAsync(1); } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var tdContent = document.QuerySelectorAll("tbody tr td table tbody tr td span").Select(x => x.TextContent.Trim()).ToArray(); if (tdContent.Length == 0) diff --git a/WebCrawler/Crawler/TodayhumorCrawler.cs b/WebCrawler/Crawler/TodayhumorCrawler.cs index 99abfbb..040f1c7 100644 --- a/WebCrawler/Crawler/TodayhumorCrawler.cs +++ b/WebCrawler/Crawler/TodayhumorCrawler.cs @@ -6,6 +6,7 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; +using AngleSharp.Dom; using WebCrawler.Models; namespace WebCrawler.Crawler @@ -18,7 +19,7 @@ protected override string UrlComposite(int page) return $"{UrlBase}?table={Source.BoardId}&page={page}"; } - protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) + protected override void OnPageCrawl(IDocument document) { var thContent = document.QuerySelectorAll("thead tr th") .Select(x => x.TextContent.Trim()) @@ -58,7 +59,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) var href = UrlCompositeHref(tdHref[n]); - ConcurrentBag.Add(OnCrawlData(new CrawlingData + _ = OnCrawlData(new CrawlingData { Type = Source.Type, BoardId = Source.BoardId, @@ -71,7 +72,7 @@ protected override void OnPageCrawl(AngleSharp.Html.Dom.IHtmlDocument document) DateTime = date, Href = href, SourceId = Source.Id - }).Result); + }); }); } } diff --git a/WebCrawler/WebCrawler.csproj b/WebCrawler/WebCrawler.csproj index 5e05089..d260067 100644 --- a/WebCrawler/WebCrawler.csproj +++ b/WebCrawler/WebCrawler.csproj @@ -5,7 +5,7 @@ - + diff --git a/web-crawler.sln.DotSettings b/web-crawler.sln.DotSettings deleted file mode 100644 index 825520e..0000000 --- a/web-crawler.sln.DotSettings +++ /dev/null @@ -1,5 +0,0 @@ - - True - True - True - True \ No newline at end of file