From 1f4645d855b842226d222ff402272dc58328502c Mon Sep 17 00:00:00 2001 From: "Karel A. Kroeze" Date: Mon, 9 Oct 2023 14:57:25 +0200 Subject: [PATCH] improve event scraper code --- _utils/calendar_sync/package-lock.json | 16 +++++++++++ _utils/calendar_sync/package.json | 1 + _utils/calendar_sync/ut-events-scraper.js | 34 +++++++++++++++++------ 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/_utils/calendar_sync/package-lock.json b/_utils/calendar_sync/package-lock.json index 8598a83f1..42ae1a92d 100644 --- a/_utils/calendar_sync/package-lock.json +++ b/_utils/calendar_sync/package-lock.json @@ -11,6 +11,7 @@ "dependencies": { "@types/node": "^20.5.1", "dayjs": "^1.11.10", + "html-entities": "^2.4.0", "ical.js": "^1.5.0", "node-html-parser": "^6.1.10", "yaml": "^2.3.1" @@ -127,6 +128,21 @@ "he": "bin/he" } }, + "node_modules/html-entities": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.4.0.tgz", + "integrity": "sha512-igBTJcNNNhvZFRtm8uA6xMY6xYleeDwn3PeBCkDz7tHttv4F2hsDI2aPgNERWzvRcNYHNT3ymRaQzllmXj4YsQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/mdevils" + }, + { + "type": "patreon", + "url": "https://patreon.com/mdevils" + } + ] + }, "node_modules/ical.js": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/ical.js/-/ical.js-1.5.0.tgz", diff --git a/_utils/calendar_sync/package.json b/_utils/calendar_sync/package.json index efbf0a44a..f60edd698 100644 --- a/_utils/calendar_sync/package.json +++ b/_utils/calendar_sync/package.json @@ -12,6 +12,7 @@ "dependencies": { "@types/node": "^20.5.1", "dayjs": "^1.11.10", + "html-entities": "^2.4.0", "ical.js": "^1.5.0", "node-html-parser": "^6.1.10", "yaml": "^2.3.1" diff --git a/_utils/calendar_sync/ut-events-scraper.js b/_utils/calendar_sync/ut-events-scraper.js index 0b1c4b18a..c42cc01f5 100644 --- a/_utils/calendar_sync/ut-events-scraper.js +++ b/_utils/calendar_sync/ut-events-scraper.js @@ -1,6 +1,7 @@ const fs = require("fs/promises"); const yaml = require("yaml"); const { parse } = require("node-html-parser"); +const { decode } = require("html-entities"); const dayjs = require("dayjs"); const customParseFormat = require("dayjs/plugin/customParseFormat"); @@ -15,11 +16,14 @@ dayjs.extend(customParseFormat); dayjs.tz.setDefault(TIMEZONE); async function getEvents(url, source, out) { - const response = await fetch(url); + const response = await fetch(url, { + headers: { "content-encoding": "utf8" }, + }); const body = await response.text(); + const root = parse(body); let events = root - .querySelectorAll("li.summary__introblock") + .querySelectorAll("li.summary__introblock, .summary__item") .map(async (li) => { const event_url = li.querySelector("a.summary__link").attributes["href"]; const img_url = new URL( @@ -29,17 +33,17 @@ async function getEvents(url, source, out) { const title = li.querySelector(".summary__title").innerText.trim(); const description = li .querySelector(".summary__description") - .innerText.trim(); - const date = li.querySelector(".summary__date").innerText.trim(); - const [startDate, endDate] = await getEventTimes(event_url); + ?.innerText.trim(); + // const date = li.querySelector(".summary__date, .summary__meta .date")?.innerText.trim(); + const [startDate, endDate, _description] = await getEventTimes(event_url); return { - title, - description, + title: decode(title), + description: decode(description ?? _description), source, url: event_url, img_url: img_url.href, - date, + // date, start: startDate, end: endDate, }; @@ -60,6 +64,12 @@ async function getEvents(url, source, out) { event.end = event.end.format(); } + // filter out duplicates + evaluated_events = evaluated_events.filter( + (ev, index, events) => + index == events.findIndex((other_ev) => ev.title == other_ev.title) + ); + // write to outfile await fs.writeFile(out, yaml.stringify(evaluated_events)); } @@ -73,8 +83,14 @@ async function getEventTimes(url) { .querySelector("div.addeventatc span.start") .innerText.trim(); const end = root.querySelector("div.addeventatc span.end").innerText.trim(); + const desc = + root.querySelector(".contentpart__main p")?.innerText.trim() ?? ""; - return [dayjs(start, "DD/MM/YYYY HH:mm"), dayjs(end, "DD/MM/YYYY HH:mm")]; + return [ + dayjs(start, "DD/MM/YYYY HH:mm"), + dayjs(end, "DD/MM/YYYY HH:mm"), + desc, + ]; } function main() {