Skip to content

Commit

Permalink
improve event scraper code
Browse files Browse the repository at this point in the history
  • Loading branch information
Karel-Kroeze committed Oct 9, 2023
1 parent eceea29 commit 1f4645d
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 9 deletions.
16 changes: 16 additions & 0 deletions _utils/calendar_sync/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions _utils/calendar_sync/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"dependencies": {
"@types/node": "^20.5.1",
"dayjs": "^1.11.10",
"html-entities": "^2.4.0",
"ical.js": "^1.5.0",
"node-html-parser": "^6.1.10",
"yaml": "^2.3.1"
Expand Down
34 changes: 25 additions & 9 deletions _utils/calendar_sync/ut-events-scraper.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
const fs = require("fs/promises");
const yaml = require("yaml");
const { parse } = require("node-html-parser");
const { decode } = require("html-entities");

const dayjs = require("dayjs");
const customParseFormat = require("dayjs/plugin/customParseFormat");
Expand All @@ -15,11 +16,14 @@ dayjs.extend(customParseFormat);
dayjs.tz.setDefault(TIMEZONE);

async function getEvents(url, source, out) {
const response = await fetch(url);
const response = await fetch(url, {
headers: { "content-encoding": "utf8" },
});
const body = await response.text();

const root = parse(body);
let events = root
.querySelectorAll("li.summary__introblock")
.querySelectorAll("li.summary__introblock, .summary__item")
.map(async (li) => {
const event_url = li.querySelector("a.summary__link").attributes["href"];
const img_url = new URL(
Expand All @@ -29,17 +33,17 @@ async function getEvents(url, source, out) {
const title = li.querySelector(".summary__title").innerText.trim();
const description = li
.querySelector(".summary__description")
.innerText.trim();
const date = li.querySelector(".summary__date").innerText.trim();
const [startDate, endDate] = await getEventTimes(event_url);
?.innerText.trim();
// const date = li.querySelector(".summary__date, .summary__meta .date")?.innerText.trim();
const [startDate, endDate, _description] = await getEventTimes(event_url);

return {
title,
description,
title: decode(title),
description: decode(description ?? _description),
source,
url: event_url,
img_url: img_url.href,
date,
// date,
start: startDate,
end: endDate,
};
Expand All @@ -60,6 +64,12 @@ async function getEvents(url, source, out) {
event.end = event.end.format();
}

// filter out duplicates: an event is kept only when it is the first entry
// in the list carrying its title, so later events with an already-seen
// title are dropped. Only `title` is compared; other fields are ignored.
evaluated_events = evaluated_events.filter(
(ev, index, events) =>
index == events.findIndex((other_ev) => ev.title == other_ev.title)
);

// write to outfile
await fs.writeFile(out, yaml.stringify(evaluated_events));
}
Expand All @@ -73,8 +83,14 @@ async function getEventTimes(url) {
.querySelector("div.addeventatc span.start")
.innerText.trim();
const end = root.querySelector("div.addeventatc span.end").innerText.trim();
// Trimmed text of the element matched by ".contentpart__main p" on the
// event page; falls back to "" when no such element is found.
const desc =
root.querySelector(".contentpart__main p")?.innerText.trim() ?? "";

return [dayjs(start, "DD/MM/YYYY HH:mm"), dayjs(end, "DD/MM/YYYY HH:mm")];
return [
dayjs(start, "DD/MM/YYYY HH:mm"),
dayjs(end, "DD/MM/YYYY HH:mm"),
desc,
];
}

function main() {
Expand Down

0 comments on commit 1f4645d

Please sign in to comment.