diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index a969ab86..cce409e5 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -56,14 +56,14 @@ async function launchBrowser() {
     try {
       if (serverConfig.crawler.browserWebUrl) {
         logger.info(
-          `Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+          `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
         );
         const webUrl = new URL(serverConfig.crawler.browserWebUrl);
         // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
         const { address: address } = await dns.promises.lookup(webUrl.hostname);
         webUrl.hostname = address;
         logger.info(
-          `Successfully resolved IP address, new address: ${webUrl.toString()}`,
+          `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
         );
         browser = await puppeteer.connect({
           browserURL: webUrl.toString(),
@@ -76,7 +76,7 @@ async function launchBrowser() {
       }
     } catch (e) {
       logger.error(
-        "Failed to connect to the browser instance, will retry in 5 secs",
+        "[Crawler] Failed to connect to the browser instance, will retry in 5 secs",
       );
       setTimeout(() => {
         launchBrowser();
@@ -86,12 +86,12 @@ async function launchBrowser() {
     browser.on("disconnected", () => {
       if (isShuttingDown) {
         logger.info(
-          "The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
+          "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
         );
         return;
       }
       logger.info(
-        "The puppeteer browser got disconnected. Will attempt to launch it again.",
+        "[Crawler] The puppeteer browser got disconnected. Will attempt to launch it again.",
       );
       launchBrowser();
     });
@@ -111,7 +111,10 @@ export class CrawlerWorker {
     logger.info("Starting crawler worker ...");
     const worker = new Worker(
       LinkCrawlerQueue.name,
-      withTimeout(runCrawler, /* timeoutSec */ 30),
+      withTimeout(
+        runCrawler,
+        /* timeoutSec */ serverConfig.crawler.jobTimeoutSec,
+      ),
       {
         connection: queueConnectionDetails,
         autorun: false,
@@ -125,9 +128,7 @@ export class CrawlerWorker {
 
     worker.on("failed", (job, error) => {
       const jobId = job?.id ?? "unknown";
-      logger.error(
-        `[Crawler][${jobId}] Crawling job failed: ${JSON.stringify(error)}`,
-      );
+      logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
     });
 
     return worker;
@@ -161,7 +162,7 @@ function validateUrl(url: string) {
   }
 }
 
-async function crawlPage(url: string) {
+async function crawlPage(jobId: string, url: string) {
   assert(browser);
   const context = await browser.createBrowserContext();
 
@@ -171,6 +172,9 @@ async function crawlPage(url: string) {
     await page.goto(url, {
       timeout: 10000, // 10 seconds
     });
+    logger.info(
+      `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
+    );
 
     // Wait until there's at most two connections for 2 seconds
     // Attempt to wait only for 5 seconds
@@ -182,6 +186,8 @@ async function crawlPage(url: string) {
       new Promise((f) => setTimeout(f, 5000)),
     ]);
 
+    logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
+
     const htmlContent = await page.content();
     return htmlContent;
   } finally {
@@ -208,12 +214,16 @@ async function runCrawler(job: Job) {
   );
 
   validateUrl(url);
-  const htmlContent = await crawlPage(url);
+  const htmlContent = await crawlPage(jobId, url);
 
+  logger.info(
+    `[Crawler][${jobId}] Will attempt to parse the content of the page ...`,
+  );
   const meta = await metascraperParser({
     url,
     html: htmlContent,
   });
+  logger.info(`[Crawler][${jobId}] Done parsing the content of the page.`);
 
   const window = new JSDOM("").window;
   const purify = DOMPurify(window);
diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md
index 8bf8a069..1307bcfd 100644
--- a/docs/docs/03-configuration.md
+++ b/docs/docs/03-configuration.md
@@ -34,3 +34,9 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic taggin
 | INFERENCE_TEXT_MODEL  | No       | gpt-3.5-turbo-0125   | The model to use for text inference. You'll need to change this to some other model if you're using ollama.                                                           |
 | INFERENCE_IMAGE_MODEL | No       | gpt-4-vision-preview | The model to use for image inference. You'll need to change this to some other model if you're using ollama and that model needs to support vision APIs (e.g. llava). |
 | INFERENCE_LANG        | No       | english              | The language in which the tags will be generated.                                                                                                                      |
+
+## Crawler Configs
+
+| Name                    | Required | Default | Description                                                                                                                                                               |
+| ----------------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| CRAWLER_JOB_TIMEOUT_SEC | No       | 60      | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to bump this up a bit.  |
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 11140c3b..75274a4e 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -20,6 +20,7 @@ const allEnv = z.object({
   REDIS_DB_IDX: z.coerce.number().optional(),
   CRAWLER_HEADLESS_BROWSER: stringBool("true"),
   BROWSER_WEB_URL: z.string().url().optional(),
+  CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
   MEILI_ADDR: z.string().optional(),
   MEILI_MASTER_KEY: z.string().default(""),
   LOG_LEVEL: z.string().default("debug"),
@@ -56,6 +57,7 @@ const serverConfigSchema = allEnv.transform((val) => {
     crawler: {
       headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
       browserWebUrl: val.BROWSER_WEB_URL,
+      jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
     },
     meilisearch: val.MEILI_ADDR
       ? {
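For context on the `withTimeout(runCrawler, ...)` call above: the wrapper itself is not part of this diff and its real implementation lives elsewhere in the repo. A minimal sketch of such a wrapper, assuming it simply races the BullMQ processor against a timer, might look like this (the signature and error message are illustrative only):

```ts
import { Job } from "bullmq";

// Hypothetical sketch of a timeout wrapper for a BullMQ processor.
// Rejects if the wrapped processor does not settle within `timeoutSec` seconds.
function withTimeout<T>(
  processor: (job: Job) => Promise<T>,
  timeoutSec: number,
): (job: Job) => Promise<T> {
  return (job: Job) =>
    Promise.race([
      processor(job),
      new Promise<T>((_resolve, reject) =>
        setTimeout(
          () => reject(new Error(`Job ${job.id} timed out after ${timeoutSec} secs`)),
          timeoutSec * 1000,
        ),
      ),
    ]);
}
```

With the change above, the value handed to this wrapper is no longer a hard-coded 30 seconds but `serverConfig.crawler.jobTimeoutSec`, which defaults to 60.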
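Likewise, a small self-contained sketch (not the repo's code) of how the new `CRAWLER_JOB_TIMEOUT_SEC` variable is parsed: values from `process.env` are always strings, which is why the schema line uses zod's `coerce` (matching `REDIS_DB_IDX` just above it) and falls back to 60 when the variable is unset:

```ts
import { z } from "zod";

// Standalone illustration of the schema line added in packages/shared/config.ts.
const crawlerEnv = z.object({
  CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
});

// Unset variable -> default of 60 seconds.
console.log(crawlerEnv.parse({}).CRAWLER_JOB_TIMEOUT_SEC); // 60

// Environment values are strings; coercion turns "90" into the number 90.
console.log(
  crawlerEnv.parse({ CRAWLER_JOB_TIMEOUT_SEC: "90" }).CRAWLER_JOB_TIMEOUT_SEC,
); // 90
```

The transformed `serverConfig.crawler.jobTimeoutSec` is then what the worker passes into `withTimeout`.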