fix(workers): Increase default timeout to 60s, make it configurable and improve logging

Commit 4491831 (1 parent e0bb1fc)
MohamedBassem committed Apr 6, 2024
Showing 3 changed files with 29 additions and 11 deletions.
apps/workers/crawlerWorker.ts (21 additions, 11 deletions)
@@ -56,14 +56,14 @@ async function launchBrowser() {
   try {
     if (serverConfig.crawler.browserWebUrl) {
       logger.info(
-        `Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+        `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
       );
       const webUrl = new URL(serverConfig.crawler.browserWebUrl);
       // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
       const { address: address } = await dns.promises.lookup(webUrl.hostname);
       webUrl.hostname = address;
       logger.info(
-        `Successfully resolved IP address, new address: ${webUrl.toString()}`,
+        `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
       );
       browser = await puppeteer.connect({
         browserURL: webUrl.toString(),
@@ -76,7 +76,7 @@ async function launchBrowser() {
     }
   } catch (e) {
     logger.error(
-      "Failed to connect to the browser instance, will retry in 5 secs",
+      "[Crawler] Failed to connect to the browser instance, will retry in 5 secs",
     );
     setTimeout(() => {
       launchBrowser();
@@ -86,12 +86,12 @@ async function launchBrowser() {
   browser.on("disconnected", () => {
     if (isShuttingDown) {
       logger.info(
-        "The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
+        "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
       );
       return;
     }
     logger.info(
-      "The puppeteer browser got disconnected. Will attempt to launch it again.",
+      "[Crawler] The puppeteer browser got disconnected. Will attempt to launch it again.",
     );
     launchBrowser();
   });
@@ -111,7 +111,10 @@ export class CrawlerWorker {
     logger.info("Starting crawler worker ...");
     const worker = new Worker<ZCrawlLinkRequest, void>(
       LinkCrawlerQueue.name,
-      withTimeout(runCrawler, /* timeoutSec */ 30),
+      withTimeout(
+        runCrawler,
+        /* timeoutSec */ serverConfig.crawler.jobTimeoutSec,
+      ),
       {
         connection: queueConnectionDetails,
         autorun: false,
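The `withTimeout` helper wrapped around the processor is defined elsewhere in the repo and is not part of this diff. As a rough sketch only (hypothetical implementation, assuming a BullMQ-style `(job) => Promise` processor), a wrapper with this call shape could look like:

```ts
// Sketch, not the repo's actual helper: reject the job if the processor
// hasn't settled within `timeoutSec` seconds.
function withTimeout<TJob, TResult>(
  processor: (job: TJob) => Promise<TResult>,
  timeoutSec: number,
): (job: TJob) => Promise<TResult> {
  return (job: TJob) =>
    Promise.race([
      processor(job),
      new Promise<TResult>((_resolve, reject) =>
        setTimeout(
          () => reject(new Error(`Job timed out after ${timeoutSec} secs`)),
          timeoutSec * 1000,
        ),
      ),
    ]);
}
```

With the old hard-coded `30`, slow pages on low-powered devices could never finish; routing the value through `serverConfig.crawler.jobTimeoutSec` lets users raise it.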
@@ -125,9 +128,7 @@

     worker.on("failed", (job, error) => {
       const jobId = job?.id ?? "unknown";
-      logger.error(
-        `[Crawler][${jobId}] Crawling job failed: ${JSON.stringify(error)}`,
-      );
+      logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
     });

     return worker;
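The switch from `JSON.stringify(error)` to plain interpolation is more than cosmetic: `Error` objects have no enumerable own properties, so stringifying one produces an empty object and the log line loses the failure reason.

```ts
// Why the old log line printed nothing useful:
const err = new Error("boom");
JSON.stringify(err); // '{}' (message and stack are non-enumerable)
`${err}`; // 'Error: boom' (via Error.prototype.toString)
```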
@@ -161,7 +162,7 @@ function validateUrl(url: string) {
   }
 }

-async function crawlPage(url: string) {
+async function crawlPage(jobId: string, url: string) {
   assert(browser);
   const context = await browser.createBrowserContext();

@@ -171,6 +172,9 @@ async function crawlPage(url: string) {
     await page.goto(url, {
       timeout: 10000, // 10 seconds
     });
+    logger.info(
+      `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
+    );

     // Wait until there's at most two connections for 2 seconds
     // Attempt to wait only for 5 seconds
@@ -182,6 +186,8 @@
       new Promise((f) => setTimeout(f, 5000)),
     ]);

+    logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
+
     const htmlContent = await page.content();
     return htmlContent;
   } finally {
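The lines elided above this `Promise.race` are the wait described by the two comments: let the network go quiet, but never block for more than 5 seconds. A sketch of that pattern using puppeteer's `page.waitForNetworkIdle` (option values illustrative, not the verbatim source):

```ts
// Sketch only: cap the page-settle wait at 5 seconds.
await Promise.race([
  // Resolves once the network has been idle for 2 seconds. `timeout: 0`
  // disables waitForNetworkIdle's own deadline so the losing branch never
  // rejects after the race is decided.
  page.waitForNetworkIdle({ idleTime: 2000, timeout: 0 }),
  // Hard cap: move on after 5 seconds no matter what.
  new Promise((f) => setTimeout(f, 5000)),
]);
```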
@@ -208,12 +214,16 @@ async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
   );
   validateUrl(url);

-  const htmlContent = await crawlPage(url);
+  const htmlContent = await crawlPage(jobId, url);

+  logger.info(
+    `[Crawler][${jobId}] Will attempt to parse the content of the page ...`,
+  );
   const meta = await metascraperParser({
     url,
     html: htmlContent,
   });
+  logger.info(`[Crawler][${jobId}] Done parsing the content of the page.`);

   const window = new JSDOM("").window;
   const purify = DOMPurify(window);
docs/docs/03-configuration.md (6 additions, 0 deletions)
@@ -34,3 +34,9 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic tagging.
 | INFERENCE_TEXT_MODEL | No | gpt-3.5-turbo-0125 | The model to use for text inference. You'll need to change this to some other model if you're using ollama. |
 | INFERENCE_IMAGE_MODEL | No | gpt-4-vision-preview | The model to use for image inference. You'll need to change this to some other model if you're using ollama and that model needs to support vision APIs (e.g. llava). |
 | INFERENCE_LANG | No | english | The language in which the tags will be generated. |
+
+## Crawler Configs
+
+| Name | Required | Default | Description |
+| ----------------------- | -------- | ------- | ----------- |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to bump this up a bit. |
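To raise the new default, set the variable wherever the workers read their environment, e.g. in your `.env` file (the value below is just an example):

```
CRAWLER_JOB_TIMEOUT_SEC=120
```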
packages/shared/config.ts (2 additions, 0 deletions)
@@ -20,6 +20,7 @@ const allEnv = z.object({
   REDIS_DB_IDX: z.coerce.number().optional(),
   CRAWLER_HEADLESS_BROWSER: stringBool("true"),
   BROWSER_WEB_URL: z.string().url().optional(),
+  CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
   MEILI_ADDR: z.string().optional(),
   MEILI_MASTER_KEY: z.string().default(""),
   LOG_LEVEL: z.string().default("debug"),
@@ -56,6 +57,7 @@ const serverConfigSchema = allEnv.transform((val) => {
     crawler: {
       headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
       browserWebUrl: val.BROWSER_WEB_URL,
+      jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
     },
     meilisearch: val.MEILI_ADDR
       ? {
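Worth noting for anyone touching this schema: everything in `process.env` arrives as a string, which is why numeric settings go through zod's coercion (compare `REDIS_DB_IDX` above). A standalone sketch of that behavior, not repo code:

```ts
import { z } from "zod";

// z.coerce.number() runs Number() on the raw input, so the string "120"
// from the environment parses cleanly, and a missing variable falls back
// to the default.
const envSchema = z.object({
  CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
});

console.log(envSchema.parse({ CRAWLER_JOB_TIMEOUT_SEC: "120" }));
// -> { CRAWLER_JOB_TIMEOUT_SEC: 120 }
console.log(envSchema.parse({}));
// -> { CRAWLER_JOB_TIMEOUT_SEC: 60 }
```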
