From 4ab081511b328085d6c7c9990da2abc1a6e920e1 Mon Sep 17 00:00:00 2001
From: MohamedBassem
Date: Sun, 24 Mar 2024 01:00:24 +0000
Subject: [PATCH] docker: Use external chrome docker container

---
 .github/workflows/docker.yml  |  2 +-
 apps/workers/crawlerWorker.ts | 50 ++++++++++++++++++++++++++++-------
 docker/Dockerfile             |  9 -------
 docker/Dockerfile.dev         |  7 +----
 docker/docker-compose.dev.yml |  9 +++++++
 docker/docker-compose.yml     |  9 +++++++
 docker/start-chrome.sh        |  7 -----
 packages/shared/config.ts     |  1 +
 8 files changed, 61 insertions(+), 33 deletions(-)
 delete mode 100644 docker/start-chrome.sh

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 134842e5..72a82d33 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -49,6 +49,6 @@ jobs:
           target: ${{ matrix.package }}
           platforms: linux/amd64,linux/arm64
           push: true
-          tags: ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:${{github.event.release.name}},ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:release
+          tags: ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:${{ github.event.release.name }},ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:release
           cache-from: type=registry,ref=ghcr.io/mohamedbassem/hoarder-build-cache:${{ matrix.package }}
           cache-to: type=registry,mode=max,ref=ghcr.io/mohamedbassem/hoarder-build-cache:${{ matrix.package }}
diff --git a/apps/workers/crawlerWorker.ts b/apps/workers/crawlerWorker.ts
index eb4a0697..282f5f43 100644
--- a/apps/workers/crawlerWorker.ts
+++ b/apps/workers/crawlerWorker.ts
@@ -1,10 +1,11 @@
 import assert from "assert";
+import * as dns from "dns";
 import { Readability } from "@mozilla/readability";
 import { Mutex } from "async-mutex";
 import { Job, Worker } from "bullmq";
 import DOMPurify from "dompurify";
 import { eq } from "drizzle-orm";
-import { isShuttingDown, shutdownPromise } from "exit";
+import { isShuttingDown } from "exit";
 import { JSDOM } from "jsdom";
 import metascraper from "metascraper";
 import metascraperDescription from "metascraper-description";
@@ -50,11 +51,38 @@ const browserMutex = new Mutex();
 async function launchBrowser() {
   browser = undefined;
   await browserMutex.runExclusive(async () => {
-    browser = await puppeteer.launch({
-      headless: serverConfig.crawler.headlessBrowser,
-      executablePath: serverConfig.crawler.browserExecutablePath,
-      userDataDir: serverConfig.crawler.browserUserDataDir,
-    });
+    try {
+      if (serverConfig.crawler.browserWebUrl) {
+        logger.info(
+          `Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+        );
+        const webUrl = new URL(serverConfig.crawler.browserWebUrl);
+        // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
+        const { address } = await dns.promises.lookup(webUrl.hostname);
+        webUrl.hostname = address;
+        logger.info(
+          `Successfully resolved IP address, new address: ${webUrl.toString()}`,
+        );
+        browser = await puppeteer.connect({
+          browserURL: webUrl.toString(),
+        });
+      } else {
+        logger.info(`Launching a new browser instance`);
+        browser = await puppeteer.launch({
+          headless: serverConfig.crawler.headlessBrowser,
+          executablePath: serverConfig.crawler.browserExecutablePath,
+          userDataDir: serverConfig.crawler.browserUserDataDir,
+        });
+      }
+    } catch (e) {
+      logger.error(
+        `Failed to connect to the browser instance, will retry in 5 secs: ${e}`,
+      );
+      setTimeout(() => {
+        launchBrowser();
+      }, 5000);
+      return;
+    }
     browser.on("disconnected", async () => {
       if (isShuttingDown) {
         logger.info(
@@ -91,13 +119,15 @@ export class CrawlerWorker {
     );
 
     worker.on("completed", (job) => {
-      const jobId = job?.id || "unknown";
+      const jobId = job?.id ?? "unknown";
       logger.info(`[Crawler][${jobId}] Completed successfully`);
     });
 
     worker.on("failed", (job, error) => {
-      const jobId = job?.id || "unknown";
-      logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
+      const jobId = job?.id ?? "unknown";
+      logger.error(
+        `[Crawler][${jobId}] Crawling job failed: ${JSON.stringify(error)}`,
+      );
     });
 
     return worker;
@@ -160,7 +190,7 @@ async function crawlPage(url: string) {
 }
 
 async function runCrawler(job: Job) {
-  const jobId = job.id || "unknown";
+  const jobId = job.id ?? "unknown";
 
   const request = zCrawlLinkRequestSchema.safeParse(job.data);
   if (!request.success) {
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2164dc77..05432cbe 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -72,19 +72,10 @@ RUN --mount=type=cache,id=pnpm_workers,target=/pnpm/store pnpm deploy --node-lin
 FROM --platform=$BUILDPLATFORM node:21-alpine AS workers
 WORKDIR /app
 
-# Install chromium needed for puppeteer
-RUN apk add --no-cache chromium runuser
-ENV CHROME_PATH "/usr/bin/chromium-browser"
-ENV BROWSER_EXECUTABLE_PATH "/app/start-chrome.sh"
-ENV BROWSER_USER_DATA_DIR="/tmp/chrome"
-
 COPY --from=workers_builder /prod apps/workers
 
 RUN corepack enable
 
-ADD docker/start-chrome.sh .
-RUN chmod +x start-chrome.sh
-
 WORKDIR /app/apps/workers
 
 USER root
diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev
index 9a8de32b..cd15b20d 100644
--- a/docker/Dockerfile.dev
+++ b/docker/Dockerfile.dev
@@ -1,11 +1,6 @@
 FROM node:21-alpine
 
-RUN apk add --no-cache libc6-compat chromium runuser make g++ py3-pip linux-headers
+RUN apk add --no-cache libc6-compat make g++ py3-pip linux-headers
 ENV PUPPETEER_SKIP_DOWNLOAD true
-ENV CHROME_PATH "/usr/bin/chromium-browser"
-ENV BROWSER_EXECUTABLE_PATH "/bin/start-chrome.sh"
-ENV BROWSER_USER_DATA_DIR="/tmp/chrome"
 
 WORKDIR /app
-ADD start-chrome.sh /bin
-RUN chmod +x /bin/start-chrome.sh
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml
index d7cbbbf0..80547930 100644
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -23,6 +23,14 @@ services:
     image: redis:7.2-alpine
     volumes:
       - redis:/data
+  chrome:
+    image: gcr.io/zenika-hub/alpine-chrome:100
+    restart: unless-stopped
+    command:
+      - --no-sandbox
+      - --disable-gpu
+      - --remote-debugging-address=0.0.0.0
+      - --remote-debugging-port=9222
   meilisearch:
     image: getmeili/meilisearch:v1.6
     volumes:
@@ -37,6 +45,7 @@
     environment:
       REDIS_HOST: redis
       MEILI_ADDR: http://meilisearch:7700
+      BROWSER_WEB_URL: http://chrome:9222
       DATA_DIR: /data
       # OPENAI_API_KEY: ...
     command:
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 03cb5a82..51c564b8 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -18,6 +18,14 @@ services:
     restart: unless-stopped
     volumes:
       - redis:/data
+  chrome:
+    image: gcr.io/zenika-hub/alpine-chrome:100
+    restart: unless-stopped
+    command:
+      - --no-sandbox
+      - --disable-gpu
+      - --remote-debugging-address=0.0.0.0
+      - --remote-debugging-port=9222
   meilisearch:
     image: getmeili/meilisearch:v1.6
     restart: unless-stopped
@@ -35,6 +43,7 @@
     environment:
       REDIS_HOST: redis
       MEILI_ADDR: http://meilisearch:7700
+      BROWSER_WEB_URL: http://chrome:9222
       DATA_DIR: /data
       # OPENAI_API_KEY: ...
     depends_on:
diff --git a/docker/start-chrome.sh b/docker/start-chrome.sh
deleted file mode 100644
index 9f715906..00000000
--- a/docker/start-chrome.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/sh
-
-set -x;
-id -u chrome &>/dev/null || adduser -S chrome;
-mkdir -p $BROWSER_USER_DATA_DIR;
-chown chrome $BROWSER_USER_DATA_DIR;
-runuser -u chrome -- $CHROME_PATH --no-sandbox $@;
diff --git a/packages/shared/config.ts b/packages/shared/config.ts
index 25806ae0..e12c55c2 100644
--- a/packages/shared/config.ts
+++ b/packages/shared/config.ts
@@ -14,6 +14,7 @@ const serverConfig = {
     headlessBrowser: (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true",
     browserExecutablePath: process.env.BROWSER_EXECUTABLE_PATH, // If not set, the system's browser will be used
     browserUserDataDir: process.env.BROWSER_USER_DATA_DIR,
+    browserWebUrl: process.env.BROWSER_WEB_URL,
   },
   meilisearch: process.env.MEILI_ADDR
     ? {