docker: Use external chrome docker container
MohamedBassem committed Mar 24, 2024
1 parent f8bc4dd commit 4ab0815
Showing 8 changed files with 61 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker.yml
@@ -49,6 +49,6 @@ jobs:
           target: ${{ matrix.package }}
           platforms: linux/amd64,linux/arm64
           push: true
-          tags: ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:${{github.event.release.name}},ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:release
+          tags: ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:${{ github.event.release.name }},ghcr.io/mohamedbassem/hoarder-${{ matrix.package }}:release
           cache-from: type=registry,ref=ghcr.io/mohamedbassem/hoarder-build-cache:${{ matrix.package }}
           cache-to: type=registry,mode=max,ref=ghcr.io/mohamedbassem/hoarder-build-cache:${{ matrix.package }}
50 changes: 40 additions & 10 deletions apps/workers/crawlerWorker.ts
@@ -1,10 +1,11 @@
 import assert from "assert";
+import * as dns from "dns";
 import { Readability } from "@mozilla/readability";
 import { Mutex } from "async-mutex";
 import { Job, Worker } from "bullmq";
 import DOMPurify from "dompurify";
 import { eq } from "drizzle-orm";
-import { isShuttingDown, shutdownPromise } from "exit";
+import { isShuttingDown } from "exit";
 import { JSDOM } from "jsdom";
 import metascraper from "metascraper";
 import metascraperDescription from "metascraper-description";
@@ -50,11 +51,38 @@ const browserMutex = new Mutex();
 async function launchBrowser() {
   browser = undefined;
   await browserMutex.runExclusive(async () => {
-    browser = await puppeteer.launch({
-      headless: serverConfig.crawler.headlessBrowser,
-      executablePath: serverConfig.crawler.browserExecutablePath,
-      userDataDir: serverConfig.crawler.browserUserDataDir,
-    });
+    try {
+      if (serverConfig.crawler.browserWebUrl) {
+        logger.info(
+          `Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+        );
+        const webUrl = new URL(serverConfig.crawler.browserWebUrl);
+        // Chrome's debugging endpoint rejects Host headers that are not an
+        // IP address or localhost, so resolve the hostname to an IP first as
+        // a workaround for https://github.com/puppeteer/puppeteer/issues/2242
+        const { address } = await dns.promises.lookup(webUrl.hostname);
+        webUrl.hostname = address;
+        logger.info(
+          `Successfully resolved IP address, new address: ${webUrl.toString()}`,
+        );
+        browser = await puppeteer.connect({
+          browserURL: webUrl.toString(),
+        });
+      } else {
+        logger.info(`Launching a new browser instance`);
+        browser = await puppeteer.launch({
+          headless: serverConfig.crawler.headlessBrowser,
+          executablePath: serverConfig.crawler.browserExecutablePath,
+          userDataDir: serverConfig.crawler.browserUserDataDir,
+        });
+      }
+    } catch (e) {
+      logger.error(
+        "Failed to connect to the browser instance, will retry in 5 secs",
+      );
+      setTimeout(() => {
+        launchBrowser();
+      }, 5000);
+      return;
+    }
     browser.on("disconnected", async () => {
       if (isShuttingDown) {
         logger.info(
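
For context, here is a minimal standalone sketch of the connect path above. It assumes a Chrome container reachable at http://chrome:9222 (matching the compose files below) and the stock puppeteer package; the hostname is swapped for its resolved IP because Chrome rejects DevTools requests whose Host header is neither an IP address nor localhost:

import * as dns from "dns";
import puppeteer from "puppeteer";

// Sketch only: connect to a remote Chrome via its debugging URL, applying
// the same hostname-to-IP workaround for puppeteer issue #2242.
async function connectToRemoteChrome(rawUrl: string) {
  const webUrl = new URL(rawUrl);
  const { address } = await dns.promises.lookup(webUrl.hostname);
  webUrl.hostname = address;
  return puppeteer.connect({ browserURL: webUrl.toString() });
}

async function main() {
  const browser = await connectToRemoteChrome("http://chrome:9222");
  const page = await browser.newPage();
  await page.goto("https://example.com");
  console.log(await page.title());
  // disconnect() detaches from the shared browser without closing it,
  // which matters when several workers use the same container.
  await browser.disconnect();
}

main().catch(console.error);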
@@ -91,13 +119,15 @@ export class CrawlerWorker {
     );

     worker.on("completed", (job) => {
-      const jobId = job?.id || "unknown";
+      const jobId = job?.id ?? "unknown";
       logger.info(`[Crawler][${jobId}] Completed successfully`);
     });

     worker.on("failed", (job, error) => {
-      const jobId = job?.id || "unknown";
-      logger.error(`[Crawler][${jobId}] Crawling job failed: ${error}`);
+      const jobId = job?.id ?? "unknown";
+      logger.error(
+        `[Crawler][${jobId}] Crawling job failed: ${JSON.stringify(error)}`,
+      );
     });

     return worker;
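
One caveat about the failed handler above: JSON.stringify(error) produces "{}" for Error instances, because message and stack are non-enumerable properties. A hedged sketch of an alternative (serializeJobError is a hypothetical helper, not part of this commit):

// Hypothetical helper: keep message/stack visible in the log line, since
// JSON.stringify(new Error("boom")) yields "{}".
function serializeJobError(error: unknown): string {
  if (error instanceof Error) {
    return JSON.stringify({
      name: error.name,
      message: error.message,
      stack: error.stack,
    });
  }
  return JSON.stringify(error);
}

// Usage in the handler:
//   logger.error(`[Crawler][${jobId}] Crawling job failed: ${serializeJobError(error)}`);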
@@ -160,7 +190,7 @@ async function crawlPage(url: string) {
 }

 async function runCrawler(job: Job<ZCrawlLinkRequest, void>) {
-  const jobId = job.id || "unknown";
+  const jobId = job.id ?? "unknown";

   const request = zCrawlLinkRequestSchema.safeParse(job.data);
   if (!request.success) {
9 changes: 0 additions & 9 deletions docker/Dockerfile
@@ -72,19 +72,10 @@ RUN --mount=type=cache,id=pnpm_workers,target=/pnpm/store pnpm deploy --node-lin
 FROM --platform=$BUILDPLATFORM node:21-alpine AS workers
 WORKDIR /app

-# Install chromium needed for puppeteer
-RUN apk add --no-cache chromium runuser
-ENV CHROME_PATH "/usr/bin/chromium-browser"
-ENV BROWSER_EXECUTABLE_PATH "/app/start-chrome.sh"
-ENV BROWSER_USER_DATA_DIR="/tmp/chrome"
-
 COPY --from=workers_builder /prod apps/workers

 RUN corepack enable

-ADD docker/start-chrome.sh .
-RUN chmod +x start-chrome.sh
-
 WORKDIR /app/apps/workers

 USER root
7 changes: 1 addition & 6 deletions docker/Dockerfile.dev
@@ -1,11 +1,6 @@
 FROM node:21-alpine

-RUN apk add --no-cache libc6-compat chromium runuser make g++ py3-pip linux-headers
+RUN apk add --no-cache libc6-compat make g++ py3-pip linux-headers
 ENV PUPPETEER_SKIP_DOWNLOAD true
-ENV CHROME_PATH "/usr/bin/chromium-browser"
-ENV BROWSER_EXECUTABLE_PATH "/bin/start-chrome.sh"
-ENV BROWSER_USER_DATA_DIR="/tmp/chrome"

 WORKDIR /app
-ADD start-chrome.sh /bin
-RUN chmod +x /bin/start-chrome.sh
9 changes: 9 additions & 0 deletions docker/docker-compose.dev.yml
@@ -23,6 +23,14 @@ services:
     image: redis:7.2-alpine
     volumes:
       - redis:/data
+  chrome:
+    image: gcr.io/zenika-hub/alpine-chrome:100
+    restart: unless-stopped
+    command:
+      - --no-sandbox
+      - --disable-gpu
+      - --remote-debugging-address=0.0.0.0
+      - --remote-debugging-port=9222
   meilisearch:
     image: getmeili/meilisearch:v1.6
     volumes:
@@ -37,6 +45,7 @@ services:
     environment:
       REDIS_HOST: redis
       MEILI_ADDR: http://meilisearch:7700
+      BROWSER_WEB_URL: http://chrome:9222
       DATA_DIR: /data
       # OPENAI_API_KEY: ...
     command:
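
Because the workers now reach Chrome over the network instead of spawning it locally, a quick reachability check against the chrome service can save debugging time. A minimal sketch, assuming Node 18+ (global fetch) and the chrome:9222 address from the compose file above; /json/version is Chrome's standard DevTools HTTP endpoint, and the hostname is resolved to an IP first for the same Host-header reason as in the crawler:

import * as dns from "dns";

// Sketch only: probe the remote Chrome container's DevTools endpoint.
async function checkChrome(rawUrl = "http://chrome:9222") {
  const url = new URL(rawUrl);
  // Chrome rejects non-IP, non-localhost Host headers on this endpoint.
  url.hostname = (await dns.promises.lookup(url.hostname)).address;
  const res = await fetch(new URL("/json/version", url));
  if (!res.ok) {
    throw new Error(`Chrome debugging endpoint returned ${res.status}`);
  }
  const info = (await res.json()) as {
    Browser: string;
    webSocketDebuggerUrl: string;
  };
  console.log(`Reached ${info.Browser} at ${info.webSocketDebuggerUrl}`);
}

checkChrome().catch((e) => console.error("Chrome is not reachable:", e));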
9 changes: 9 additions & 0 deletions docker/docker-compose.yml
@@ -18,6 +18,14 @@ services:
     restart: unless-stopped
     volumes:
       - redis:/data
+  chrome:
+    image: gcr.io/zenika-hub/alpine-chrome:100
+    restart: unless-stopped
+    command:
+      - --no-sandbox
+      - --disable-gpu
+      - --remote-debugging-address=0.0.0.0
+      - --remote-debugging-port=9222
   meilisearch:
     image: getmeili/meilisearch:v1.6
     restart: unless-stopped
@@ -35,6 +43,7 @@ services:
     environment:
       REDIS_HOST: redis
       MEILI_ADDR: http://meilisearch:7700
+      BROWSER_WEB_URL: http://chrome:9222
       DATA_DIR: /data
       # OPENAI_API_KEY: ...
     depends_on:
7 changes: 0 additions & 7 deletions docker/start-chrome.sh

This file was deleted.

1 change: 1 addition & 0 deletions packages/shared/config.ts
@@ -14,6 +14,7 @@ const serverConfig = {
     headlessBrowser: (process.env.CRAWLER_HEADLESS_BROWSER ?? "true") == "true",
     browserExecutablePath: process.env.BROWSER_EXECUTABLE_PATH, // If not set, the system's browser will be used
     browserUserDataDir: process.env.BROWSER_USER_DATA_DIR,
+    browserWebUrl: process.env.BROWSER_WEB_URL,
   },
   meilisearch: process.env.MEILI_ADDR
     ? {
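
The new browserWebUrl option is passed through verbatim, so a malformed BROWSER_WEB_URL only surfaces on the first connect attempt. A small illustrative sketch (not part of this commit) that fails fast at startup instead:

// Illustrative only: new URL() throws on invalid input, so this turns a
// typo in BROWSER_WEB_URL into an immediate startup error.
function parseBrowserWebUrl(raw: string | undefined): string | undefined {
  if (!raw) {
    return undefined;
  }
  try {
    return new URL(raw).toString();
  } catch {
    throw new Error(`BROWSER_WEB_URL is not a valid URL: ${raw}`);
  }
}

// e.g. browserWebUrl: parseBrowserWebUrl(process.env.BROWSER_WEB_URL),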
