From 681fe3706a39773296fe8370dedfe0175dcf2972 Mon Sep 17 00:00:00 2001 From: Theo Sun Date: Wed, 29 May 2024 21:16:05 +0800 Subject: [PATCH] feat: with content --- docker-compose.yaml | 11 ++++---- package.json | 1 + src/index.mjs | 60 ++++++++-------------------------------- src/utils.mjs | 67 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 54 deletions(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index 081b1e2..4a805d2 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,12 +1,10 @@ -version: "3" services: - app: - build: - context: . - dockerfile: Dockerfile + search-api: + image: theosun/browser-search-api:latest environment: NODE_ENV: production PW_REMOTE_URL: ws://browserless:3000?token=us5W3elErSluCvyCwMPzC0ntA + TF_URL: http://trafilatura-api:5000 restart: always browserless: image: ghcr.io/browserless/chromium @@ -18,3 +16,6 @@ services: - HEALTH=true - QUEUED=20 restart: always + trafilatura-api: + image: theosun/trafilatura-api:latest + restart: always diff --git a/package.json b/package.json index 9b28c52..232c03b 100644 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "prettier": "^3.2.5" }, "dependencies": { + "@newdash/newdash": "^5.22.1", "express": "^4.19.2", "puppeteer-core": "^22.9.0", "turndown": "^7.1.3", diff --git a/src/index.mjs b/src/index.mjs index 3bc9676..8bf2fa8 100644 --- a/src/index.mjs +++ b/src/index.mjs @@ -3,7 +3,11 @@ import express from "express"; import process from "process"; import puppeteer from "puppeteer-core"; import { fetch } from "undici"; -import { asyncExpressMiddleware, defaultUserAgent } from "./utils.mjs"; +import { + asyncExpressMiddleware, + createCommonSearchAPI, + defaultUserAgent, +} from "./utils.mjs"; if (process.env.PW_REMOTE_URL === undefined) { console.error("PW_REMOTE_URL is required"); @@ -69,54 +73,12 @@ app.get( app.get( "/bing", - asyncExpressMiddleware(async (req, res) => { - const { search } = req.query; - const browser = await puppeteer.connect({ - browserWSEndpoint: process.env.PW_REMOTE_URL, - }); - const page = await browser.newPage(); - await page.setUserAgent(defaultUserAgent()); - await page.goto(`https://cn.bing.com/search?q=${search}`, { - waitUntil: "networkidle0", - timeout: 30_000, - referer: "https://cn.bing.com/", - }); - const results = await page.$("#b_results"); - const cards = await results.$$(".b_algo"); - - const refLinks = await Promise.all( - cards.map(async (card) => { - const item = await card.evaluate((node) => { - const linkEle = node.querySelector(".b_tpcn a"); - if (!linkEle) return; - const link = linkEle.href; - - // get text - const title = linkEle.innerText; - const description = node.querySelector(".tptxt")?.innerText; - return { - title, - link, - description, - }; - }); - if (!item?.link) return; - const res = await fetch(process.env.TF_URL + "/extract", { - method: "POST", - headers: { - "Content-Type": "application/json", - "User-Agent": defaultUserAgent(), - }, - body: JSON.stringify({ url: item.link }), - }); - if (!res.ok) return item; - const data = await res.json(); - if (!data.text) return item; - return { ...item, text: data.text }; - }), - ); - await browser.close(); - return res.json(refLinks.filter(Boolean)); + createCommonSearchAPI({ + urlPrefix: "https://cn.bing.com/search?q=", + resultsItemSelector: "#b_results .b_algo", + titleSelector: ".b_tpcn a", + linkSelector: ".b_tpcn a", + descriptionSelector: ".tptxt", }), ); diff --git a/src/utils.mjs b/src/utils.mjs index 29de85f..62bf316 100644 --- a/src/utils.mjs +++ b/src/utils.mjs @@ -1,4 +1,7 @@ import console from "console"; +import process from "process"; +import puppeteer from "puppeteer-core"; +import { fetch } from "undici"; /** * @@ -14,6 +17,70 @@ export function asyncExpressMiddleware(fn) { }; } +/** + * + * @param {{ + * urlPrefix: string + * resultsItemSelector: string, + * titleSelector: string, + * linkSelector: string, + * descriptionSelector: string + * }} options + * @returns + */ +export function createCommonSearchAPI(options) { + return asyncExpressMiddleware(async (req, res) => { + const { search } = req.query; + const browser = await puppeteer.connect({ + browserWSEndpoint: process.env.PW_REMOTE_URL, + }); + const page = await browser.newPage(); + await page.setUserAgent(defaultUserAgent()); + + await page.goto(`${options.urlPrefix}${search}`, { + waitUntil: "networkidle0", + timeout: 30_000, + referer: "https://cn.bing.com/", + }); + + const cards = await page.$$(options.resultsItemSelector); + + const searchResults = await Promise.all( + cards.map(async (card) => { + const item = await card.evaluate((node, options) => { + const linkEle = node.querySelector(options.linkSelector); + const link = linkEle?.href; + const title = node.querySelector(options.titleSelector)?.innerText; + const description = node.querySelector( + options.descriptionSelector, + )?.innerText; + return { + title, + link, + description, + }; + }, options); + if (!item?.link) return; + const res = await fetch(process.env.TF_URL + "/extract", { + method: "POST", + headers: { + "Content-Type": "application/json", + "User-Agent": defaultUserAgent(), + }, + body: JSON.stringify({ url: item.link }), + }); + const data = await res.json(); + if (res.ok && data.text) { + return { ...item, text: data.text }; + } + return item; + }), + ); + await browser.close(); + return res.json(searchResults.filter(Boolean)); + }); +} + export function defaultUserAgent() { return "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"; }