Skip to content

Commit

Permalink
feat: with content
Browse files Browse the repository at this point in the history
  • Loading branch information
Soontao committed May 29, 2024
1 parent 26771c5 commit 681fe37
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 54 deletions.
11 changes: 6 additions & 5 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
version: "3"
services:
app:
build:
context: .
dockerfile: Dockerfile
search-api:
image: theosun/browser-search-api:latest
environment:
NODE_ENV: production
PW_REMOTE_URL: ws://browserless:3000?token=us5W3elErSluCvyCwMPzC0ntA
TF_URL: http://trafilatura-api:5000
restart: always
browserless:
image: ghcr.io/browserless/chromium
Expand All @@ -18,3 +16,6 @@ services:
- HEALTH=true
- QUEUED=20
restart: always
trafilatura-api:
image: theosun/trafilatura-api:latest
restart: always
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"prettier": "^3.2.5"
},
"dependencies": {
"@newdash/newdash": "^5.22.1",
"express": "^4.19.2",
"puppeteer-core": "^22.9.0",
"turndown": "^7.1.3",
Expand Down
60 changes: 11 additions & 49 deletions src/index.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ import express from "express";
import process from "process";
import puppeteer from "puppeteer-core";
import { fetch } from "undici";
import { asyncExpressMiddleware, defaultUserAgent } from "./utils.mjs";
import {
asyncExpressMiddleware,
createCommonSearchAPI,
defaultUserAgent,
} from "./utils.mjs";

if (process.env.PW_REMOTE_URL === undefined) {
console.error("PW_REMOTE_URL is required");
Expand Down Expand Up @@ -69,54 +73,12 @@ app.get(

app.get(
"/bing",
asyncExpressMiddleware(async (req, res) => {
const { search } = req.query;
const browser = await puppeteer.connect({
browserWSEndpoint: process.env.PW_REMOTE_URL,
});
const page = await browser.newPage();
await page.setUserAgent(defaultUserAgent());
await page.goto(`https://cn.bing.com/search?q=${search}`, {
waitUntil: "networkidle0",
timeout: 30_000,
referer: "https://cn.bing.com/",
});
const results = await page.$("#b_results");
const cards = await results.$$(".b_algo");

const refLinks = await Promise.all(
cards.map(async (card) => {
const item = await card.evaluate((node) => {
const linkEle = node.querySelector(".b_tpcn a");
if (!linkEle) return;
const link = linkEle.href;

// get text
const title = linkEle.innerText;
const description = node.querySelector(".tptxt")?.innerText;
return {
title,
link,
description,
};
});
if (!item?.link) return;
const res = await fetch(process.env.TF_URL + "/extract", {
method: "POST",
headers: {
"Content-Type": "application/json",
"User-Agent": defaultUserAgent(),
},
body: JSON.stringify({ url: item.link }),
});
if (!res.ok) return item;
const data = await res.json();
if (!data.text) return item;
return { ...item, text: data.text };
}),
);
await browser.close();
return res.json(refLinks.filter(Boolean));
createCommonSearchAPI({
urlPrefix: "https://cn.bing.com/search?q=",
resultsItemSelector: "#b_results .b_algo",
titleSelector: ".b_tpcn a",
linkSelector: ".b_tpcn a",
descriptionSelector: ".tptxt",
}),
);

Expand Down
67 changes: 67 additions & 0 deletions src/utils.mjs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import console from "console";
import process from "process";
import puppeteer from "puppeteer-core";
import { fetch } from "undici";

/**
*
Expand All @@ -14,6 +17,70 @@ export function asyncExpressMiddleware(fn) {
};
}

/**
 * Builds an Express handler that loads a search-engine results page in the
 * remote browser (`PW_REMOTE_URL`), pulls title / link / description out of
 * every result card, and then asks the extraction service (`TF_URL`) for the
 * text of each linked page.
 *
 * The handler reads the query from `req.query.search` and answers with a JSON
 * array of `{ title, link, description, text? }`. Cards without a link are
 * dropped. `text` is only present when the extraction service returned one.
 *
 * @param {{
 *   urlPrefix: string,
 *   resultsItemSelector: string,
 *   titleSelector: string,
 *   linkSelector: string,
 *   descriptionSelector: string,
 *   referer?: string
 * }} options
 *   `urlPrefix` is the search URL up to and including the query parameter
 *   (e.g. `https://cn.bing.com/search?q=`); the encoded search term is
 *   appended to it. `referer` is optional and defaults to the origin of
 *   `urlPrefix` followed by `/`.
 * @returns the handler wrapped by `asyncExpressMiddleware`
 */
export function createCommonSearchAPI(options) {
  // Derive the referer from the target engine instead of hard-coding one
  // engine's home page; for the Bing prefix this yields the same value as
  // before ("https://cn.bing.com/").
  const referer = options.referer ?? `${new URL(options.urlPrefix).origin}/`;

  // Only the selectors are shipped into the page context, so the argument
  // passed to `evaluate` stays a small serializable object.
  const selectors = {
    titleSelector: options.titleSelector,
    linkSelector: options.linkSelector,
    descriptionSelector: options.descriptionSelector,
  };

  return asyncExpressMiddleware(async (req, res) => {
    const { search } = req.query;
    // `req.query.search` may be missing, or an array/object when repeated.
    if (typeof search !== "string" || search.trim() === "") {
      return res
        .status(400)
        .json({ error: "query parameter 'search' is required" });
    }

    const browser = await puppeteer.connect({
      browserWSEndpoint: process.env.PW_REMOTE_URL,
    });

    let items;
    try {
      const page = await browser.newPage();
      await page.setUserAgent(defaultUserAgent());

      // Encode the term so characters such as `&`, `#` or `+` stay part of
      // the query instead of altering the URL structure.
      await page.goto(`${options.urlPrefix}${encodeURIComponent(search)}`, {
        waitUntil: "networkidle0",
        timeout: 30_000,
        referer,
      });

      const cards = await page.$$(options.resultsItemSelector);
      items = await Promise.all(
        cards.map((card) =>
          card.evaluate((node, sel) => {
            const link = node.querySelector(sel.linkSelector)?.href;
            const title = node.querySelector(sel.titleSelector)?.innerText;
            const description = node.querySelector(
              sel.descriptionSelector,
            )?.innerText;
            return { title, link, description };
          }, selectors),
        ),
      );
    } finally {
      // Always release the remote browser session, even when navigation or
      // scraping throws.
      await browser.close();
    }

    const searchResults = await Promise.all(
      items.filter((item) => item?.link).map(withExtractedText),
    );
    return res.json(searchResults);
  });
}

/**
 * Best-effort enrichment of a search result with the extracted page text.
 * Any failure (network error, non-2xx status, non-JSON body, empty text)
 * returns the item unchanged so one bad link cannot fail the whole request.
 *
 * @param {{ title?: string, link: string, description?: string }} item
 * @returns {Promise<object>} the item, with `text` added when available
 */
async function withExtractedText(item) {
  try {
    const extractResponse = await fetch(`${process.env.TF_URL}/extract`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "User-Agent": defaultUserAgent(),
      },
      body: JSON.stringify({ url: item.link }),
    });
    // Check the status before parsing: error responses may not be JSON.
    if (!extractResponse.ok) return item;
    const data = await extractResponse.json();
    return data?.text ? { ...item, text: data.text } : item;
  } catch (err) {
    console.warn(`text extraction failed for ${item.link}:`, err);
    return item;
  }
}

/**
 * Returns the fixed User-Agent string used by this module for outgoing
 * requests.
 *
 * @returns {string} the User-Agent header value
 */
export function defaultUserAgent() {
  const userAgent =
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36";
  return userAgent;
}

0 comments on commit 681fe37

Please sign in to comment.