From 6dca8b7d556bc2741072d49d5974dc698bd29f94 Mon Sep 17 00:00:00 2001 From: Theo Sun Date: Wed, 29 May 2024 23:08:44 +0800 Subject: [PATCH] chore: better support for bing --- .vscode/settings.json | 3 ++- src/index.mjs | 41 +++++++++++++++++++++++++++++++++++++---- src/utils.mjs | 43 +++++++++++++++++++++++++------------------ 3 files changed, 64 insertions(+), 23 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index b90e0f6..389c87a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,6 +7,7 @@ "source.organizeImports": "explicit" }, "cSpell.words": [ - "Sogou" + "Sogou", + "weixin" ] } \ No newline at end of file diff --git a/src/index.mjs b/src/index.mjs index bc749b0..b2afd11 100644 --- a/src/index.mjs +++ b/src/index.mjs @@ -21,14 +21,47 @@ app.get( }), ); +app.get( + "/weixin", + createCommonSearchAPI({ + urlPrefix: "https://weixin.sogou.com/weixin?type=2&query=", + resultsItemSelector: ".news-box .news-list li", + titleSelector: "h3 a", + linkSelector: "h3 a", + descriptionSelector: ".txt-info", + }), +); + +app.get( + "/zhihu", + createCommonSearchAPI({ + urlPrefix: "https://sogou.com/web?insite=zhihu.com&query=", + resultsItemSelector: ".results .vrwrap", + titleSelector: "h3 a", + linkSelector: "h3 a", + descriptionSelector: ".str-text-info", + }), +); + +app.get( + "/baike", + createCommonSearchAPI({ + urlPrefix: "https://sogou.com/web?insite=baike.baidu.com&query=", + resultsItemSelector: ".results .vrwrap", + titleSelector: "h3 a", + linkSelector: "h3 a", + descriptionSelector: ".str-text-info", + }), +); + app.get( "/bing", createCommonSearchAPI({ urlPrefix: "https://cn.bing.com/search?q=", - resultsItemSelector: "#b_results .b_algo", - titleSelector: ".b_tpcn a", - linkSelector: ".b_tpcn a", - descriptionSelector: ".tptxt", + resultsItemSelector: "#b_content #b_results li.b_algo", + titleSelector: "h2 a", + linkSelector: "h2 a", + descriptionSelector: ".b_caption, p.b_algoSlug", }), ); diff --git a/src/utils.mjs b/src/utils.mjs index 2fb6318..ffde0be 100644 --- a/src/utils.mjs +++ b/src/utils.mjs @@ -42,17 +42,16 @@ export function createCommonSearchAPI(options) { await page.goto(`${options.urlPrefix}${search}`, { waitUntil: "networkidle0", - timeout: 30_000, - referer: "https://cn.bing.com/", + timeout: 10_000, + referer: options.urlPrefix, }); const cards = await page.$$(options.resultsItemSelector); const searchResults = await Promise.all( - cards.map(async (card) => { - const item = await card.evaluate((node, options) => { - const linkEle = node.querySelector(options.linkSelector); - const link = linkEle?.href; + cards.map((card) => { + return card.evaluate((node, options) => { + const link = node.querySelector(options.linkSelector)?.href; const title = node.querySelector(options.titleSelector)?.innerText; const description = node.querySelector( options.descriptionSelector, @@ -63,11 +62,22 @@ export function createCommonSearchAPI(options) { description, }; }, options); - if (!item?.link) return; - if (!process.env.TF_URL) { - console.warn("TF_URL is not set, skipping text extraction"); - return item; - } + }), + ); + + const validSearchResults = searchResults + .filter((i) => i.link) + .slice(0, parseInt(req.query.top ?? 100)); + + await browser.close(); + + if (!process.env.TF_URL) { + console.warn("TF_URL is not set, skipping text extraction"); + return validSearchResults; + } + + await Promise.all( + validSearchResults.map(async (item) => { try { const res = await axios.post( process.env.TF_URL + "/extract", @@ -75,23 +85,20 @@ export function createCommonSearchAPI(options) { { headers: { "Content-Type": "application/json", - "User-Agent": defaultUserAgent(), }, - timeout: 5_000, + timeout: 2_000, }, ); if (res.status < 300 && res.data?.text) { - return { ...item, text: res.data }; + item.text = res.data.text; } } catch (error) { console.error("Failed to extract text", error.message, item.link); } - - return item; }), ); - await browser.close(); - return res.json(searchResults.filter(Boolean)); + + return res.json(validSearchResults.filter(Boolean)); }); }