Skip to content

Commit

Permalink
chore: better support for bing
Browse files Browse the repository at this point in the history
  • Loading branch information
Soontao committed May 29, 2024
1 parent bf66073 commit 6dca8b7
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 23 deletions.
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"source.organizeImports": "explicit"
},
"cSpell.words": [
"Sogou"
"Sogou",
"weixin"
]
}
41 changes: 37 additions & 4 deletions src/index.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,47 @@ app.get(
}),
);

// Route: GET /weixin — search served through Sogou's Weixin endpoint.
// The options tell createCommonSearchAPI which URL prefix the search term is
// appended to, and which CSS selectors locate each result item and, inside an
// item, its title, link and description elements.
const weixinSearchOptions = {
  urlPrefix: "https://weixin.sogou.com/weixin?type=2&query=",
  resultsItemSelector: ".news-box .news-list li",
  titleSelector: "h3 a",
  linkSelector: "h3 a",
  descriptionSelector: ".txt-info",
};

app.get("/weixin", createCommonSearchAPI(weixinSearchOptions));

// Route: GET /zhihu — Sogou web search with `insite=zhihu.com`
// (presumably restricting results to zhihu.com — confirm against Sogou's
// query parameters).
// The selectors locate each result item and, inside an item, its title, link
// and description elements.
const zhihuSearchOptions = {
  urlPrefix: "https://sogou.com/web?insite=zhihu.com&query=",
  resultsItemSelector: ".results .vrwrap",
  titleSelector: "h3 a",
  linkSelector: "h3 a",
  descriptionSelector: ".str-text-info",
};

app.get("/zhihu", createCommonSearchAPI(zhihuSearchOptions));

// Route: GET /baike — Sogou web search with `insite=baike.baidu.com`
// (presumably restricting results to Baidu Baike — confirm against Sogou's
// query parameters).
// The selectors locate each result item and, inside an item, its title, link
// and description elements.
const baikeSearchOptions = {
  urlPrefix: "https://sogou.com/web?insite=baike.baidu.com&query=",
  resultsItemSelector: ".results .vrwrap",
  titleSelector: "h3 a",
  linkSelector: "h3 a",
  descriptionSelector: ".str-text-info",
};

app.get("/baike", createCommonSearchAPI(baikeSearchOptions));

// Route: GET /bing — search served through cn.bing.com.
// The search term is appended to `urlPrefix`; the selectors locate each result
// item and, inside an item, its title, link and description elements.
//
// Each selector key is listed exactly once. The earlier duplicate entries
// (".b_algo" / ".b_tpcn a" / ".tptxt") were always overridden by the later
// ones, because the last duplicate key in an object literal wins.
app.get(
  "/bing",
  createCommonSearchAPI({
    urlPrefix: "https://cn.bing.com/search?q=",
    resultsItemSelector: "#b_content #b_results li.b_algo",
    titleSelector: "h2 a",
    linkSelector: "h2 a",
    // Selector list: matches either the caption block or the slug paragraph.
    descriptionSelector: ".b_caption, p.b_algoSlug",
  }),
);

Expand Down
43 changes: 25 additions & 18 deletions src/utils.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,16 @@ export function createCommonSearchAPI(options) {

await page.goto(`${options.urlPrefix}${search}`, {
waitUntil: "networkidle0",
timeout: 30_000,
referer: "https://cn.bing.com/",
timeout: 10_000,
referer: options.urlPrefix,
});

const cards = await page.$$(options.resultsItemSelector);

const searchResults = await Promise.all(
cards.map(async (card) => {
const item = await card.evaluate((node, options) => {
const linkEle = node.querySelector(options.linkSelector);
const link = linkEle?.href;
cards.map((card) => {
return card.evaluate((node, options) => {
const link = node.querySelector(options.linkSelector)?.href;
const title = node.querySelector(options.titleSelector)?.innerText;
const description = node.querySelector(
options.descriptionSelector,
Expand All @@ -63,35 +62,43 @@ export function createCommonSearchAPI(options) {
description,
};
}, options);
if (!item?.link) return;
if (!process.env.TF_URL) {
console.warn("TF_URL is not set, skipping text extraction");
return item;
}
}),
);

const validSearchResults = searchResults
.filter((i) => i.link)
.slice(0, parseInt(req.query.top ?? 100));

await browser.close();

if (!process.env.TF_URL) {
console.warn("TF_URL is not set, skipping text extraction");
return validSearchResults;
}

await Promise.all(
validSearchResults.map(async (item) => {
try {
const res = await axios.post(
process.env.TF_URL + "/extract",
{ url: item.link },
{
headers: {
"Content-Type": "application/json",
"User-Agent": defaultUserAgent(),
},
timeout: 5_000,
timeout: 2_000,
},
);
if (res.status < 300 && res.data?.text) {
return { ...item, text: res.data };
item.text = res.data.text;
}
} catch (error) {
console.error("Failed to extract text", error.message, item.link);
}

return item;
}),
);
await browser.close();
return res.json(searchResults.filter(Boolean));

return res.json(validSearchResults.filter(Boolean));
});
}

Expand Down

0 comments on commit 6dca8b7

Please sign in to comment.