Skip to content

Commit

Permalink
Add Wikipedia as a search engine
Browse files Browse the repository at this point in the history
Currently, it only searches the English Wikipedia, but it can be
customized to use different language editions. UI support would still be needed.
  • Loading branch information
gzsombor committed Nov 27, 2024
1 parent ef0ae2f commit 0583835
Show file tree
Hide file tree
Showing 7 changed files with 139 additions and 12 deletions.
6 changes: 2 additions & 4 deletions src/engines/bing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::models::engine_models::{EngineError, SearchEngine};

use error_stack::{Report, Result, ResultExt};

use super::common::build_cookie;
use super::search_result_parser::SearchResultParser;

/// A new Bing engine type defined in-order to implement the `SearchEngine` trait which allows to
Expand Down Expand Up @@ -73,10 +74,7 @@ impl SearchEngine for Bing {
("_UR=QS=0&TQS", "0"),
];

let mut cookie_string = String::new();
for (k, v) in &query_params {
cookie_string.push_str(&format!("{k}={v}; "));
}
let cookie_string = build_cookie(&query_params);

let header_map = HeaderMap::try_from(&HashMap::from([
("User-Agent".to_string(), user_agent.to_string()),
Expand Down
24 changes: 24 additions & 0 deletions src/engines/common.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

//! This module provides common functionalities for engines

/**
* Build a query from a list of key value pairs.
*/
/// Builds a URL query fragment from a list of key/value pairs,
/// producing one `&key=value` segment per pair (each segment is
/// prefixed with `&`, matching how the engines append it to a URL).
pub fn build_query(query_params: &[(&str, &str)]) -> String {
    query_params
        .iter()
        .map(|(k, v)| format!("&{k}={v}"))
        .collect()
}

/**
* Build a cookie from a list of key value pairs.
*/
/// Builds a cookie header value from a list of key/value pairs,
/// producing one `key=value; ` segment per pair (note: each pair,
/// including the last, is followed by `"; "` — callers rely on
/// this exact format).
pub fn build_cookie(cookie_params: &[(&str, &str)]) -> String {
    cookie_params
        .iter()
        .map(|(k, v)| format!("{k}={v}; "))
        .collect()
}
2 changes: 2 additions & 0 deletions src/engines/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
//! code. Moreover, it also provides a custom error for the upstream search engine handling code.

// Engine modules, kept in alphabetical order.
pub mod bing;
pub mod brave;
pub mod common;
pub mod duckduckgo;
pub mod librex;
pub mod mojeek;
pub mod search_result_parser;
pub mod searx;
pub mod startpage;
pub mod wikipedia;
11 changes: 3 additions & 8 deletions src/engines/mojeek.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::models::engine_models::{EngineError, SearchEngine};

use error_stack::{Report, Result, ResultExt};

use super::common::{build_cookie, build_query};
use super::search_result_parser::SearchResultParser;

/// A new Mojeek engine type defined in-order to implement the `SearchEngine` trait which allows to
Expand Down Expand Up @@ -107,10 +108,7 @@ impl SearchEngine for Mojeek {
("safe", &safe),
];

let mut query_params_string = String::new();
for (k, v) in &query_params {
query_params_string.push_str(&format!("&{k}={v}"));
}
let query_params_string = build_query(&query_params);

let url: String = match page {
0 => {
Expand All @@ -123,10 +121,7 @@ impl SearchEngine for Mojeek {
}
};

let mut cookie_string = String::new();
for (k, v) in &query_params {
cookie_string.push_str(&format!("{k}={v}; "));
}
let cookie_string = build_cookie(&query_params);

let header_map = HeaderMap::try_from(&HashMap::from([
("User-Agent".to_string(), user_agent.to_string()),
Expand Down
103 changes: 103 additions & 0 deletions src/engines/wikipedia.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
//! The `wikipedia` module handles the scraping of results from wikipedia
//! with user provided query and with a page number if provided.

use std::collections::HashMap;

use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;

use crate::models::aggregation_models::SearchResult;

use crate::models::engine_models::{EngineError, SearchEngine};

use error_stack::{Report, Result, ResultExt};

use super::common::build_query;
use super::search_result_parser::SearchResultParser;

/// A new Wikipedia engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct Wikipedia {
    /// The parser, used to interpret the search result.
    parser: SearchResultParser,
    /// Identifier used to tag results from this engine instance,
    /// e.g. `wikipedia-en` (built from the configured language in `new`).
    id: String,
    /// Base URL of the language-specific Wikipedia host,
    /// e.g. `https://en.wikipedia.org` (built from the language in `new`).
    host: String,
}

impl Wikipedia {
    /// Creates a Wikipedia engine for the given language edition
    /// (e.g. `"en"`), deriving the host URL and engine id from it.
    /// Returns an error if the CSS selectors fail to compile.
    pub fn new(language: String) -> Result<Self, EngineError> {
        let parser = SearchResultParser::new(
            "p.mw-search-nonefound",
            ".mw-search-results li.mw-search-result",
            ".mw-search-result-heading a",
            ".mw-search-result-heading a",
            ".searchresult",
        )?;
        let host = format!("https://{}.wikipedia.org", &language);
        let id = format!("wikipedia-{}", &language);
        Ok(Self { parser, id, host })
    }
}

#[async_trait::async_trait]
impl SearchEngine for Wikipedia {
    /// Fetches one page of results from the language-specific Wikipedia
    /// `Special:Search` endpoint and parses them into `SearchResult`s.
    ///
    /// # Errors
    /// Returns `EngineError::EmptyResultSet` when the page carries the
    /// "nothing found" marker, or `EngineError::UnexpectedError` when
    /// header construction or the upstream fetch fails.
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        client: &Client,
        _safe_search: u8,
    ) -> Result<Vec<(String, SearchResult)>, EngineError> {
        let header_map = HeaderMap::try_from(&HashMap::from([
            ("User-Agent".to_string(), user_agent.to_string()),
            ("Referer".to_string(), self.host.to_string()),
        ]))
        .change_context(EngineError::UnexpectedError)?;

        // Special:Search paginates by result offset, 20 results per page.
        let offset = (page * 20).to_string();
        let query_params: Vec<(&str, &str)> = vec![
            ("limit", "20"),
            ("offset", &offset),
            ("profile", "default"),
            ("search", query),
            ("title", "Special:Search"),
            ("ns0", "1"),
        ];

        let query_params_string = build_query(&query_params);

        let url: String = format!("{}/w/index.php?{}", self.host, query_params_string);

        let document: Html = Html::parse_document(
            &Wikipedia::fetch_html_from_upstream(self, &url, header_map, client).await?,
        );

        // The "no results" selector matching anything means an empty result set.
        if self.parser.parse_for_no_results(&document).next().is_some() {
            return Err(Report::new(EngineError::EmptyResultSet));
        }

        // scrape all the results from the html
        self.parser
            .parse_for_results(&document, |title, url, desc| {
                // Result links are relative; prefix them with the language host.
                url.attr("href").map(|relative_url| {
                    SearchResult::new(
                        title.inner_html().trim(),
                        &format!("{}{relative_url}", self.host),
                        desc.inner_html().trim(),
                        &[&self.id],
                    )
                })
            })
    }
}
4 changes: 4 additions & 0 deletions src/models/engine_models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ impl EngineHandler {
let engine = crate::engines::bing::Bing::new()?;
("bing", Box::new(engine))
}
"wikipedia" => {
let engine = crate::engines::wikipedia::Wikipedia::new("en".to_string())?;
("wikipedia", Box::new(engine))
}
_ => {
return Err(Report::from(EngineError::NoSuchEngineFound(
engine_name.to_string(),
Expand Down
1 change: 1 addition & 0 deletions websurfx/config.lua
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ upstream_search_engines = {
LibreX = false,
Mojeek = false,
Bing = false,
Wikipedia = true,
} -- select the upstream search engines from which the results should be fetched.

proxy = nil -- Proxy to send outgoing requests through. Set to nil to disable.

0 comments on commit 0583835

Please sign in to comment.