Skip to content

Commit

Permalink
Add Wikipedia as a search engine
Browse files Browse the repository at this point in the history
Currently, it only searches the English Wikipedia, but it can be
customized to use other language editions. A UI option to select them would be needed.
  • Loading branch information
gzsombor committed Nov 27, 2024
1 parent ef0ae2f commit b105652
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 12 deletions.
6 changes: 2 additions & 4 deletions src/engines/bing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::models::engine_models::{EngineError, SearchEngine};

use error_stack::{Report, Result, ResultExt};

use super::common::build_cookie;
use super::search_result_parser::SearchResultParser;

/// A new Bing engine type defined in-order to implement the `SearchEngine` trait which allows to
Expand Down Expand Up @@ -73,10 +74,7 @@ impl SearchEngine for Bing {
("_UR=QS=0&TQS", "0"),
];

let mut cookie_string = String::new();
for (k, v) in &query_params {
cookie_string.push_str(&format!("{k}={v}; "));
}
let cookie_string = build_cookie(&query_params);

let header_map = HeaderMap::try_from(&HashMap::from([
("User-Agent".to_string(), user_agent.to_string()),
Expand Down
23 changes: 23 additions & 0 deletions src/engines/common.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//! This module provides common functionalities for engines

/**
* Build a query from a list of key value pairs.
*/
/// Serializes a list of key/value pairs into a URL query fragment of the
/// form `&k1=v1&k2=v2` — every pair, including the first, is prefixed
/// with `&`, so the caller is expected to append this after an initial
/// parameter or a bare `?`.
pub fn build_query(query_params: &[(&str, &str)]) -> String {
    query_params
        .iter()
        .map(|(key, value)| format!("&{key}={value}"))
        .collect()
}

/**
* Build a cookie from a list of key value pairs.
*/
/// Serializes a list of key/value pairs into a `Cookie` header value of
/// the form `k1=v1; k2=v2; `.
///
/// Note the trailing `"; "` after the last pair — kept deliberately to
/// match the historical output of the per-engine loops this replaces.
pub fn build_cookie(cookie_params: &[(&str, &str)]) -> String {
    cookie_params
        .iter()
        .map(|(name, value)| format!("{name}={value}; "))
        .collect()
}
2 changes: 2 additions & 0 deletions src/engines/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@

pub mod bing;
pub mod brave;
pub mod common;
pub mod duckduckgo;
pub mod librex;
pub mod mojeek;
pub mod search_result_parser;
pub mod searx;
pub mod startpage;
pub mod wikipedia;
11 changes: 3 additions & 8 deletions src/engines/mojeek.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::models::engine_models::{EngineError, SearchEngine};

use error_stack::{Report, Result, ResultExt};

use super::common::{build_cookie, build_query};
use super::search_result_parser::SearchResultParser;

/// A new Mojeek engine type defined in-order to implement the `SearchEngine` trait which allows to
Expand Down Expand Up @@ -107,10 +108,7 @@ impl SearchEngine for Mojeek {
("safe", &safe),
];

let mut query_params_string = String::new();
for (k, v) in &query_params {
query_params_string.push_str(&format!("&{k}={v}"));
}
let query_params_string = build_query(&query_params);

let url: String = match page {
0 => {
Expand All @@ -123,10 +121,7 @@ impl SearchEngine for Mojeek {
}
};

let mut cookie_string = String::new();
for (k, v) in &query_params {
cookie_string.push_str(&format!("{k}={v}; "));
}
let cookie_string = build_cookie(&query_params);

let header_map = HeaderMap::try_from(&HashMap::from([
("User-Agent".to_string(), user_agent.to_string()),
Expand Down
101 changes: 101 additions & 0 deletions src/engines/wikipedia.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
//! The `wikipedia` module handles the scraping of results from wikipedia
//! with user provided query and with a page number if provided.

use std::collections::HashMap;

use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;

use crate::models::aggregation_models::SearchResult;

use crate::models::engine_models::{EngineError, SearchEngine};

use error_stack::{Report, Result, ResultExt};

use super::common::build_query;
use super::search_result_parser::SearchResultParser;

/// A new Wikipedia engine type defined in-order to implement the `SearchEngine` trait which allows to
/// reduce code duplication as well as allows to create vector of different search engines easily.
pub struct Wikipedia {
    /// The parser, used to interpret the search result.
    parser: SearchResultParser,
    /// Unique engine identifier attached to each result, e.g. `wikipedia-en`
    /// (built from the language code in `new`).
    id: String,
    /// Base URL of the wiki edition to query, e.g. `https://en.wikipedia.org`
    /// (built from the language code in `new`).
    host: String,
}

impl Wikipedia {
    /// Builds a `Wikipedia` engine for one language edition.
    ///
    /// `language` is the wiki subdomain (e.g. `"en"`); it determines both the
    /// upstream host to query and the engine id reported with each result.
    ///
    /// # Errors
    ///
    /// Returns an `EngineError` if any of the CSS selectors fails to parse.
    pub fn new(language: String) -> Result<Self, EngineError> {
        let parser = SearchResultParser::new(
            "p.mw-search-nonefound",
            ".mw-search-results li.mw-search-result",
            ".mw-search-result-heading a",
            ".mw-search-result-heading a",
            ".searchresult",
        )?;
        Ok(Self {
            parser,
            id: format!("wikipedia-{language}"),
            host: format!("https://{language}.wikipedia.org"),
        })
    }
}

#[async_trait::async_trait]
impl SearchEngine for Wikipedia {
async fn results(
&self,
query: &str,
page: u32,
user_agent: &str,
client: &Client,
_safe_search: u8,
) -> Result<Vec<(String, SearchResult)>, EngineError> {
let header_map = HeaderMap::try_from(&HashMap::from([
("User-Agent".to_string(), user_agent.to_string()),
("Referer".to_string(), self.host.to_string()),
]))
.change_context(EngineError::UnexpectedError)?;

let offset = (page * 20).to_string();
let query_params: Vec<(&str, &str)> = vec![
("limit", "20"),
("offset", &offset),
("profile", "default"),
("search", query),
("title", "Special:Search"),
("ns0", "1"),
];

let query_params_string = build_query(&query_params);

let url: String = format!("{}/w/index.php?{}", self.host, query_params_string);

let document: Html = Html::parse_document(
&Wikipedia::fetch_html_from_upstream(self, &url, header_map, client).await?,
);

if let Some(_) = self.parser.parse_for_no_results(&document).next() {

Check warning on line 81 in src/engines/wikipedia.rs

View workflow job for this annotation

GitHub Actions / Rust project

redundant pattern matching, consider using `is_some()`
return Err(Report::new(EngineError::EmptyResultSet));
}

// scrape all the results from the html
self.parser
.parse_for_results(&document, |title, url, desc| {
let found_url = url.attr("href");
if let Some(relative_url) = found_url {

Check warning on line 89 in src/engines/wikipedia.rs

View workflow job for this annotation

GitHub Actions / Rust project

manual implementation of `Option::map`
Some(SearchResult::new(
title.inner_html().trim(),
&format!("{}{relative_url}", self.host),
desc.inner_html().trim(),
&[&self.id],
))
} else {
None
}
})
}
}
4 changes: 4 additions & 0 deletions src/models/engine_models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ impl EngineHandler {
let engine = crate::engines::bing::Bing::new()?;
("bing", Box::new(engine))
}
"wikipedia" => {
let engine = crate::engines::wikipedia::Wikipedia::new("en".to_string())?;
("wikipedia", Box::new(engine))
}
_ => {
return Err(Report::from(EngineError::NoSuchEngineFound(
engine_name.to_string(),
Expand Down
1 change: 1 addition & 0 deletions websurfx/config.lua
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ upstream_search_engines = {
LibreX = false,
Mojeek = false,
Bing = false,
Wikipedia = true,
} -- select the upstream search engines from which the results should be fetched.

proxy = nil -- Proxy to send outgoing requests through. Set to nil to disable.

0 comments on commit b105652

Please sign in to comment.