Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for Text fragment feature (#1545) #1600

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,9 @@ Options:
--include-fragments
Enable the checking of fragments in links

--include-text-fragments
Enable the checking of Text Fragments in links

-t, --timeout <TIMEOUT>
Website timeout in seconds from connect to response finished

Expand Down
1 change: 1 addition & 0 deletions lychee-bin/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc<CookieStoreMutex>>) -
.require_https(cfg.require_https)
.cookie_jar(cookie_jar.cloned())
.include_fragments(cfg.include_fragments)
.include_text_fragments(cfg.include_text_fragments)
.fallback_extensions(cfg.fallback_extensions.clone())
.build()
.client()
Expand Down
6 changes: 6 additions & 0 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,11 @@ separated list of accepted status codes. This example will accept 200, 201,
#[serde(default)]
pub(crate) include_fragments: bool,

/// Enable the checking of Text Fragments in links
#[arg(long)]
#[serde(default)]
pub(crate) include_text_fragments: bool,

/// Website timeout in seconds from connect to response finished
#[arg(short, long, default_value = &TIMEOUT_STR)]
#[serde(default = "timeout")]
Expand Down Expand Up @@ -568,6 +573,7 @@ impl Config {
require_https: false;
cookie_jar: None;
include_fragments: false;
include_text_fragments: false;
accept: StatusCodeSelector::default();
}

Expand Down
1 change: 1 addition & 0 deletions lychee-lib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ cached = "0.54.0"
check-if-email-exists = { version = "0.9.1", optional = true }
cookie_store = "0.21.1"
email_address = "0.2.9"
fancy-regex = "0.14.0"
futures = "0.3.31"
glob = "0.3.2"
headers = "0.4.0"
Expand Down
37 changes: 36 additions & 1 deletion lychee-lib/src/checker/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ use crate::{
quirks::Quirks,
retry::RetryExt,
types::uri::github::GithubUri,
utils::url::UrlExt,
BasicAuthCredentials, ErrorKind, Status, Uri,
};
use async_trait::async_trait;
use http::StatusCode;
use log::info;
use octocrab::Octocrab;
use reqwest::Request;
use std::{collections::HashSet, time::Duration};
Expand Down Expand Up @@ -41,6 +43,9 @@ pub(crate) struct WebsiteChecker {
///
/// This would treat unencrypted links as errors when HTTPS is available.
require_https: bool,

/// Verify Text Fragments for a website
validate_text_fragments: bool,
}

impl WebsiteChecker {
Expand All @@ -53,6 +58,7 @@ impl WebsiteChecker {
accepted: Option<HashSet<StatusCode>>,
github_client: Option<Octocrab>,
require_https: bool,
validate_text_fragments: bool,
plugin_request_chain: RequestChain,
) -> Self {
Self {
Expand All @@ -64,6 +70,7 @@ impl WebsiteChecker {
retry_wait_time,
accepted,
require_https,
validate_text_fragments,
}
}

Expand All @@ -86,9 +93,37 @@ impl WebsiteChecker {
}

/// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
///
/// If the fragment directive check is enabled and the URL has a fragment directive,
/// the fragment directive checker is run to validate the response against the given directives.
///
/// # Errors
/// - if the fragment directive check fails, returns one of the `TextFragmentStatus` error codes
async fn check_default(&self, request: Request) -> Status {
let req_url = request.url().clone();
let has_fragment_directive = req_url.has_fragment_directive();

match self.reqwest_client.execute(request).await {
Ok(ref response) => Status::new(response, self.accepted.clone()),
Ok(response) => {
let mut status = Status::new(&response, self.accepted.clone());
if self.validate_text_fragments && has_fragment_directive {
if let Ok(res) = response.text().await {
info!("checking fragment directive...");
if let Some(fd) = req_url.fragment_directive() {
info!("directive: {:?}", fd.text_directives);
match fd.check(&res) {
Ok(stat) => {
status = stat;
}
Err(e) => {
return e.into();
}
}
}
}
}
status
Comment on lines +108 to +125
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can move that into a function/method?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some tests for that part would also be nice.

}
Err(e) => e.into(),
}
}
Expand Down
64 changes: 64 additions & 0 deletions lychee-lib/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,9 @@ pub struct ClientBuilder {
/// Enable the checking of fragments in links.
include_fragments: bool,

/// Enable the checking of text fragment in a website
include_text_fragments: bool,

/// Requests run through this chain where each item in the chain
/// can modify the request. A chained item can also decide to exit
/// early and return a status, so that subsequent chain items are
Expand Down Expand Up @@ -391,6 +394,7 @@ impl ClientBuilder {
self.accepted,
github_client,
self.require_https,
self.include_text_fragments,
self.plugin_request_chain,
);

Expand Down Expand Up @@ -665,6 +669,7 @@ mod tests {
// Same, but ignore certificate error
let res = ClientBuilder::builder()
.allow_insecure(true)
.include_text_fragments(true)
.build()
.client()
.unwrap()
Expand Down Expand Up @@ -841,6 +846,65 @@ mod tests {
assert!(res.status().is_unsupported());
}

#[tokio::test]
async fn test_fragment_directive() {
let client = ClientBuilder::builder()
.include_text_fragments(true)
.build()
.client()
.unwrap();

// Start only
println!("testing START only directive...");
let res = client.check("https://developer.mozilla.org/en-US/docs/Web/URI/Fragment/Text_fragments#:~:text=without%20relying%20on%20the%20presence%20of%20IDs").await.unwrap();
assert!(res.status().is_success());

// Start and End
println!("\ntesting START and END directive...");
let res = client.check("https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#:~:text=linked%20URL,defining%20a%20value").await.unwrap();
assert!(res.status().is_success());

// Prefix and start
println!("\ntesting Prefix with START...");
let res = client
.check("https://example.com/#:~:text=asking-,for")
.await
.unwrap();
assert!(res.status().is_success());

// start with suffix
println!("\ntesting start with suffix...");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You'll probably remove the `println!`s, right?

let res = client.check("https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#:~:text=linked%20URL\'s,-format").await.unwrap();
assert!(res.status().is_success());

let res = client.check("https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#:~:text=downgrade:-,The%20Referer,be%20sent,-to%20origins").await.unwrap();
assert!(res.status().is_success());
}

/// Checks a link whose fragment directive carries two `text=` directives
/// (`causes` and `linked`) with text fragment checking enabled, and expects
/// the resulting status to be a success.
///
/// NOTE(review): the target is a public MDN URL, so this presumably depends
/// on network access and on the current content of that page — confirm.
#[tokio::test]
async fn test_multiple_directives() {
    // Two directives joined by `&` inside a single `#:~:` fragment directive.
    let url =
        "https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#:~:text=causes&text=linked";

    let checker = ClientBuilder::builder()
        .include_text_fragments(true)
        .build()
        .client()
        .unwrap();

    let response = checker.check(url).await.unwrap();
    assert!(response.status().is_success());
}

/// Negative case: checks a link whose text directive ends in `DIs` with text
/// fragment checking enabled, and expects the resulting status to be an error.
///
/// NOTE(review): the directive text is presumably a deliberate misspelling
/// that does not occur on the target page; the target is a public MDN URL,
/// so the outcome likely depends on network access — confirm.
#[tokio::test]
async fn fail_fragment_directive_test() {
    let url = "https://developer.mozilla.org/en-US/docs/Web/URI/Fragment/Text_fragments#:~:text=without%20relying%20on%20the%20presence%20of%20DIs";

    let checker = ClientBuilder::builder()
        .include_text_fragments(true)
        .build()
        .client()
        .unwrap();

    let response = checker.check(url).await.unwrap();
    assert!(response.status().is_error());
}

#[tokio::test]
async fn test_max_redirects() {
let mock_server = wiremock::MockServer::start().await;
Expand Down
Loading
Loading