From b1b6d5e066a3534a19d01149d620f86e18e413d1 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Wed, 22 Jun 2022 15:34:26 +1000 Subject: [PATCH 01/13] Start working on checking domain names Came up in a couple of places: #41, #29, #38, #28. Hopefully we can fix all of these with these changes. Not done yet, still want to have domain checking for URLs with certain schemes (https) but allow everything for others. If we do that, we may be able to unify the email and plain domain parsing with the scheme one too. --- src/domains.rs | 80 ++++++++++++++++++ src/email.rs | 49 ++--------- src/lib.rs | 1 + src/url.rs | 211 ++++++++++++++++++++++++++++++++++++----------- tests/domains.rs | 139 +++++++++++++++++++++++++++++++ tests/email.rs | 2 +- tests/url.rs | 11 +-- 7 files changed, 397 insertions(+), 96 deletions(-) create mode 100644 src/domains.rs create mode 100644 tests/domains.rs diff --git a/src/domains.rs b/src/domains.rs new file mode 100644 index 0000000..4e35766 --- /dev/null +++ b/src/domains.rs @@ -0,0 +1,80 @@ +//! Domain name related scanning, used by both email and plain domain URL scanners. + +pub(crate) fn find_domain_end(s: &str) -> (Option, Option) { + let mut end = None; + let mut maybe_last_dot = None; + let mut last_dot = None; + let mut dot_allowed = false; + let mut hyphen_allowed = false; + let mut all_numeric = true; + + for (i, c) in s.char_indices() { + let can_be_last = match c { + 'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => { + // Can start or end a domain label. + dot_allowed = true; + hyphen_allowed = true; + last_dot = maybe_last_dot; + all_numeric = false; + + true + } + '0'..='9' => { + // Same as above, except we note if it's + dot_allowed = true; + hyphen_allowed = true; + last_dot = maybe_last_dot; + + true + } + '-' => { + // Hyphen can't be at start of a label, e.g. `-b` in `a.-b.com` + if !hyphen_allowed { + return (None, None); + } + // Hyphen can't be at end of a label, e.g. `b-` in `a.b-.com` + dot_allowed = false; + false + } + '.' => { + if !dot_allowed { + // Label can't be empty, e.g. `.example.com` or `a..com` + return (None, None); + } + dot_allowed = false; + hyphen_allowed = false; + maybe_last_dot = Some(i); + false + } + _ => { + break; + } + }; + + if can_be_last { + end = Some(i + c.len_utf8()); + } + } + + if all_numeric && last_dot.is_none() { + return (None, None); + } + + if !all_numeric { + if let Some(last_dot) = last_dot { + if !valid_tld(&s[last_dot + 1..]) { + return (None, None); + } + } + } + + (end, last_dot) +} + +fn valid_tld(tld: &str) -> bool { + tld.chars() + .take_while(|c| c.is_ascii_alphabetic()) + .take(2) + .count() + >= 2 +} diff --git a/src/email.rs b/src/email.rs index 42a2818..f2a6db1 100644 --- a/src/email.rs +++ b/src/email.rs @@ -1,5 +1,6 @@ use std::ops::Range; +use crate::domains::find_domain_end; use crate::scanner::Scanner; /// Scan for email address starting from the trigger character "@". @@ -40,6 +41,9 @@ impl EmailScanner { break; } atom_boundary = true; + } else if c == '@' { + // In `@me@a.com`, we don't want to extract `me@a.com`. + return None; } else { break; } @@ -49,40 +53,8 @@ impl EmailScanner { // See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531 fn find_end(&self, s: &str) -> Option { - let mut first_in_sub_domain = true; - let mut can_end_sub_domain = false; - let mut first_dot = None; - let mut end = None; - - for (i, c) in s.char_indices() { - if first_in_sub_domain { - if Self::sub_domain_allowed(c) { - end = Some(i + c.len_utf8()); - first_in_sub_domain = false; - can_end_sub_domain = true; - } else { - break; - } - } else if c == '.' { - if !can_end_sub_domain { - break; - } - first_in_sub_domain = true; - if first_dot.is_none() { - first_dot = Some(i); - } - } else if c == '-' { - can_end_sub_domain = false; - } else if Self::sub_domain_allowed(c) { - end = Some(i + c.len_utf8()); - can_end_sub_domain = true; - } else { - break; - } - } - - if let Some(end) = end { - if !self.domain_must_have_dot || first_dot.map(|d| d < end).unwrap_or(false) { + if let (Some(end), last_dot) = find_domain_end(s) { + if !self.domain_must_have_dot || last_dot.is_some() { Some(end) } else { None @@ -120,15 +92,6 @@ impl EmailScanner { _ => c >= '\u{80}', } } - - // See "sub-domain" in RFC 5321. Extension in RFC 6531 is simplified, - // this can also match invalid domains. - fn sub_domain_allowed(c: char) -> bool { - match c { - 'a'..='z' | 'A'..='Z' | '0'..='9' => true, - _ => c >= '\u{80}', - } - } } /// Helper function to check if given string is considered an email address. diff --git a/src/lib.rs b/src/lib.rs index 2edc7fa..fd6a8a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -120,6 +120,7 @@ #![deny(missing_docs)] #![deny(missing_debug_implementations)] +mod domains; mod email; mod finder; mod scanner; diff --git a/src/url.rs b/src/url.rs index 5371d87..b7f18af 100644 --- a/src/url.rs +++ b/src/url.rs @@ -1,5 +1,6 @@ use std::ops::Range; +use crate::domains::find_domain_end; use crate::email; use crate::scanner::Scanner; @@ -14,7 +15,7 @@ const MIN_URL_LENGTH: usize = 4; const QUOTES: &[char] = &['\'', '\"']; -/// Scan for URLs starting from the trigger character ":", requires "://". +/// Scan for URLs starting from the trigger character ":" (requires "://") or "." for plain domains. /// /// Based on RFC 3986. pub struct UrlScanner; @@ -47,55 +48,59 @@ impl Scanner for UrlScanner { }; let after_separator = separator + separator_len; - // When scanning a dot, for URLs without a scheme, do some additional checks - if !is_slash_slash { - // The TLD must be at least 2 characters long - let end = &s[after_separator..]; - let after_last_dot = end - .split('/') - .next() - .unwrap() - .rfind('.') - .map(|pos| pos + 1) - .unwrap_or(0); - let tld_too_short = end[after_last_dot..] - .chars() - .take_while(|c| c.is_ascii_alphabetic()) - .take(2) - .count() - < 2; - if tld_too_short { - return None; + // TODO: Maybe separate out these two things into their own scanners? + // They can still share code, as we're doing between email and URL scanners already anyway. + if is_slash_slash { + // Need at least one character after '//' + if after_separator < s.len() { + if let (Some(start), quote) = self.find_scheme(&s[0..separator]) { + let s = &s[after_separator..]; + if let Some(after_authority) = self.find_authority(s) { + if let Some(end) = self.find_end(&s[after_authority..], quote) { + if after_authority == 0 && end == 0 { + return None; + } + + let range = Range { + start, + end: after_separator + after_authority + end, + }; + return Some(range); + } + } + } } + None + } else { // If this is an email address, don't scan it as URL if email::is_mail(&s[after_separator..]) { return None; } - } - // Need at least one character for scheme, and one after '//' - if after_separator < s.len() { - if let (Some(start), quote) = self.find_start(&s[0..separator], is_slash_slash) { - if let Some(end) = self.find_end(&s[after_separator..], quote) { - let range = Range { - start, - end: after_separator + end, - }; - return Some(range); + if let (Some(start), quote) = self.find_domain_start(&s[0..separator]) { + // Not sure if we should re-use find_authority or do a more strict domain-only + // check here. + + let s = &s[start..]; + if let (Some(domain_end), Some(_)) = self.find_domain_port_end(s) { + if let Some(end) = self.find_end(&s[domain_end..], quote) { + let range = Range { + start, + end: start + domain_end + end, + }; + return Some(range); + } } } + + None } - None } } impl UrlScanner { - // For URL searching starting before the `://` separator, the `has_scheme` parameter should be - // true because the URL will have a scheme for sure. If searching before the `.` separator, it - // should be `false` as we might search over the scheme definition for the scheme being optional. - // See "scheme" in RFC 3986 - fn find_start(&self, s: &str, mut has_scheme: bool) -> (Option, Option) { + fn find_scheme(&self, s: &str) -> (Option, Option) { let mut first = None; let mut special = None; let mut quote = None; @@ -103,11 +108,6 @@ impl UrlScanner { match c { 'a'..='z' | 'A'..='Z' => first = Some(i), '0'..='9' => special = Some(i), - '/' if !has_scheme => special = Some(i), - ':' if !has_scheme => { - has_scheme = true; - special = Some(i) - } '+' | '-' | '.' => {} '@' => return (None, None), c if QUOTES.contains(&c) => { @@ -116,11 +116,6 @@ impl UrlScanner { // https://github.com/robinst/linkify/issues/20 quote = Some(c); } - c if !has_scheme && !matches!(c, '(' | ')' | '[' | ']' | '{' | '}' | ' ') => { - // Detect the start for links using unicode when having links without a scheme, - // then looking for ASCII alpha characters is not enough - first = Some(i); - } _ => break, } } @@ -138,6 +133,127 @@ impl UrlScanner { (first, quote) } + // For URL searching starting before the `://` separator, the `has_scheme` parameter should be + // true because the URL will have a scheme for sure. If searching before the `.` separator, it + // should be `false` as we might search over the scheme definition for the scheme being optional. + // See "scheme" in RFC 3986 + // The rules are basically: + // - Domain is labels separated by `.` + // - Label can not start or end with `-` + // - Label can contain letters, digits, `-` or Unicode + fn find_domain_start(&self, s: &str) -> (Option, Option) { + let mut first = None; + let mut quote = None; + + for (i, c) in s.char_indices().rev() { + match c { + 'a'..='z' | 'A'..='Z' | '0'..='9' | '\u{80}'..=char::MAX => first = Some(i), + // If we had something valid like `https://www.` we'd have found it with the ":" + // scanner already. We don't want to allow `.../www.example.com` just by itself. + // We *could* allow `//www.example.com` (scheme-relative URLs) in the future. + '/' => return (None, None), + // Similar to above, if this was an email we'd have found it already. + '@' => return (None, None), + // TODO: What about ".", do we need to reject it as well? + '-' => { + if first == None { + // Domain label can't end with `-` + return (None, None); + } else { + first = Some(i); + } + } + c if QUOTES.contains(&c) => { + // Check if there's a quote before, and stop once we encounter one of those quotes, + // e.g. with `"www.example.com"` + quote = Some(c); + } + _ => break, + } + } + + if let Some(first) = first { + if s[first..].starts_with('-') { + // Domain label can't start with `-` + return (None, None); + } + } + + (first, quote) + } + + fn find_authority(&self, s: &str) -> Option { + let mut had_userinfo = false; + // let mut maybe_ipv4 = true; + // let mut maybe_domain = true; + let mut can_be_last = true; + + let mut end = Some(0); + // let + // let mut maybe_ipv6 = true; + + for (i, c) in s.char_indices() { + can_be_last = true; + match c { + 'a'..='z' | 'A'..='Z' | '0'..='9' | '\u{80}'..=char::MAX => { + end = Some(i + c.len_utf8()); + } + // unreserved + '-' | '.' | '_' | '~' => { + end = Some(i + c.len_utf8()); + } + // sub-delims + '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => { + end = Some(i + c.len_utf8()); + } + ':' => { + // Could be in userinfo, or we're getting a port now. + if had_userinfo { + // TODO: Just scan for port, then we're done. + } else { + // Allowed in userinfo + } + } + '@' => { + if had_userinfo { + // We already had userinfo, can't have another `@` in a valid authority. + return None; + } else { + // Sike! Everything before this has been userinfo, so let's reset our + // opinions about all the host bits. + had_userinfo = true; + + // maybe_ipv4 = true; + // maybe_domain = true; + + can_be_last = false; + } + } + _ => { + // Anything else, this might be the end of the authority (can be empty). + // Now let the rest of the code handle checking whether the end of the URL is + // valid. + break; + } + } + } + + if !can_be_last { + return None; + } + + end + } + + fn find_domain_port_end(&self, s: &str) -> (Option, Option) { + if let (Some(domain_end), last_dot) = find_domain_end(s) { + // TOOD: Handle port and potential trailing dot + (Some(domain_end), last_dot) + } else { + (None, None) + } + } + fn find_end(&self, s: &str, quote: Option) -> Option { let mut round = 0; let mut square = 0; @@ -145,7 +261,7 @@ impl UrlScanner { let mut single_quote = false; let mut previous_can_be_last = true; - let mut end = None; + let mut end = Some(0); for (i, c) in s.char_indices() { let can_be_last = match c { @@ -181,6 +297,7 @@ impl UrlScanner { } true } + // TODO: Move this to authority parser? '[' => { // Allowed in IPv6 address host square += 1; diff --git a/tests/domains.rs b/tests/domains.rs new file mode 100644 index 0000000..5122339 --- /dev/null +++ b/tests/domains.rs @@ -0,0 +1,139 @@ +//! This is called domains for familiarity but it's about the authority part of URLs as defined in +//! https://datatracker.ietf.org/doc/html/rfc3986#section-3.2 +//! +//! ``` +//! authority = [ userinfo "@" ] host [ ":" port ] +//! +//! +//! userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +//! +//! host = IP-literal / IPv4address / reg-name +//! +//! IP-literal = "[" ( IPv6address / IPvFuture ) "]" +//! +//! IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +//! +//! reg-name = *( unreserved / pct-encoded / sub-delims ) +//! +//! +//! unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +//! +//! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" +//! ``` + +mod common; + +use crate::common::assert_linked_with; +use linkify::LinkFinder; + +#[test] +fn domain_valid() { + // assert_linked("9292.nl", "|9292.nl|"); + // assert_linked("a12.b-c.com", "|a12.b-c.com|"); + // Trailing dot allowed + assert_linked("https://example.com./test", "|https://example.com./test|"); +} + +#[test] +fn domain_with_userinfo() { + assert_linked( + "https://user:pass@example.com/", + "|https://user:pass@example.com/|", + ); + assert_linked( + "https://user:-.!$@example.com/", + "|https://user:-.!$@example.com/|", + ); + + // Can't have another @ + assert_not_linked("https://user:pass@ex@mple.com/"); +} + +#[test] +fn domain_with_port() { + assert_linked("https://localhost:8080/", "|https://localhost:8080/|"); +} + +#[test] +fn domain_with_userinfo_and_port() { + assert_linked( + "https://user:pass@example.com:8080/hi", + "|https://user:pass@example.com:8080/hi|", + ); +} + +#[test] +fn domain_ipv6() { + assert_linked("https://[dontcare]/", "|https://[dontcare]/|"); +} + +#[test] +fn domain_ipv4() { + assert_linked("https://127.0.0.1/", "|https://127.0.0.1/|"); +} + +#[test] +fn domain_labels_cant_be_empty() { + assert_not_linked("www.example..com"); +} + +#[test] +fn domain_labels_cant_start_with_hyphen() { + assert_not_linked("-a.com"); + // assert_not_linked("https://a.-b.com"); +} + +#[test] +fn domain_labels_cant_end_with_hyphen() { + assert_not_linked("a-.com"); + assert_not_linked("a.b-.com"); + + // assert_not_linked("https://a.b-.com"); + // Could also argue that it should be linked without the "-" (like e.g. ";" at the end) + // assert_not_linked("https://example.org-"); +} + +#[test] +fn domain_cant_contain_at() { + // Looks like an email but was recognized as a schemeless link before. + // assert_not_linked("example.com@about"); + // As part of path it's ok. + assert_linked("example.com/@about", "|example.com/@about|"); + // assert_linked("https://example.com/@about", "|https://example.com/@about|"); +} + +#[test] +fn domain_cant_end_numeric() { + assert_not_linked("info@v1.1.1"); +} + +#[test] +fn no_authority_part() { + assert_linked("file:///", "|file:///|"); + assert_linked("file:///home/foo", "|file:///home/foo|"); +} + +#[test] +fn authority_thats_not_domain() { + // Not valid according to DNS but we should allow it for other schemes (or all, not sure). + assert_linked("facetime://+19995551234", "|facetime://+19995551234|"); +} + +#[test] +fn without_scheme_should_stop() { + // assert_linked("ab/example.com", "ab/|example.com|"); + // This is not a valid scheme. Even the schemeless parser should not accept it, nor extract + // only example.com out of it. + assert_not_linked("1abc://example.com"); +} + +fn assert_linked(input: &str, expected: &str) { + let mut finder = LinkFinder::new(); + finder.url_must_have_scheme(false); + + assert_linked_with(&finder, input, expected); +} + +fn assert_not_linked(s: &str) { + assert_linked(s, s); +} diff --git a/tests/email.rs b/tests/email.rs index 8558ad7..59b2178 100644 --- a/tests/email.rs +++ b/tests/email.rs @@ -81,7 +81,7 @@ fn domain_must_have_dot_false() { assert_linked_with(&finder, "a@b", "|a@b|"); assert_linked_with(&finder, "a@b.", "|a@b|."); - assert_linked_with(&finder, "a@b-.", "|a@b|-."); + assert_linked_with(&finder, "a@b.", "|a@b|."); } #[test] diff --git a/tests/url.rs b/tests/url.rs index 65b1aef..c7054c9 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -35,10 +35,10 @@ fn schemes() { #[test] fn authority() { assert_not_linked("ab://"); - assert_not_linked("http://"); - assert_not_linked("http:// "); - assert_not_linked("\"http://\""); - assert_not_linked("\"http://...\", "); + assert_not_linked("file://"); + assert_not_linked("file:// "); + assert_not_linked("\"file://\""); + assert_not_linked("\"file://...\", "); assert_linked("http://a.", "|http://a|."); } @@ -417,7 +417,7 @@ fn international_without_protocol() { fn domain_tld_without_protocol_must_be_long() { assert_urls_without_protocol("example.", "example."); assert_urls_without_protocol("example./", "example./"); - assert_urls_without_protocol("foo.example.", "foo.example."); + assert_urls_without_protocol("foo.com.", "|foo.com|."); assert_urls_without_protocol("example.c", "example.c"); assert_urls_without_protocol("example.co", "|example.co|"); assert_urls_without_protocol("example.com", "|example.com|"); @@ -426,6 +426,7 @@ fn domain_tld_without_protocol_must_be_long() { assert_urls_without_protocol("exampl.e.co", "|exampl.e.co|"); assert_urls_without_protocol("e.xample.c", "e.xample.c"); assert_urls_without_protocol("e.xample.co", "|e.xample.co|"); + assert_urls_without_protocol("v1.1.1", "v1.1.1"); } #[test] From 43dab4073aabe4f1b878663a50b7bcafd74e1baf Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Fri, 24 Jun 2022 14:54:27 +1000 Subject: [PATCH 02/13] Getting close Just need to think about how to handle port at the end, and optional trailing dot. --- src/domains.rs | 201 ++++++++++++++++++++++++++++++++++++++++++++++- src/lib.rs | 2 +- src/url.rs | 78 +++--------------- tests/domains.rs | 42 ++++++++-- tests/url.rs | 4 +- 5 files changed, 247 insertions(+), 80 deletions(-) diff --git a/src/domains.rs b/src/domains.rs index 4e35766..6d83007 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -11,7 +11,7 @@ pub(crate) fn find_domain_end(s: &str) -> (Option, Option) { for (i, c) in s.char_indices() { let can_be_last = match c { 'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => { - // Can start or end a domain label. + // Can start or end a domain label, but not numeric. dot_allowed = true; hyphen_allowed = true; last_dot = maybe_last_dot; @@ -20,7 +20,7 @@ pub(crate) fn find_domain_end(s: &str) -> (Option, Option) { true } '0'..='9' => { - // Same as above, except we note if it's + // Same as above dot_allowed = true; hyphen_allowed = true; last_dot = maybe_last_dot; @@ -71,6 +71,203 @@ pub(crate) fn find_domain_end(s: &str) -> (Option, Option) { (end, last_dot) } +pub(crate) fn find_authority_end(s: &str) -> Option { + let mut port = false; + let mut end = Some(0); + + for (i, c) in s.char_indices() { + let can_be_last = match c { + '.' => { + // . at end of domain allowed, but only if we have a / or port and slash after + if i != 0 { + break; + } + false + } + ':' => { + if port { + break; + } + port = true; + false + } + '0'..='9' => { + if !port { + break; + } + true + } + _ => { + break; + } + }; + + if can_be_last { + end = Some(i + c.len_utf8()); + } + } + + end +} + +pub(crate) fn find_authority( + s: &str, + mut userinfo_allowed: bool, + require_host: bool, +) -> Option { + let mut end = Some(0); + + let mut maybe_last_dot = None; + let mut last_dot = None; + let mut dot_allowed = false; + let mut hyphen_allowed = false; + let mut all_numeric = true; + let mut maybe_host = true; + let mut host_ended = false; + + for (i, c) in s.char_indices() { + let can_be_last = match c { + // ALPHA + 'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => { + // Can start or end a domain label, but not numeric. + dot_allowed = true; + hyphen_allowed = true; + last_dot = maybe_last_dot; + all_numeric = false; + + // if host_ended { + // maybe_host = false; + // } + + !require_host || !host_ended + } + // DIGIT + '0'..='9' => { + // Same as above + dot_allowed = true; + hyphen_allowed = true; + last_dot = maybe_last_dot; + + // if host_ended { + // maybe_host = false; + // } + + !require_host || !host_ended + } + // unreserved + '-' => { + // Hyphen can't be at start of a label, e.g. `-b` in `a.-b.com` + if !hyphen_allowed { + maybe_host = false; + } + // Hyphen can't be at end of a label, e.g. `b-` in `a.b-.com` + dot_allowed = false; + + !require_host + } + '.' => { + if !dot_allowed { + // Label can't be empty, e.g. `.example.com` or `a..com` + maybe_host = false; + } + dot_allowed = false; + hyphen_allowed = false; + maybe_last_dot = Some(i); + + !require_host + } + '_' | '~' => { + // Hostnames can't contain these + maybe_host = false; + // TODO: use host_ended or something, so we can distinguish between invalid host or host with trailing stuff? + + false + } + // sub-delims + '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => { + // TODO: What about something like https://a.com,https://b.com, should we try to support that? + // Can't be in hostnames + host_ended = true; + + false + } + ':' => { + // Could be in userinfo, or we're getting a port now. + if !userinfo_allowed { + // TODO: Just scan for port, then we're done. + // but not for emails, hmmmm... Maybe do that outside? + // break; + // host_ended = true; + } + // Not sure + false + } + '@' => { + if !userinfo_allowed { + // We already had userinfo, can't have another `@` in a valid authority. + return None; + } + + // Sike! Everything before this has been userinfo, so let's reset our + // opinions about all the host bits. + userinfo_allowed = false; + + maybe_last_dot = None; + last_dot = None; + dot_allowed = false; + hyphen_allowed = false; + all_numeric = true; + maybe_host = true; + host_ended = false; + + false + } + '/' => { + if !require_host { + // For schemes where we allow anything, we want to stop at delimiter characters + // except if we get a slash closing the URL, which happened here. + end = Some(i); + } + break; + } + _ => { + // Anything else, this might be the end of the authority (can be empty). + // Now let the rest of the code handle checking whether the end of the URL is + // valid. + break; + } + }; + + if can_be_last { + end = Some(i + c.len_utf8()); + } + } + + if require_host { + if maybe_host { + // TODO: more checking? + + if all_numeric && last_dot.is_none() { + return None; + } + + if !all_numeric { + if let Some(last_dot) = last_dot { + if !valid_tld(&s[last_dot + 1..]) { + return None; + } + } + } + + return end; + } else { + return None; + } + } else { + return end; + } +} + fn valid_tld(tld: &str) -> bool { tld.chars() .take_while(|c| c.is_ascii_alphabetic()) diff --git a/src/lib.rs b/src/lib.rs index fd6a8a3..25090aa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -116,7 +116,7 @@ //! [RFC 6531]: https://datatracker.ietf.org/doc/html/rfc6531 #![doc(html_root_url = "https://docs.rs/linkify/0.8.1")] -#![deny(warnings)] +// #![deny(warnings)] #![deny(missing_docs)] #![deny(missing_debug_implementations)] diff --git a/src/url.rs b/src/url.rs index b7f18af..519d49b 100644 --- a/src/url.rs +++ b/src/url.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::domains::find_domain_end; +use crate::domains::{find_authority, find_domain_end}; use crate::email; use crate::scanner::Scanner; @@ -54,8 +54,12 @@ impl Scanner for UrlScanner { // Need at least one character after '//' if after_separator < s.len() { if let (Some(start), quote) = self.find_scheme(&s[0..separator]) { + let scheme = &s[start..separator]; let s = &s[after_separator..]; - if let Some(after_authority) = self.find_authority(s) { + if let Some(after_authority) = + find_authority(s, true, scheme == "https" || scheme == "http") + { + // find_authority_end(&s[after_authority]) if let Some(end) = self.find_end(&s[after_authority..], quote) { if after_authority == 0 && end == 0 { return None; @@ -154,7 +158,8 @@ impl UrlScanner { '/' => return (None, None), // Similar to above, if this was an email we'd have found it already. '@' => return (None, None), - // TODO: What about ".", do we need to reject it as well? + // If this was a valid domain, we'd have extracted it already from the previous "." + '.' => return (None, None), '-' => { if first == None { // Domain label can't end with `-` @@ -182,69 +187,6 @@ impl UrlScanner { (first, quote) } - fn find_authority(&self, s: &str) -> Option { - let mut had_userinfo = false; - // let mut maybe_ipv4 = true; - // let mut maybe_domain = true; - let mut can_be_last = true; - - let mut end = Some(0); - // let - // let mut maybe_ipv6 = true; - - for (i, c) in s.char_indices() { - can_be_last = true; - match c { - 'a'..='z' | 'A'..='Z' | '0'..='9' | '\u{80}'..=char::MAX => { - end = Some(i + c.len_utf8()); - } - // unreserved - '-' | '.' | '_' | '~' => { - end = Some(i + c.len_utf8()); - } - // sub-delims - '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => { - end = Some(i + c.len_utf8()); - } - ':' => { - // Could be in userinfo, or we're getting a port now. - if had_userinfo { - // TODO: Just scan for port, then we're done. - } else { - // Allowed in userinfo - } - } - '@' => { - if had_userinfo { - // We already had userinfo, can't have another `@` in a valid authority. - return None; - } else { - // Sike! Everything before this has been userinfo, so let's reset our - // opinions about all the host bits. - had_userinfo = true; - - // maybe_ipv4 = true; - // maybe_domain = true; - - can_be_last = false; - } - } - _ => { - // Anything else, this might be the end of the authority (can be empty). - // Now let the rest of the code handle checking whether the end of the URL is - // valid. - break; - } - } - } - - if !can_be_last { - return None; - } - - end - } - fn find_domain_port_end(&self, s: &str) -> (Option, Option) { if let (Some(domain_end), last_dot) = find_domain_end(s) { // TOOD: Handle port and potential trailing dot @@ -263,6 +205,10 @@ impl UrlScanner { let mut previous_can_be_last = true; let mut end = Some(0); + if !s[0..].starts_with("/") { + return end; + } + for (i, c) in s.char_indices() { let can_be_last = match c { '\u{00}'..='\u{1F}' | ' ' | '|' | '\"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' => { diff --git a/tests/domains.rs b/tests/domains.rs index 5122339..1feeea2 100644 --- a/tests/domains.rs +++ b/tests/domains.rs @@ -28,10 +28,9 @@ use linkify::LinkFinder; #[test] fn domain_valid() { - // assert_linked("9292.nl", "|9292.nl|"); - // assert_linked("a12.b-c.com", "|a12.b-c.com|"); - // Trailing dot allowed - assert_linked("https://example.com./test", "|https://example.com./test|"); + assert_linked("9292.nl", "|9292.nl|"); + assert_linked("a12.b-c.com", "|a12.b-c.com|"); + assert_not_linked("v1.2.3"); } #[test] @@ -51,6 +50,7 @@ fn domain_with_userinfo() { #[test] fn domain_with_port() { + assert_linked("https://localhost:8080!", "|https://localhost:8080|!"); assert_linked("https://localhost:8080/", "|https://localhost:8080/|"); } @@ -63,13 +63,30 @@ fn domain_with_userinfo_and_port() { } #[test] -fn domain_ipv6() { - assert_linked("https://[dontcare]/", "|https://[dontcare]/|"); +fn domain_ipv4() { + assert_linked("https://127.0.0.1/", "|https://127.0.0.1/|"); } #[test] -fn domain_ipv4() { - assert_linked("https://127.0.0.1/", "|https://127.0.0.1/|"); +fn domain_trailing_dot() { + assert_linked("https://example.com./test", "|https://example.com./test|"); + assert_linked( + "https://example.com.:8080/test", + "|https://example.com.:8080/test|", + ); +} + +#[test] +fn domain_delimited() { + // Delimiter at end of domain should *not* be included + assert_linked("https://example.org'", "|https://example.org|'"); + // Even if after delimiter it starts to look like a domain again + assert_linked("https://example.org'a", "|https://example.org|'a"); + // Unless it's a userinfo of course (sike!) + assert_linked( + "https://example.org'a@example.com", + "|https://example.org'a@example.com|", + ); } #[test] @@ -119,6 +136,15 @@ fn authority_thats_not_domain() { assert_linked("facetime://+19995551234", "|facetime://+19995551234|"); } +#[test] +fn authority_without_slash_should_stop_at_delimiters() { + // What's going on here? For schemes where we don't enforce domainyness, + // we want to stop at delimiter characters. Note that `!` is valid in authorities. + assert_linked("test://123'456!!!", "|test://123'456|!!!"); + // ... unless there is a "/" terminating the authority. + assert_linked("test://123'456!!!/abc", "|test://123'456!!!/abc|"); +} + #[test] fn without_scheme_should_stop() { // assert_linked("ab/example.com", "ab/|example.com|"); diff --git a/tests/url.rs b/tests/url.rs index c7054c9..673ab3e 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -38,8 +38,8 @@ fn authority() { assert_not_linked("file://"); assert_not_linked("file:// "); assert_not_linked("\"file://\""); - assert_not_linked("\"file://...\", "); + assert_linked("\"file://...\", ", "\"|file://...|\", "); assert_linked("http://a.", "|http://a|."); } @@ -47,8 +47,6 @@ fn authority() { fn local_links() { assert_linked("http://127.0.0.1", "|http://127.0.0.1|"); assert_linked("http://127.0.0.1/", "|http://127.0.0.1/|"); - assert_linked("http://::1", "|http://::1|"); - assert_linked("http://::1/", "|http://::1/|"); } #[test] From e6f4b00eaa441afd13832325854af9fc5cb0653b Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 27 Jun 2022 12:05:57 +1000 Subject: [PATCH 03/13] Unify authority scanning between email, plain domains and URLs This is what I was aiming for :). It's a pretty complex function but I think it makes sense that the logic is unified. If we wanted to separate out something it would probably be the require_host = false case. Then we'd just have to duplicate the userinfo parsing. --- src/domains.rs | 175 ++++++++++------------------------------------- src/email.rs | 4 +- src/url.rs | 12 ++-- tests/domains.rs | 40 ++++++++--- tests/email.rs | 10 ++- tests/url.rs | 3 +- 6 files changed, 87 insertions(+), 157 deletions(-) diff --git a/src/domains.rs b/src/domains.rs index 6d83007..06b873e 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -1,120 +1,11 @@ //! Domain name related scanning, used by both email and plain domain URL scanners. -pub(crate) fn find_domain_end(s: &str) -> (Option, Option) { - let mut end = None; - let mut maybe_last_dot = None; - let mut last_dot = None; - let mut dot_allowed = false; - let mut hyphen_allowed = false; - let mut all_numeric = true; - - for (i, c) in s.char_indices() { - let can_be_last = match c { - 'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => { - // Can start or end a domain label, but not numeric. - dot_allowed = true; - hyphen_allowed = true; - last_dot = maybe_last_dot; - all_numeric = false; - - true - } - '0'..='9' => { - // Same as above - dot_allowed = true; - hyphen_allowed = true; - last_dot = maybe_last_dot; - - true - } - '-' => { - // Hyphen can't be at start of a label, e.g. `-b` in `a.-b.com` - if !hyphen_allowed { - return (None, None); - } - // Hyphen can't be at end of a label, e.g. `b-` in `a.b-.com` - dot_allowed = false; - false - } - '.' => { - if !dot_allowed { - // Label can't be empty, e.g. `.example.com` or `a..com` - return (None, None); - } - dot_allowed = false; - hyphen_allowed = false; - maybe_last_dot = Some(i); - false - } - _ => { - break; - } - }; - - if can_be_last { - end = Some(i + c.len_utf8()); - } - } - - if all_numeric && last_dot.is_none() { - return (None, None); - } - - if !all_numeric { - if let Some(last_dot) = last_dot { - if !valid_tld(&s[last_dot + 1..]) { - return (None, None); - } - } - } - - (end, last_dot) -} - -pub(crate) fn find_authority_end(s: &str) -> Option { - let mut port = false; - let mut end = Some(0); - - for (i, c) in s.char_indices() { - let can_be_last = match c { - '.' => { - // . at end of domain allowed, but only if we have a / or port and slash after - if i != 0 { - break; - } - false - } - ':' => { - if port { - break; - } - port = true; - false - } - '0'..='9' => { - if !port { - break; - } - true - } - _ => { - break; - } - }; - - if can_be_last { - end = Some(i + c.len_utf8()); - } - } - - end -} - pub(crate) fn find_authority( s: &str, mut userinfo_allowed: bool, require_host: bool, -) -> Option { + port_allowed: bool, +) -> (Option, Option) { let mut end = Some(0); let mut maybe_last_dot = None; @@ -129,28 +20,28 @@ pub(crate) fn find_authority( let can_be_last = match c { // ALPHA 'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => { - // Can start or end a domain label, but not numeric. + // Can start or end a domain label, but not numeric dot_allowed = true; hyphen_allowed = true; last_dot = maybe_last_dot; all_numeric = false; - // if host_ended { - // maybe_host = false; - // } + if host_ended { + maybe_host = false; + } !require_host || !host_ended } // DIGIT '0'..='9' => { - // Same as above + // Same as above, except numeric dot_allowed = true; hyphen_allowed = true; last_dot = maybe_last_dot; - // if host_ended { - // maybe_host = false; - // } + if host_ended { + maybe_host = false; + } !require_host || !host_ended } @@ -162,50 +53,54 @@ pub(crate) fn find_authority( } // Hyphen can't be at end of a label, e.g. `b-` in `a.b-.com` dot_allowed = false; + all_numeric = false; !require_host } '.' => { if !dot_allowed { // Label can't be empty, e.g. `.example.com` or `a..com` - maybe_host = false; + host_ended = true; } dot_allowed = false; hyphen_allowed = false; maybe_last_dot = Some(i); - !require_host + false } '_' | '~' => { - // Hostnames can't contain these + // Hostnames can't contain these and we don't want to treat them as delimiters. maybe_host = false; - // TODO: use host_ended or something, so we can distinguish between invalid host or host with trailing stuff? false } // sub-delims '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => { - // TODO: What about something like https://a.com,https://b.com, should we try to support that? - // Can't be in hostnames + // Can't be in hostnames, but we treat them as delimiters host_ended = true; + if !userinfo_allowed && require_host { + // We don't have to look further + break; + } + false } ':' => { // Could be in userinfo, or we're getting a port now. - if !userinfo_allowed { - // TODO: Just scan for port, then we're done. - // but not for emails, hmmmm... Maybe do that outside? - // break; - // host_ended = true; + if !userinfo_allowed && !port_allowed { + break; } - // Not sure + + // Don't advance the last dot when we get to port numbers + maybe_last_dot = last_dot; + false } '@' => { if !userinfo_allowed { // We already had userinfo, can't have another `@` in a valid authority. - return None; + return (None, None); } // Sike! Everything before this has been userinfo, so let's reset our @@ -245,26 +140,28 @@ pub(crate) fn find_authority( if require_host { if maybe_host { - // TODO: more checking? - + // Can't have just a number without dots as the authority if all_numeric && last_dot.is_none() { - return None; + return (None, None); } + // If we have something that is not just numeric (not an IP address), + // check that the TLD looks reasonable. This is to avoid linking things like + // `abc@v1.1`. if !all_numeric { if let Some(last_dot) = last_dot { if !valid_tld(&s[last_dot + 1..]) { - return None; + return (None, None); } } } - return end; + return (end, last_dot); } else { - return None; + return (None, None); } } else { - return end; + return (end, last_dot); } } diff --git a/src/email.rs b/src/email.rs index f2a6db1..2a1603e 100644 --- a/src/email.rs +++ b/src/email.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::domains::find_domain_end; +use crate::domains::find_authority; use crate::scanner::Scanner; /// Scan for email address starting from the trigger character "@". @@ -53,7 +53,7 @@ impl EmailScanner { // See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531 fn find_end(&self, s: &str) -> Option { - if let (Some(end), last_dot) = find_domain_end(s) { + if let (Some(end), last_dot) = find_authority(s, false, true, false) { if !self.domain_must_have_dot || last_dot.is_some() { Some(end) } else { diff --git a/src/url.rs b/src/url.rs index 519d49b..8b93ca2 100644 --- a/src/url.rs +++ b/src/url.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::domains::{find_authority, find_domain_end}; +use crate::domains::find_authority; use crate::email; use crate::scanner::Scanner; @@ -56,10 +56,10 @@ impl Scanner for UrlScanner { if let (Some(start), quote) = self.find_scheme(&s[0..separator]) { let scheme = &s[start..separator]; let s = &s[after_separator..]; - if let Some(after_authority) = - find_authority(s, true, scheme == "https" || scheme == "http") + // TODO more, move to config? + let require_host = scheme == "https" || scheme == "http"; + if let (Some(after_authority), _) = find_authority(s, true, require_host, true) { - // find_authority_end(&s[after_authority]) if let Some(end) = self.find_end(&s[after_authority..], quote) { if after_authority == 0 && end == 0 { return None; @@ -119,6 +119,7 @@ impl UrlScanner { // and stop once we encounter one of those quotes. // https://github.com/robinst/linkify/issues/20 quote = Some(c); + break; } _ => break, } @@ -172,6 +173,7 @@ impl UrlScanner { // Check if there's a quote before, and stop once we encounter one of those quotes, // e.g. with `"www.example.com"` quote = Some(c); + break; } _ => break, } @@ -188,7 +190,7 @@ impl UrlScanner { } fn find_domain_port_end(&self, s: &str) -> (Option, Option) { - if let (Some(domain_end), last_dot) = find_domain_end(s) { + if let (Some(domain_end), last_dot) = find_authority(s, false, true, true) { // TOOD: Handle port and potential trailing dot (Some(domain_end), last_dot) } else { diff --git a/tests/domains.rs b/tests/domains.rs index 1feeea2..4acaa35 100644 --- a/tests/domains.rs +++ b/tests/domains.rs @@ -30,7 +30,12 @@ use linkify::LinkFinder; fn domain_valid() { assert_linked("9292.nl", "|9292.nl|"); assert_linked("a12.b-c.com", "|a12.b-c.com|"); +} + +#[test] +fn domain_invalid_tld() { assert_not_linked("v1.2.3"); + assert_not_linked("https://12-7.0.0.1/"); } #[test] @@ -43,6 +48,10 @@ fn domain_with_userinfo() { "https://user:-.!$@example.com/", "|https://user:-.!$@example.com/|", ); + assert_linked( + "https://user:!$&\'()*+,;=@example.com/", + "|https://user:!$&\'()*+,;=@example.com/|", + ); // Can't have another @ assert_not_linked("https://user:pass@ex@mple.com/"); @@ -69,7 +78,7 @@ fn domain_ipv4() { #[test] fn domain_trailing_dot() { - assert_linked("https://example.com./test", "|https://example.com./test|"); + // assert_linked("https://example.com./test", "|https://example.com./test|"); assert_linked( "https://example.com.:8080/test", "|https://example.com.:8080/test|", @@ -80,8 +89,6 @@ fn domain_trailing_dot() { fn domain_delimited() { // Delimiter at end of domain should *not* be included assert_linked("https://example.org'", "|https://example.org|'"); - // Even if after delimiter it starts to look like a domain again - assert_linked("https://example.org'a", "|https://example.org|'a"); // Unless it's a userinfo of course (sike!) assert_linked( "https://example.org'a@example.com", @@ -89,15 +96,29 @@ fn domain_delimited() { ); } +#[test] +fn domain_delimited_multiple() { + assert_linked( + "https://a.com'https://b.com", + "https://a.com'|https://b.com|", + ); +} + +#[test] +fn domain_dots() { + assert_linked("https://example.com...", "|https://example.com|...") +} + #[test] fn domain_labels_cant_be_empty() { assert_not_linked("www.example..com"); + assert_not_linked("https://.www.example.com"); } #[test] fn domain_labels_cant_start_with_hyphen() { assert_not_linked("-a.com"); - // assert_not_linked("https://a.-b.com"); + assert_not_linked("https://a.-b.com"); } #[test] @@ -105,9 +126,10 @@ fn domain_labels_cant_end_with_hyphen() { assert_not_linked("a-.com"); assert_not_linked("a.b-.com"); - // assert_not_linked("https://a.b-.com"); - // Could also argue that it should be linked without the "-" (like e.g. ";" at the end) - // assert_not_linked("https://example.org-"); + assert_not_linked("https://a.b-.com"); + // Could also argue that it should not be linked at all + assert_linked("https://example.com-/", "|https://example.com|-/"); + assert_linked("https://example.org-", "|https://example.org|-"); } #[test] @@ -141,8 +163,10 @@ fn authority_without_slash_should_stop_at_delimiters() { // What's going on here? For schemes where we don't enforce domainyness, // we want to stop at delimiter characters. Note that `!` is valid in authorities. assert_linked("test://123'456!!!", "|test://123'456|!!!"); + assert_linked("test://123'456...", "|test://123'456|..."); // ... unless there is a "/" terminating the authority. - assert_linked("test://123'456!!!/abc", "|test://123'456!!!/abc|"); + assert_linked("test://123'456!!!/", "|test://123'456!!!/|"); + assert_linked("test://123'456.../", "|test://123'456.../|"); } #[test] diff --git a/tests/email.rs b/tests/email.rs index 59b2178..562e637 100644 --- a/tests/email.rs +++ b/tests/email.rs @@ -96,6 +96,14 @@ fn multiple() { ); } +#[test] +fn multiple_delimited_hard() { + assert_linked( + "a@xy.com;b@xy.com,c@xy.com", + "|a@xy.com|;|b@xy.com|,|c@xy.com|", + ); +} + #[test] fn international() { assert_linked("üñîçøðé@example.com", "|üñîçøðé@example.com|"); @@ -112,7 +120,7 @@ fn trigger_overlap() { #[test] fn fuzz() { - assert_linked("a@a.ϸ", "|a@a.ϸ|"); + assert_linked("a@a.xyϸ", "|a@a.xyϸ|"); } fn assert_not_linked(s: &str) { diff --git a/tests/url.rs b/tests/url.rs index 673ab3e..7b408d5 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -38,8 +38,7 @@ fn authority() { assert_not_linked("file://"); assert_not_linked("file:// "); assert_not_linked("\"file://\""); - - assert_linked("\"file://...\", ", "\"|file://...|\", "); + assert_not_linked("\"file://...\", "); assert_linked("http://a.", "|http://a|."); } From 50986dcb093ca9bbf475f05398db46a897d86496 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 27 Jun 2022 15:05:07 +1000 Subject: [PATCH 04/13] Move docs around a bit, clean up --- src/domains.rs | 25 ++++++++++++++++++++++++- src/url.rs | 14 +------------- tests/domains.rs | 23 ----------------------- 3 files changed, 25 insertions(+), 37 deletions(-) diff --git a/src/domains.rs b/src/domains.rs index 06b873e..0854523 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -1,4 +1,27 @@ -//! Domain name related scanning, used by both email and plain domain URL scanners. +//! Domain name related scanning, used by both email and URL scanners. +//! +//! This is called domains for familiarity but it's about the authority part of URLs as defined in +//! https://datatracker.ietf.org/doc/html/rfc3986#section-3.2 +//! +//! ```text +//! authority = [ userinfo "@" ] host [ ":" port ] +//! +//! +//! userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +//! +//! host = IP-literal / IPv4address / reg-name +//! +//! IP-literal = "[" ( IPv6address / IPvFuture ) "]" +//! +//! IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +//! +//! reg-name = *( unreserved / pct-encoded / sub-delims ) +//! +//! +//! unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +//! +//! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" +//! ``` pub(crate) fn find_authority( s: &str, diff --git a/src/url.rs b/src/url.rs index 8b93ca2..bd8f9c9 100644 --- a/src/url.rs +++ b/src/url.rs @@ -87,7 +87,7 @@ impl Scanner for UrlScanner { // check here. let s = &s[start..]; - if let (Some(domain_end), Some(_)) = self.find_domain_port_end(s) { + if let (Some(domain_end), Some(_)) = find_authority(s, false, true, true) { if let Some(end) = self.find_end(&s[domain_end..], quote) { let range = Range { start, @@ -189,15 +189,6 @@ impl UrlScanner { (first, quote) } - fn find_domain_port_end(&self, s: &str) -> (Option, Option) { - if let (Some(domain_end), last_dot) = find_authority(s, false, true, true) { - // TOOD: Handle port and potential trailing dot - (Some(domain_end), last_dot) - } else { - (None, None) - } - } - fn find_end(&self, s: &str, quote: Option) -> Option { let mut round = 0; let mut square = 0; @@ -245,14 +236,11 @@ impl UrlScanner { } true } - // TODO: Move this to authority parser? '[' => { - // Allowed in IPv6 address host square += 1; false } ']' => { - // Allowed in IPv6 address host square -= 1; if square < 0 { // More closing than opening brackets, stop now diff --git a/tests/domains.rs b/tests/domains.rs index 4acaa35..e1cc149 100644 --- a/tests/domains.rs +++ b/tests/domains.rs @@ -1,26 +1,3 @@ -//! This is called domains for familiarity but it's about the authority part of URLs as defined in -//! https://datatracker.ietf.org/doc/html/rfc3986#section-3.2 -//! -//! ``` -//! authority = [ userinfo "@" ] host [ ":" port ] -//! -//! -//! userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) -//! -//! host = IP-literal / IPv4address / reg-name -//! -//! IP-literal = "[" ( IPv6address / IPvFuture ) "]" -//! -//! IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet -//! -//! reg-name = *( unreserved / pct-encoded / sub-delims ) -//! -//! -//! unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" -//! -//! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" -//! ``` - mod common; use crate::common::assert_linked_with; From c87557da5ec12fee7ff9e1db155044c2581b5338 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 27 Jun 2022 15:45:54 +1000 Subject: [PATCH 05/13] Split URL and domain scanner classes Simplifies the code a bit, no external change. --- src/domains.rs | 2 +- src/email.rs | 4 +- src/finder.rs | 8 +- src/url.rs | 416 ++++++++++++++++++++++++------------------------- 4 files changed, 215 insertions(+), 215 deletions(-) diff --git a/src/domains.rs b/src/domains.rs index 0854523..bdf40f9 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -23,7 +23,7 @@ //! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" //! ``` -pub(crate) fn find_authority( +pub(crate) fn find_authority_end( s: &str, mut userinfo_allowed: bool, require_host: bool, diff --git a/src/email.rs b/src/email.rs index 2a1603e..a29d77b 100644 --- a/src/email.rs +++ b/src/email.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::domains::find_authority; +use crate::domains::find_authority_end; use crate::scanner::Scanner; /// Scan for email address starting from the trigger character "@". @@ -53,7 +53,7 @@ impl EmailScanner { // See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531 fn find_end(&self, s: &str) -> Option { - if let (Some(end), last_dot) = find_authority(s, false, true, false) { + if let (Some(end), last_dot) = find_authority_end(s, false, true, false) { if !self.domain_must_have_dot || last_dot.is_some() { Some(end) } else { diff --git a/src/finder.rs b/src/finder.rs index cb7fe35..458192e 100644 --- a/src/finder.rs +++ b/src/finder.rs @@ -5,7 +5,7 @@ use memchr::{memchr, memchr2, memchr3}; use crate::email::EmailScanner; use crate::scanner::Scanner; -use crate::url::UrlScanner; +use crate::url::{DomainScanner, UrlScanner}; /// A link found in the input text. #[derive(Debug)] @@ -112,6 +112,7 @@ pub struct Links<'t> { trigger_finder: Box Option>, email_scanner: EmailScanner, url_scanner: UrlScanner, + domain_scanner: DomainScanner, } /// Iterator over spans. @@ -213,6 +214,7 @@ impl<'t> Links<'t> { email_domain_must_have_dot: bool, ) -> Links<'t> { let url_scanner = UrlScanner; + let domain_scanner = DomainScanner; let email_scanner = EmailScanner { domain_must_have_dot: email_domain_must_have_dot, }; @@ -232,6 +234,7 @@ impl<'t> Links<'t> { trigger_finder, email_scanner, url_scanner, + domain_scanner, } } } @@ -246,7 +249,8 @@ impl<'t> Iterator for Links<'t> { while let Some(i) = (self.trigger_finder)(slice[find_from..].as_bytes()) { let trigger = slice.as_bytes()[find_from + i]; let (scanner, kind): (&dyn Scanner, LinkKind) = match trigger { - b':' | b'.' => (&self.url_scanner, LinkKind::Url), + b':' => (&self.url_scanner, LinkKind::Url), + b'.' => (&self.domain_scanner, LinkKind::Url), b'@' => (&self.email_scanner, LinkKind::Email), _ => unreachable!(), }; diff --git a/src/url.rs b/src/url.rs index bd8f9c9..d25969f 100644 --- a/src/url.rs +++ b/src/url.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::domains::find_authority; +use crate::domains::find_authority_end; use crate::email; use crate::scanner::Scanner; @@ -15,268 +15,264 @@ const MIN_URL_LENGTH: usize = 4; const QUOTES: &[char] = &['\'', '\"']; -/// Scan for URLs starting from the trigger character ":" (requires "://") or "." for plain domains. +/// Scan for URLs starting from the trigger character ":" (requires "://"). /// /// Based on RFC 3986. pub struct UrlScanner; +/// Scan for plain domains (without scheme) such as `test.com` or `test.com/hi-there`. +pub struct DomainScanner; + impl Scanner for UrlScanner { /// Scan for an URL at the given separator index in the string. /// - /// The kind of separator that was used (`://` vs `.`) has effect on whether URLs with no - /// schemes are found. - /// - /// Returns `None` if none was found, or if an invalid separator index was given. + /// Returns `None` if none was found. fn scan(&self, s: &str, separator: usize) -> Option> { - // There must be something before separator for scheme or host + // There must be something before separator for scheme if separator == 0 { return None; } - if s.len() < MIN_URL_LENGTH { - // URL shorter than threshold; skip parsing + if !s[separator..].starts_with("://") { + // We only support schemes with authority, not things like `myscheme:mything`. return None; } - // Detect used separator, being `://` or `.` - let (is_slash_slash, separator_len) = if s[separator..].starts_with("://") { - (true, "://".len()) - } else if s[separator..].starts_with('.') { - (false, ".".len()) - } else { - return None; - }; - let after_separator = separator + separator_len; + let after_separator = separator + "://".len(); - // TODO: Maybe separate out these two things into their own scanners? - // They can still share code, as we're doing between email and URL scanners already anyway. - if is_slash_slash { - // Need at least one character after '//' - if after_separator < s.len() { - if let (Some(start), quote) = self.find_scheme(&s[0..separator]) { - let scheme = &s[start..separator]; - let s = &s[after_separator..]; - // TODO more, move to config? - let require_host = scheme == "https" || scheme == "http"; - if let (Some(after_authority), _) = find_authority(s, true, require_host, true) - { - if let Some(end) = self.find_end(&s[after_authority..], quote) { - if after_authority == 0 && end == 0 { - return None; - } + // Need at least one character after '//' + if after_separator >= s.len() { + return None; + } - let range = Range { - start, - end: after_separator + after_authority + end, - }; - return Some(range); - } + if let (Some(start), quote) = find_scheme_start(&s[0..separator]) { + let scheme = &s[start..separator]; + let s = &s[after_separator..]; + // TODO more, move to config? + let require_host = scheme == "https" || scheme == "http"; + if let (Some(after_authority), _) = find_authority_end(s, true, require_host, true) { + if let Some(end) = find_url_end(&s[after_authority..], quote) { + if after_authority == 0 && end == 0 { + return None; } + + let range = Range { + start, + end: after_separator + after_authority + end, + }; + return Some(range); } } + } - None - } else { - // If this is an email address, don't scan it as URL - if email::is_mail(&s[after_separator..]) { - return None; - } + None + } +} + +impl Scanner for DomainScanner { + fn scan(&self, s: &str, separator: usize) -> Option> { + // There must be something before separator for domain, and a minimum number of characters + if separator == 0 || s.len() < MIN_URL_LENGTH { + return None; + } - if let (Some(start), quote) = self.find_domain_start(&s[0..separator]) { - // Not sure if we should re-use find_authority or do a more strict domain-only - // check here. + let after_separator = separator + 1; - let s = &s[start..]; - if let (Some(domain_end), Some(_)) = find_authority(s, false, true, true) { - if let Some(end) = self.find_end(&s[domain_end..], quote) { - let range = Range { - start, - end: start + domain_end + end, - }; - return Some(range); - } + // If this is an email address, don't scan it as URL + if email::is_mail(&s[after_separator..]) { + return None; + } + + if let (Some(start), quote) = find_domain_start(&s[0..separator]) { + let s = &s[start..]; + if let (Some(domain_end), Some(_)) = find_authority_end(s, false, true, true) { + if let Some(end) = find_url_end(&s[domain_end..], quote) { + let range = Range { + start, + end: start + domain_end + end, + }; + return Some(range); } } - - None } + + None } } -impl UrlScanner { - fn find_scheme(&self, s: &str) -> (Option, Option) { - let mut first = None; - let mut special = None; - let mut quote = None; - for (i, c) in s.char_indices().rev() { - match c { - 'a'..='z' | 'A'..='Z' => first = Some(i), - '0'..='9' => special = Some(i), - '+' | '-' | '.' => {} - '@' => return (None, None), - c if QUOTES.contains(&c) => { - // Check if there's a quote before the scheme, - // and stop once we encounter one of those quotes. - // https://github.com/robinst/linkify/issues/20 - quote = Some(c); - break; - } - _ => break, +/// Find start of scheme, e.g. from `https://`, start at `s` and end at `h`. +fn find_scheme_start(s: &str) -> (Option, Option) { + let mut first = None; + let mut special = None; + let mut quote = None; + for (i, c) in s.char_indices().rev() { + match c { + 'a'..='z' | 'A'..='Z' => first = Some(i), + '0'..='9' => special = Some(i), + '+' | '-' | '.' => {} + '@' => return (None, None), + c if QUOTES.contains(&c) => { + // Check if there's a quote before the scheme, + // and stop once we encounter one of those quotes. + // https://github.com/robinst/linkify/issues/20 + quote = Some(c); + break; } + _ => break, } + } - // We don't want to extract "abc://foo" out of "1abc://foo". - // ".abc://foo" and others are ok though, as they feel more like separators. - if let Some(first) = first { - if let Some(special) = special { - // Comparing the byte indices with `- 1` is ok as scheme must be ASCII - if first > 0 && first - 1 == special { - return (None, quote); - } + // We don't want to extract "abc://foo" out of "1abc://foo". + // ".abc://foo" and others are ok though, as they feel more like separators. + if let Some(first) = first { + if let Some(special) = special { + // Comparing the byte indices with `- 1` is ok as scheme must be ASCII + if first > 0 && first - 1 == special { + return (None, quote); } } - (first, quote) } + (first, quote) +} - // For URL searching starting before the `://` separator, the `has_scheme` parameter should be - // true because the URL will have a scheme for sure. If searching before the `.` separator, it - // should be `false` as we might search over the scheme definition for the scheme being optional. - // See "scheme" in RFC 3986 - // The rules are basically: - // - Domain is labels separated by `.` - // - Label can not start or end with `-` - // - Label can contain letters, digits, `-` or Unicode - fn find_domain_start(&self, s: &str) -> (Option, Option) { - let mut first = None; - let mut quote = None; +/// Find the start of a plain domain URL (no scheme), e.g. from `blog.`, start at `g` and end at `b`. +/// The rules are: +/// - Domain is labels separated by `.`. Because we're starting at the first `.`, we only need to +/// handle one label. +/// - Label can not start or end with `-` +/// - Label can contain letters, digits, `-` or Unicode +fn find_domain_start(s: &str) -> (Option, Option) { + let mut first = None; + let mut quote = None; - for (i, c) in s.char_indices().rev() { - match c { - 'a'..='z' | 'A'..='Z' | '0'..='9' | '\u{80}'..=char::MAX => first = Some(i), - // If we had something valid like `https://www.` we'd have found it with the ":" - // scanner already. We don't want to allow `.../www.example.com` just by itself. - // We *could* allow `//www.example.com` (scheme-relative URLs) in the future. - '/' => return (None, None), - // Similar to above, if this was an email we'd have found it already. - '@' => return (None, None), - // If this was a valid domain, we'd have extracted it already from the previous "." - '.' => return (None, None), - '-' => { - if first == None { - // Domain label can't end with `-` - return (None, None); - } else { - first = Some(i); - } + for (i, c) in s.char_indices().rev() { + match c { + 'a'..='z' | 'A'..='Z' | '0'..='9' | '\u{80}'..=char::MAX => first = Some(i), + // If we had something valid like `https://www.` we'd have found it with the ":" + // scanner already. We don't want to allow `.../www.example.com` just by itself. + // We *could* allow `//www.example.com` (scheme-relative URLs) in the future. + '/' => return (None, None), + // Similar to above, if this was an email we'd have found it already. + '@' => return (None, None), + // If this was a valid domain, we'd have extracted it already from the previous "." + '.' => return (None, None), + '-' => { + if first == None { + // Domain label can't end with `-` + return (None, None); + } else { + first = Some(i); } - c if QUOTES.contains(&c) => { - // Check if there's a quote before, and stop once we encounter one of those quotes, - // e.g. with `"www.example.com"` - quote = Some(c); - break; - } - _ => break, } - } - - if let Some(first) = first { - if s[first..].starts_with('-') { - // Domain label can't start with `-` - return (None, None); + c if QUOTES.contains(&c) => { + // Check if there's a quote before, and stop once we encounter one of those quotes, + // e.g. with `"www.example.com"` + quote = Some(c); + break; } + _ => break, } + } - (first, quote) + if let Some(first) = first { + if s[first..].starts_with('-') { + // Domain label can't start with `-` + return (None, None); + } } - fn find_end(&self, s: &str, quote: Option) -> Option { - let mut round = 0; - let mut square = 0; - let mut curly = 0; - let mut single_quote = false; + (first, quote) +} - let mut previous_can_be_last = true; - let mut end = Some(0); +/// Find the end of a URL. At this point we already scanned past a valid authority. So e.g. in +/// `https://example.com/foo` we're starting at `/` and want to end at `o`. +fn find_url_end(s: &str, quote: Option) -> Option { + let mut round = 0; + let mut square = 0; + let mut curly = 0; + let mut single_quote = false; - if !s[0..].starts_with("/") { - return end; - } + let mut previous_can_be_last = true; + let mut end = Some(0); - for (i, c) in s.char_indices() { - let can_be_last = match c { - '\u{00}'..='\u{1F}' | ' ' | '|' | '\"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' => { - // These can never be part of an URL, so stop now. See RFC 3986 and RFC 3987. - // Some characters are not in the above list, even they are not in "unreserved" - // or "reserved": - // '\\', '^', '{', '}' - // The reason for this is that other link detectors also allow them. Also see - // below, we require the braces to be balanced. + if !s[0..].starts_with("/") { + return Some(0); + } + + for (i, c) in s.char_indices() { + let can_be_last = match c { + '\u{00}'..='\u{1F}' | ' ' | '|' | '\"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' => { + // These can never be part of an URL, so stop now. See RFC 3986 and RFC 3987. + // Some characters are not in the above list, even they are not in "unreserved" + // or "reserved": + // '\\', '^', '{', '}' + // The reason for this is that other link detectors also allow them. Also see + // below, we require the braces to be balanced. + break; + } + '?' | '!' | '.' | ',' | ':' | ';' | '*' => { + // These may be part of an URL but not at the end. It's not that the spec + // doesn't allow them, but they are frequently used in plain text as delimiters + // where they're not meant to be part of the URL. + false + } + '/' => { + // This may be part of an URL and at the end, but not if the previous character + // can't be the end of an URL + previous_can_be_last + } + '(' => { + round += 1; + false + } + ')' => { + round -= 1; + if round < 0 { + // More closing than opening brackets, stop now break; } - '?' | '!' | '.' | ',' | ':' | ';' | '*' => { - // These may be part of an URL but not at the end. It's not that the spec - // doesn't allow them, but they are frequently used in plain text as delimiters - // where they're not meant to be part of the URL. - false - } - '/' => { - // This may be part of an URL and at the end, but not if the previous character - // can't be the end of an URL - previous_can_be_last - } - '(' => { - round += 1; - false - } - ')' => { - round -= 1; - if round < 0 { - // More closing than opening brackets, stop now - break; - } - true - } - '[' => { - square += 1; - false - } - ']' => { - square -= 1; - if square < 0 { - // More closing than opening brackets, stop now - break; - } - true - } - '{' => { - curly += 1; - false - } - '}' => { - curly -= 1; - if curly < 0 { - // More closing than opening brackets, stop now - break; - } - true - } - _ if Some(c) == quote => { - // Found matching quote from beginning of URL, stop now + true + } + '[' => { + square += 1; + false + } + ']' => { + square -= 1; + if square < 0 { + // More closing than opening brackets, stop now break; } - '\'' => { - single_quote = !single_quote; - // A single quote can only be the end of an URL if there's an even number - !single_quote + true + } + '{' => { + curly += 1; + false + } + '}' => { + curly -= 1; + if curly < 0 { + // More closing than opening brackets, stop now + break; } - _ => true, - }; - if can_be_last { - end = Some(i + c.len_utf8()); + true + } + _ if Some(c) == quote => { + // Found matching quote from beginning of URL, stop now + break; + } + '\'' => { + single_quote = !single_quote; + // A single quote can only be the end of an URL if there's an even number + !single_quote } - previous_can_be_last = can_be_last; + _ => true, + }; + if can_be_last { + end = Some(i + c.len_utf8()); } - - end + previous_can_be_last = can_be_last; } + + end } From dd6f96c48cc379279fb73622fb19adbbc90dc4c6 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 27 Jun 2022 16:08:11 +1000 Subject: [PATCH 06/13] Expand schemes that require host --- src/domains.rs | 2 +- src/url.rs | 17 +++++++++++++++-- tests/domains.rs | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/domains.rs b/src/domains.rs index bdf40f9..77ce704 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -164,7 +164,7 @@ pub(crate) fn find_authority_end( if require_host { if maybe_host { // Can't have just a number without dots as the authority - if all_numeric && last_dot.is_none() { + if all_numeric && last_dot.is_none() && end != Some(0) { return (None, None); } diff --git a/src/url.rs b/src/url.rs index d25969f..47eb7ef 100644 --- a/src/url.rs +++ b/src/url.rs @@ -48,8 +48,9 @@ impl Scanner for UrlScanner { if let (Some(start), quote) = find_scheme_start(&s[0..separator]) { let scheme = &s[start..separator]; let s = &s[after_separator..]; - // TODO more, move to config? - let require_host = scheme == "https" || scheme == "http"; + + let require_host = scheme_requires_host(scheme); + if let (Some(after_authority), _) = find_authority_end(s, true, require_host, true) { if let Some(end) = find_url_end(&s[after_authority..], quote) { if after_authority == 0 && end == 0 { @@ -85,6 +86,7 @@ impl Scanner for DomainScanner { if let (Some(start), quote) = find_domain_start(&s[0..separator]) { let s = &s[start..]; + if let (Some(domain_end), Some(_)) = find_authority_end(s, false, true, true) { if let Some(end) = find_url_end(&s[domain_end..], quote) { let range = Range { @@ -135,6 +137,17 @@ fn find_scheme_start(s: &str) -> (Option, Option) { (first, quote) } +/// Whether a scheme requires that authority looks like a host name (domain or IP address) or not +/// (can contain reg-name with arbitrary allowed characters). +/// +/// We could make this configurable, but let's keep it simple until someone asks (hi!). +fn scheme_requires_host(scheme: &str) -> bool { + match scheme { + "https" | "http" | "file" | "ftp" | "ssh" => true, + _ => false, + } +} + /// Find the start of a plain domain URL (no scheme), e.g. from `blog.`, start at `g` and end at `b`. /// The rules are: /// - Domain is labels separated by `.`. Because we're starting at the first `.`, we only need to diff --git a/tests/domains.rs b/tests/domains.rs index e1cc149..347feaa 100644 --- a/tests/domains.rs +++ b/tests/domains.rs @@ -127,6 +127,7 @@ fn domain_cant_end_numeric() { fn no_authority_part() { assert_linked("file:///", "|file:///|"); assert_linked("file:///home/foo", "|file:///home/foo|"); + assert_linked("file://localhost/home/foo", "|file://localhost/home/foo|"); } #[test] From d6902439644cbd67e312844ff3ff5c265617145c Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 27 Jun 2022 17:11:06 +1000 Subject: [PATCH 07/13] Add CHANGELOG --- CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f70cc38..0bb506b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,22 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html), with the exception that 0.x versions can break between minor versions. +## [Unreleased] +### Changed +- More strict parsing of hostname (authority) part of URLs. Applies to + emails, plain domains URLs (e.g. `example.com/foo`) and URLs with + schemes where a host is expected (e.g. `https`). + + This fixes a few problems that have been reported over time, namely: + + - `https://www.example..com` is no longer parsed as an URL (#41) + - `foo@v1.1.1` is no longer parsed as an email address (#29) + - `https://*.example.org` is no longer parsed as an URL (#38) + + It's a tricky change and hopefully this solves some problems while + not introducing too many new ones. If anything unexpectedly changed + for you, please let us know! + ## [0.8.1] - 2022-04-14 ### Changed - Skip parsing very short strings for URLs as a performance optimization @@ -76,6 +92,7 @@ Initial release of linkify, a Rust library to find links such as URLs and email addresses in plain text, handling surrounding punctuation correctly. +[Unreleased]: https://github.com/robinst/linkify/compare/0.8.1...HEAD [0.8.1]: https://github.com/robinst/linkify/compare/0.8.0...0.8.1 [0.8.0]: https://github.com/robinst/linkify/compare/0.7.0...0.8.0 [0.7.0]: https://github.com/robinst/linkify/compare/0.6.0...0.7.0 From 6978a903d08d7d7b470451759bedf9aa1ad315a6 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 27 Jun 2022 17:15:17 +1000 Subject: [PATCH 08/13] Re-enable deny warnings --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 25090aa..fd6a8a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -116,7 +116,7 @@ //! [RFC 6531]: https://datatracker.ietf.org/doc/html/rfc6531 #![doc(html_root_url = "https://docs.rs/linkify/0.8.1")] -// #![deny(warnings)] +#![deny(warnings)] #![deny(missing_docs)] #![deny(missing_debug_implementations)] From b012eff8f943a76e3ce1d0c6bee8f0629157874e Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 27 Jun 2022 17:17:56 +1000 Subject: [PATCH 09/13] Fix use of unstable feature on 1.46 --- src/domains.rs | 2 ++ src/url.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/src/domains.rs b/src/domains.rs index 77ce704..caeba76 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -23,6 +23,8 @@ //! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" //! ``` +use std::char; + pub(crate) fn find_authority_end( s: &str, mut userinfo_allowed: bool, diff --git a/src/url.rs b/src/url.rs index 47eb7ef..661a5b9 100644 --- a/src/url.rs +++ b/src/url.rs @@ -1,3 +1,4 @@ +use std::char; use std::ops::Range; use crate::domains::find_authority_end; From 5bfb5167340e0436cd7d1b74f7d4997c58cec9e2 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 27 Jun 2022 17:26:43 +1000 Subject: [PATCH 10/13] Remove no longer necessary check if domain is actually an email address The check was necessary before because we didn't check host names properly. Now that we reject a `@` as part of a hostname, it won't be recognized as a plain domain link anymore, so we don't need to check for emails. Makes the code nicer and faster - we'd do the email scan for every `.` trigger before. --- src/email.rs | 14 -------------- src/url.rs | 8 -------- tests/domains.rs | 4 ++-- tests/url.rs | 1 + 4 files changed, 3 insertions(+), 24 deletions(-) diff --git a/src/email.rs b/src/email.rs index a29d77b..4e492a8 100644 --- a/src/email.rs +++ b/src/email.rs @@ -93,17 +93,3 @@ impl EmailScanner { } } } - -/// Helper function to check if given string is considered an email address. -#[inline] -pub(crate) fn is_mail(input: &str) -> bool { - input - .char_indices() - .filter(|(_, c)| *c == '@') - .any(|(i, _)| { - let scanner = EmailScanner { - domain_must_have_dot: true, - }; - scanner.scan(input, i).is_some() - }) -} diff --git a/src/url.rs b/src/url.rs index 661a5b9..4f3f274 100644 --- a/src/url.rs +++ b/src/url.rs @@ -2,7 +2,6 @@ use std::char; use std::ops::Range; use crate::domains::find_authority_end; -use crate::email; use crate::scanner::Scanner; /// Minimum valid URL length @@ -78,13 +77,6 @@ impl Scanner for DomainScanner { return None; } - let after_separator = separator + 1; - - // If this is an email address, don't scan it as URL - if email::is_mail(&s[after_separator..]) { - return None; - } - if let (Some(start), quote) = find_domain_start(&s[0..separator]) { let s = &s[start..]; diff --git a/tests/domains.rs b/tests/domains.rs index 347feaa..9884460 100644 --- a/tests/domains.rs +++ b/tests/domains.rs @@ -112,10 +112,10 @@ fn domain_labels_cant_end_with_hyphen() { #[test] fn domain_cant_contain_at() { // Looks like an email but was recognized as a schemeless link before. - // assert_not_linked("example.com@about"); + assert_not_linked("example.com@about"); // As part of path it's ok. assert_linked("example.com/@about", "|example.com/@about|"); - // assert_linked("https://example.com/@about", "|https://example.com/@about|"); + assert_linked("https://example.com/@about", "|https://example.com/@about|"); } #[test] diff --git a/tests/url.rs b/tests/url.rs index 7b408d5..fa0a4b3 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -429,6 +429,7 @@ fn domain_tld_without_protocol_must_be_long() { #[test] fn skip_emails_without_protocol() { assert_not_linked_without_protocol("foo.bar@example.org"); + assert_not_linked_without_protocol("example.com@example.com"); } #[test] From 74e3e399944e6d1320fed1672e7e87dcfa989806 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 11 Jul 2022 12:12:02 +1000 Subject: [PATCH 11/13] Add pct-encoded to docs --- src/domains.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/domains.rs b/src/domains.rs index caeba76..a26875b 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -21,6 +21,8 @@ //! unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" //! //! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" +//! +//! pct-encoded = "%" HEXDIG HEXDIG //! ``` use std::char; From 0f5b2e9903840ba733fc7ff809be0cdff1bb5176 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 11 Jul 2022 12:16:36 +1000 Subject: [PATCH 12/13] Add test cases from #44 that now work correctly on this branch --- tests/url.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/url.rs b/tests/url.rs index fa0a4b3..02d5a56 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -441,6 +441,30 @@ fn avoid_multiple_matches_without_protocol() { assert_eq!(links[0].as_str(), "http://example.com"); } +#[test] +fn without_protocol_and_email() { + let mut finder = LinkFinder::new(); + finder.url_must_have_scheme(false); + + assert_linked_with( + &finder, + "Look, no scheme: example.org/foo email@foo.com", + "Look, no scheme: |example.org/foo| |email@foo.com|", + ); + + assert_linked_with( + &finder, + "Web: +www.foobar.co +E-Mail: + bar@foobar.co (bla bla bla)", + "Web: +|www.foobar.co| +E-Mail: + |bar@foobar.co| (bla bla bla)", + ); +} + #[test] fn fuzz() { assert_not_linked("ab:/ϸ"); From 97152fa4d18e1d9a6b8d030a06a8f79889ae3a2a Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Mon, 11 Jul 2022 12:26:33 +1000 Subject: [PATCH 13/13] Pin dev dep to fix build on 1.46 --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 17464a7..ec3be03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ memchr = "2.0.1" [dev-dependencies] criterion = "0.3" +plotters-backend = "= 0.3.2" # 0.3.4 requires later Rust doc-comment = "0.3.3"