Merge pull request #43 from robinst/check-domains

More strict parsing of hostname (authority) part of URLs
robinst · Jul 11, 2022 · b6ad06e · b6ad06e
2 parents 9a6ce39 + 97152fa
commit b6ad06e
Show file tree

Hide file tree

Showing 10 changed files with 662 additions and 236 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,22 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html),
 with the exception that 0.x versions can break between minor versions.
 
+## [Unreleased]
+### Changed
+- More strict parsing of hostname (authority) part of URLs. Applies to
+  emails, plain domain URLs (e.g. `example.com/foo`) and URLs with
+  schemes where a host is expected (e.g. `https`).
+
+  This fixes a few problems that have been reported over time, namely:
+
+  - `https://www.example..com` is no longer parsed as a URL (#41)
+  - `foo@v1.1.1` is no longer parsed as an email address (#29)
+  - `https://*.example.org` is no longer parsed as a URL (#38)
+
+  It's a tricky change and hopefully this solves some problems while
+  not introducing too many new ones. If anything unexpectedly changed
+  for you, please let us know!
+
 ## [0.8.1] - 2022-04-14
 ### Changed
 - Skip parsing very short strings for URLs as a performance optimization
@@ -76,6 +92,7 @@ Initial release of linkify, a Rust library to find links such as URLs and email
 addresses in plain text, handling surrounding punctuation correctly.
 
 
+[Unreleased]: https://github.com/robinst/linkify/compare/0.8.1...HEAD
 [0.8.1]: https://github.com/robinst/linkify/compare/0.8.0...0.8.1
 [0.8.0]: https://github.com/robinst/linkify/compare/0.7.0...0.8.0
 [0.7.0]: https://github.com/robinst/linkify/compare/0.6.0...0.7.0

diff --git a/Cargo.toml b/Cargo.toml
@@ -18,6 +18,7 @@ memchr = "2.0.1"
 
 [dev-dependencies]
 criterion = "0.3"
+plotters-backend = "= 0.3.2" # 0.3.4 requires later Rust
 doc-comment = "0.3.3"
 
 

diff --git a/src/domains.rs b/src/domains.rs
@@ -0,0 +1,201 @@
+//! Domain name related scanning, used by both email and URL scanners.
+//!
+//! This is called "domains" for familiarity, but it covers the authority part of URLs as defined
+//! in <https://datatracker.ietf.org/doc/html/rfc3986#section-3.2>:
+//!
+//! ```text
+//! authority   = [ userinfo "@" ] host [ ":" port ]
+//!
+//!
+//! userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
+//!
+//! host        = IP-literal / IPv4address / reg-name
+//!
+//! IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
+//!
+//! IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+//!
+//! reg-name    = *( unreserved / pct-encoded / sub-delims )
+//!
+//!
+//! unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+//!
+//! sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+//!
+//! pct-encoded = "%" HEXDIG HEXDIG
+//! ```
+
+use std::char;
+
+/// Scans `s` from its first byte as a URL authority (`[ userinfo "@" ] host [ ":" port ]`) and
+/// reports where the authority may end.
+///
+/// # Arguments
+///
+/// * `s` - the text to scan; scanning always starts at byte offset 0.
+/// * `userinfo_allowed` - whether one `@` may still appear. The first `@` clears this flag and
+///   resets all host state; an `@` seen while the flag is `false` rejects the input.
+/// * `require_host` - when `true`, the hostname checks at the bottom of this function are applied
+///   and input failing them yields `(None, None)`. When `false`, no validation is applied and the
+///   positions found so far are returned as they are.
+/// * `port_allowed` - whether scanning may continue past a `:` when userinfo is not allowed.
+///
+/// # Returns
+///
+/// A tuple `(end, last_dot)` of byte offsets into `s`:
+///
+/// * `end` - offset just past the last character that may end the authority. It starts out as
+///   `Some(0)` and stays there if no such character was seen. When `require_host` is `false` and
+///   the scan stops at a `/`, it is the offset of that `/` instead.
+/// * `last_dot` - offset of the most recent `.` that was followed by a letter or digit, or `None`
+///   if there was none (it is reset when an `@` is consumed).
+///
+/// `(None, None)` is returned when a second `@` is found, or when `require_host` is `true` and the
+/// scanned text does not pass the hostname checks.
+pub(crate) fn find_authority_end(
+    s: &str,
+    mut userinfo_allowed: bool,
+    require_host: bool,
+    port_allowed: bool,
+) -> (Option<usize>, Option<usize>) {
+    // Candidate end offset; `Some(0)` means nothing has been accepted yet.
+    let mut end = Some(0);
+
+    // Most recent `.` seen; only promoted to `last_dot` once a letter or digit follows it.
+    let mut maybe_last_dot = None;
+    // Most recent `.` that has been confirmed by a following letter or digit.
+    let mut last_dot = None;
+    // Set by letters/digits, cleared by `-` and `.`: a `.` is only fine right after a letter/digit.
+    let mut dot_allowed = false;
+    // Set by letters/digits, cleared by `.`: a `-` must not be the first character of a label.
+    let mut hyphen_allowed = false;
+    // Cleared by any letter or hyphen; stays `true` for input made of digits and dots.
+    let mut all_numeric = true;
+    // Cleared as soon as something is seen that rules the text out as a hostname.
+    let mut maybe_host = true;
+    // Set once the host part cannot continue (empty label or a sub-delimiter was seen).
+    let mut host_ended = false;
+
+    for (i, c) in s.char_indices() {
+        // Each arm evaluates to whether `c` may be the final character of the authority.
+        let can_be_last = match c {
+            // ALPHA (any non-ASCII character is treated like a letter as well)
+            'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => {
+                // Can start or end a domain label, but not numeric
+                dot_allowed = true;
+                hyphen_allowed = true;
+                // A pending dot is now followed by a label character, so it counts.
+                last_dot = maybe_last_dot;
+                all_numeric = false;
+
+                // More label characters after the host already ended: not a valid host.
+                if host_ended {
+                    maybe_host = false;
+                }
+
+                !require_host || !host_ended
+            }
+            // DIGIT
+            '0'..='9' => {
+                // Same as above, except numeric
+                dot_allowed = true;
+                hyphen_allowed = true;
+                last_dot = maybe_last_dot;
+
+                if host_ended {
+                    maybe_host = false;
+                }
+
+                !require_host || !host_ended
+            }
+            // unreserved
+            '-' => {
+                // Hyphen can't be at start of a label, e.g. `-b` in `a.-b.com`
+                if !hyphen_allowed {
+                    maybe_host = false;
+                }
+                // Hyphen can't be at end of a label, e.g. `b-` in `a.b-.com`
+                dot_allowed = false;
+                all_numeric = false;
+
+                // A hyphen may only be the last character when no host is required.
+                !require_host
+            }
+            '.' => {
+                if !dot_allowed {
+                    // Label can't be empty, e.g. `.example.com` or `a..com`
+                    host_ended = true;
+                }
+                dot_allowed = false;
+                hyphen_allowed = false;
+                // Remember the dot, but only commit it once a letter or digit follows.
+                maybe_last_dot = Some(i);
+
+                false
+            }
+            '_' | '~' => {
+                // Hostnames can't contain these and we don't want to treat them as delimiters.
+                maybe_host = false;
+
+                false
+            }
+            // sub-delims
+            '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => {
+                // Can't be in hostnames, but we treat them as delimiters
+                host_ended = true;
+
+                if !userinfo_allowed && require_host {
+                    // We don't have to look further
+                    break;
+                }
+
+                false
+            }
+            ':' => {
+                // Could be in userinfo, or we're getting a port now.
+                if !userinfo_allowed && !port_allowed {
+                    break;
+                }
+
+                // Don't advance the last dot when we get to port numbers: drop any pending,
+                // unconfirmed dot so that the digits that follow cannot commit it.
+                maybe_last_dot = last_dot;
+
+                false
+            }
+            '@' => {
+                if !userinfo_allowed {
+                    // We already had userinfo, can't have another `@` in a valid authority.
+                    return (None, None);
+                }
+
+                // Sike! Everything before this has been userinfo, so let's reset our
+                // opinions about all the host bits.
+                userinfo_allowed = false;
+
+                maybe_last_dot = None;
+                last_dot = None;
+                dot_allowed = false;
+                hyphen_allowed = false;
+                all_numeric = true;
+                maybe_host = true;
+                host_ended = false;
+
+                false
+            }
+            '/' => {
+                if !require_host {
+                    // For schemes where we allow anything, we want to stop at delimiter characters
+                    // except if we get a slash closing the URL, which happened here.
+                    end = Some(i);
+                }
+                break;
+            }
+            _ => {
+                // Anything else, this might be the end of the authority (can be empty).
+                // Now let the rest of the code handle checking whether the end of the URL is
+                // valid.
+                break;
+            }
+        };
+
+        if can_be_last {
+            // `end` is exclusive, so step past the (possibly multi-byte) character.
+            end = Some(i + c.len_utf8());
+        }
+    }
+
+    if require_host {
+        if maybe_host {
+            // Can't have just a number without dots as the authority
+            // (an empty authority, `end == Some(0)`, is exempt from this rejection).
+            if all_numeric && last_dot.is_none() && end != Some(0) {
+                return (None, None);
+            }
+
+            // If we have something that is not just numeric (not an IP address),
+            // check that the TLD looks reasonable. This is to avoid linking things like
+            // `abc@v1.1`.
+            if !all_numeric {
+                if let Some(last_dot) = last_dot {
+                    // The slice runs from just after the last dot to the end of `s`, so it can
+                    // contain text beyond the TLD; `valid_tld` only looks at its leading letters.
+                    if !valid_tld(&s[last_dot + 1..]) {
+                        return (None, None);
+                    }
+                }
+            }
+
+            return (end, last_dot);
+        } else {
+            return (None, None);
+        }
+    } else {
+        // No host validation requested: report whatever positions were found.
+        return (end, last_dot);
+    }
+}
+
+/// Returns `true` if `tld` begins with at least two ASCII letters.
+///
+/// Only the leading run of ASCII alphabetic characters is looked at, and counting stops after
+/// two of them, so whatever follows that prefix in `tld` does not affect the result.
+fn valid_tld(tld: &str) -> bool {
+    tld.chars()
+        .take_while(|c| c.is_ascii_alphabetic())
+        .take(2)
+        .count()
+        >= 2
+}
diff --git a/src/email.rs b/src/email.rs
@@ -1,5 +1,6 @@
 use std::ops::Range;
 
+use crate::domains::find_authority_end;
 use crate::scanner::Scanner;
 
 /// Scan for email address starting from the trigger character "@".
@@ -40,6 +41,9 @@ impl EmailScanner {
                     break;
                 }
                 atom_boundary = true;
+            } else if c == '@' {
+                // In `@me@a.com`, we don't want to extract `me@a.com`.
+                return None;
             } else {
                 break;
             }
@@ -49,40 +53,8 @@ impl EmailScanner {
 
     // See "Domain" in RFC 5321, plus extension of "sub-domain" in RFC 6531
     fn find_end(&self, s: &str) -> Option<usize> {
-        let mut first_in_sub_domain = true;
-        let mut can_end_sub_domain = false;
-        let mut first_dot = None;
-        let mut end = None;
-
-        for (i, c) in s.char_indices() {
-            if first_in_sub_domain {
-                if Self::sub_domain_allowed(c) {
-                    end = Some(i + c.len_utf8());
-                    first_in_sub_domain = false;
-                    can_end_sub_domain = true;
-                } else {
-                    break;
-                }
-            } else if c == '.' {
-                if !can_end_sub_domain {
-                    break;
-                }
-                first_in_sub_domain = true;
-                if first_dot.is_none() {
-                    first_dot = Some(i);
-                }
-            } else if c == '-' {
-                can_end_sub_domain = false;
-            } else if Self::sub_domain_allowed(c) {
-                end = Some(i + c.len_utf8());
-                can_end_sub_domain = true;
-            } else {
-                break;
-            }
-        }
-
-        if let Some(end) = end {
-            if !self.domain_must_have_dot || first_dot.map(|d| d < end).unwrap_or(false) {
+        if let (Some(end), last_dot) = find_authority_end(s, false, true, false) {
+            if !self.domain_must_have_dot || last_dot.is_some() {
                 Some(end)
             } else {
                 None
@@ -120,27 +92,4 @@ impl EmailScanner {
             _ => c >= '\u{80}',
         }
     }
-
-    // See "sub-domain" in RFC 5321. Extension in RFC 6531 is simplified,
-    // this can also match invalid domains.
-    fn sub_domain_allowed(c: char) -> bool {
-        match c {
-            'a'..='z' | 'A'..='Z' | '0'..='9' => true,
-            _ => c >= '\u{80}',
-        }
-    }
-}
-
-/// Helper function to check if given string is considered an email address.
-#[inline]
-pub(crate) fn is_mail(input: &str) -> bool {
-    input
-        .char_indices()
-        .filter(|(_, c)| *c == '@')
-        .any(|(i, _)| {
-            let scanner = EmailScanner {
-                domain_must_have_dot: true,
-            };
-            scanner.scan(input, i).is_some()
-        })
 }
diff --git a/src/finder.rs b/src/finder.rs
@@ -5,7 +5,7 @@ use memchr::{memchr, memchr2, memchr3};
 
 use crate::email::EmailScanner;
 use crate::scanner::Scanner;
-use crate::url::UrlScanner;
+use crate::url::{DomainScanner, UrlScanner};
 
 /// A link found in the input text.
 #[derive(Debug)]
@@ -112,6 +112,7 @@ pub struct Links<'t> {
     trigger_finder: Box<dyn Fn(&[u8]) -> Option<usize>>,
     email_scanner: EmailScanner,
     url_scanner: UrlScanner,
+    domain_scanner: DomainScanner,
 }
 
 /// Iterator over spans.
@@ -213,6 +214,7 @@ impl<'t> Links<'t> {
         email_domain_must_have_dot: bool,
     ) -> Links<'t> {
         let url_scanner = UrlScanner;
+        let domain_scanner = DomainScanner;
         let email_scanner = EmailScanner {
             domain_must_have_dot: email_domain_must_have_dot,
         };
@@ -232,6 +234,7 @@ impl<'t> Links<'t> {
             trigger_finder,
             email_scanner,
             url_scanner,
+            domain_scanner,
         }
     }
 }
@@ -246,7 +249,8 @@ impl<'t> Iterator for Links<'t> {
         while let Some(i) = (self.trigger_finder)(slice[find_from..].as_bytes()) {
             let trigger = slice.as_bytes()[find_from + i];
             let (scanner, kind): (&dyn Scanner, LinkKind) = match trigger {
-                b':' | b'.' => (&self.url_scanner, LinkKind::Url),
+                b':' => (&self.url_scanner, LinkKind::Url),
+                b'.' => (&self.domain_scanner, LinkKind::Url),
                 b'@' => (&self.email_scanner, LinkKind::Email),
                 _ => unreachable!(),
             };

diff --git a/src/lib.rs b/src/lib.rs
@@ -120,6 +120,7 @@
 #![deny(missing_docs)]
 #![deny(missing_debug_implementations)]
 
+mod domains;
 mod email;
 mod finder;
 mod scanner;