Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(core): Add domain-specific rules as JSON file #1347

Merged
merged 4 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 4 additions & 3 deletions core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,18 @@ async-std-resolver = "0.21.2"
fantoccini = { version = "0.19.3", optional = true }
futures = { version = "0.3.27", optional = true }
fast-socks5 = "0.9.1"
levenshtein = "1.0.5"
log = "0.4.20"
mailchecker = "5.0.7"
md5 = "0.7.0"
once_cell = "1.17.1"
pwned = "0.5.0"
rand = { version = "0.8.5", features = ["small_rng"] }
regex = "1.9.4"
reqwest = { version = "0.11.16", features = ["json", "socks"] }
serde = { version = "1.0.157", features = ["derive"] }
serde_json = "1.0.95"
trust-dns-proto = "0.21.2"
md5 = "0.7.0"
levenshtein = "1.0.5"
pwned = "0.5.0"

[dev-dependencies]
tokio = { version = "1.28.2" }
Expand Down
1 change: 1 addition & 0 deletions core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
mod haveibeenpwned;
pub mod misc;
pub mod mx;
mod rules;
pub mod smtp;
pub mod syntax;
mod util;
Expand Down
19 changes: 19 additions & 0 deletions core/src/rules.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"by_domain": {
"gmail.com": { "rules": ["SkipCatchAll"] },
"hotmail.com": { "rules": ["SkipCatchAll"] },
"hotmail.fr": { "rules": ["SkipCatchAll"] },
"hotmail.nl": { "rules": ["SkipCatchAll"] },
"yahoo.com": { "rules": ["SkipCatchAll"] },
"yahoo.fr": { "rules": ["SkipCatchAll"] }
},
"by_mx_suffix": {
".antispamcloud.com.": {
"rules": ["SkipCatchAll"],
"_comment": "Some <RCPT TO> take exactly 30s to respond, so we skip the catch-all one, and bump the timeout."
}
},
"rules": {
"SkipCatchAll": { "_comment": "Don't perform catch-all check." }
}
}
71 changes: 71 additions & 0 deletions core/src/rules.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// check-if-email-exists
// Copyright (C) 2018-2022 Reacher

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.

// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

//! Read provider- and domain-specific rules from a JSON file, then match each
//! email verification to the domain/provider, and translate those rules into
//! code.
//!
//! IMPORTANT: This is still a beta feature, and probably needs refining.

use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// A behavior tweak that can be attached to a domain or to an MX host suffix
/// in `rules.json`.
///
/// Variants are referenced from the JSON file by their name, e.g.
/// `"SkipCatchAll"`.
#[derive(Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Rule {
    /// Skip the catch-all check entirely.
    SkipCatchAll,
}

/// The set of rules attached to a single entry of `rules.json`.
///
/// Despite its name, this shape is shared by both the `by_domain` and the
/// `by_mx_suffix` tables of [`AllRules`].
#[derive(Debug, Serialize, Deserialize)]
struct RulesByDomain {
    /// Rules that apply to the matching entry.
    rules: Vec<Rule>,
}

/// Top-level shape of the `rules.json` file.
#[derive(Debug, Serialize, Deserialize)]
struct AllRules {
    /// Rules keyed by the domain name, i.e. the part of the email address
    /// after the `@` symbol. Lookups use the exact key.
    by_domain: HashMap<String, RulesByDomain>,
    /// Rules keyed by a suffix of the MX host name. A domain may expose
    /// several MX records, so hosts are matched by suffix rather than by
    /// exact name.
    by_mx_suffix: HashMap<String, RulesByDomain>,
}

/// The complete rule set, parsed lazily (on first access) from the
/// `rules.json` file that is embedded into the binary at compile time.
///
/// # Panics
///
/// The first access panics if the embedded JSON cannot be deserialized into
/// [`AllRules`]. Since the file is bundled at build time, that can only be
/// caused by a malformed `rules.json` in the source tree, i.e. a programming
/// error rather than a runtime condition.
static ALL_RULES: Lazy<AllRules> = Lazy::new(|| {
    serde_json::from_str::<AllRules>(include_str!("rules.json"))
        .expect("embedded rules.json must deserialize into AllRules")
});

/// Returns `true` when `domain` has an entry in the `by_domain` table and
/// that entry lists `rule`.
///
/// The lookup is an exact key match; a domain without an entry has no rules.
fn does_domain_have_rule(domain: &str, rule: &Rule) -> bool {
    ALL_RULES
        .by_domain
        .get(domain)
        .map_or(false, |entry| entry.rules.contains(rule))
}

/// Returns `true` when `host` ends with any suffix of the `by_mx_suffix`
/// table whose entry lists `rule`.
///
/// Every matching suffix is consulted. Stopping at the first suffix that
/// matches would make the answer depend on the `HashMap` iteration order
/// whenever more than one configured suffix matches the same host.
///
/// The host is lowercased before matching, so the result does not depend on
/// the letter case of the MX record. The suffix keys in `rules.json` are
/// expected to be lowercase.
fn does_mx_have_rule(host: &str, rule: &Rule) -> bool {
    let host = host.to_lowercase();

    ALL_RULES
        .by_mx_suffix
        .iter()
        .any(|(suffix, entry)| host.ends_with(suffix.as_str()) && entry.rules.contains(rule))
}

/// Tell whether `rule` applies to an email verification, looking first at
/// the email's domain and then at the MX host.
///
/// The MX host is only examined when the domain itself does not carry the
/// rule.
pub fn has_rule(domain: &str, host: &str, rule: &Rule) -> bool {
    if does_domain_have_rule(domain, rule) {
        return true;
    }

    does_mx_have_rule(host, rule)
}
57 changes: 43 additions & 14 deletions core/src/smtp/connect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ use std::iter;
use std::str::FromStr;
use std::time::Duration;

use trust_dns_proto::rr::Name;

use super::{gmail::is_gmail, outlook::is_hotmail, parser, yahoo::is_yahoo};
use super::parser;
use super::{SmtpDetails, SmtpError};
use crate::util::{constants::LOG_TARGET, input_output::CheckEmailInput};
use crate::{
rules::{has_rule, Rule},
util::{constants::LOG_TARGET, input_output::CheckEmailInput},
};

/// Try to send an smtp command, close and return Err if fails.
macro_rules! try_smtp (
Expand All @@ -48,13 +49,12 @@ macro_rules! try_smtp (

/// Attempt to connect to host via SMTP, and return SMTP client on success.
async fn connect_to_host(
host: &Name,
host: &str,
port: u16,
input: &CheckEmailInput,
) -> Result<SmtpTransport, SmtpError> {
// hostname verification fails if it ends with '.', for example, using
// SOCKS5 proxies we can get an `io: incomplete` error.
let host = host.to_string();
let host = host.trim_end_matches('.').to_string();

let security = {
Expand Down Expand Up @@ -220,11 +220,16 @@ async fn email_deliverable(
async fn smtp_is_catch_all(
smtp_transport: &mut SmtpTransport,
domain: &str,
host: &Name,
host: &str,
input: &CheckEmailInput,
) -> Result<bool, SmtpError> {
// Skip catch-all check for known providers.
let host = host.to_string();
if is_gmail(&host) || is_hotmail(&host) || is_yahoo(&host) {
if has_rule(domain, host, &Rule::SkipCatchAll) {
log::debug!(
target: LOG_TARGET,
"[email={}] Skipping catch-all check for [domain={domain}]",
input.to_email
);
return Ok(false);
}

Expand All @@ -247,7 +252,7 @@ async fn smtp_is_catch_all(

async fn create_smtp_future(
to_email: &EmailAddress,
host: &Name,
host: &str,
port: u16,
domain: &str,
input: &CheckEmailInput,
Expand All @@ -256,7 +261,7 @@ async fn create_smtp_future(
// Ok(SmtpDetails { can_connect_smtp: false, ... }).
let mut smtp_transport = connect_to_host(host, port, input).await?;

let is_catch_all = smtp_is_catch_all(&mut smtp_transport, domain, host)
let is_catch_all = smtp_is_catch_all(&mut smtp_transport, domain, host, input)
.await
.unwrap_or(false);
let deliverability = if is_catch_all {
Expand All @@ -278,7 +283,8 @@ async fn create_smtp_future(
if parser::is_err_io_errors(e) {
log::debug!(
target: LOG_TARGET,
"Got `io: incomplete` error, reconnecting."
"[email={}] Got `io: incomplete` error, reconnecting.",
input.to_email
);

let _ = smtp_transport.close().await;
Expand All @@ -299,7 +305,7 @@ async fn create_smtp_future(
/// retries.
async fn check_smtp_without_retry(
to_email: &EmailAddress,
host: &Name,
host: &str,
port: u16,
domain: &str,
input: &CheckEmailInput,
Expand All @@ -325,7 +331,7 @@ async fn check_smtp_without_retry(
#[async_recursion]
pub async fn check_smtp_with_retry(
to_email: &EmailAddress,
host: &Name,
host: &str,
port: u16,
domain: &str,
input: &CheckEmailInput,
Expand Down Expand Up @@ -376,3 +382,26 @@ pub async fn check_smtp_with_retry(
_ => result,
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The catch-all check must be short-circuited for a domain carrying the
    /// `SkipCatchAll` rule: it reports "not catch-all" without ever opening
    /// the SMTP connection.
    #[tokio::test]
    async fn should_skip_catch_all() {
        let mut transport = SmtpClient::new("gmail.com".into()).into_transport();

        let outcome = smtp_is_catch_all(
            &mut transport,
            "gmail.com",
            "alt4.aspmx.l.google.com.",
            &CheckEmailInput::default(),
        )
        .await;

        // We shouldn't connect to google servers.
        assert!(!transport.is_connected());
        // The call succeeds and reports that the domain is not a catch-all.
        assert!(matches!(outcome, Ok(false)));
    }
}
3 changes: 2 additions & 1 deletion core/src/smtp/gmail.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ pub async fn check_gmail(
})
}

/// Check if the MX host is from Gmail.
/// Check if the MX host is from Google, i.e. either a @gmail.com address, or
/// a Google Suite email.
pub fn is_gmail(host: &str) -> bool {
    // Compare on a lowercased copy so the check ignores the letter case of
    // the MX record. The trailing dot of the fully-qualified name is part
    // of the expected suffix.
    let normalized = host.to_lowercase();
    normalized.ends_with(".google.com.")
}
Expand Down
18 changes: 7 additions & 11 deletions core/src/smtp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,29 +62,25 @@ pub async fn check_smtp(
domain: &str,
input: &CheckEmailInput,
) -> Result<SmtpDetails, SmtpError> {
let host_lowercase = host.to_lowercase().to_string();
let host = host.to_string();

if input
.skipped_domains
.iter()
.any(|d| host_lowercase.contains(d))
{
if input.skipped_domains.iter().any(|d| host.contains(d)) {
return Err(SmtpError::SkippedDomain(format!(
"Reacher currently cannot verify emails from @{domain}"
)));
}

if input.yahoo_use_api && is_yahoo(&host_lowercase) {
if input.yahoo_use_api && is_yahoo(&host) {
return yahoo::check_yahoo(to_email, input)
.await
.map_err(|err| err.into());
}
if input.gmail_use_api && is_gmail(&host_lowercase) {
if input.gmail_use_api && is_gmail(&host) {
return gmail::check_gmail(to_email, input)
.await
.map_err(|err| err.into());
}
if input.microsoft365_use_api && is_microsoft365(&host_lowercase) {
if input.microsoft365_use_api && is_microsoft365(&host) {
match outlook::microsoft365::check_microsoft365_api(to_email, input).await {
Ok(Some(smtp_details)) => return Ok(smtp_details),
// Continue in the event of an error/ambiguous result.
Expand All @@ -101,14 +97,14 @@ pub async fn check_smtp(
}
#[cfg(feature = "headless")]
if let Some(webdriver) = &input.hotmail_use_headless {
if is_outlook(&host_lowercase) {
if is_outlook(&host) {
return outlook::hotmail::check_password_recovery(to_email, webdriver)
.await
.map_err(|err| err.into());
}
}

check_smtp_with_retry(to_email, host, port, domain, input, input.retries).await
check_smtp_with_retry(to_email, &host, port, domain, input, input.retries).await
}

#[cfg(test)]
Expand Down
Loading