diff --git a/functions/build_lists.sh b/functions/build_lists.sh index 0a64125ff..82aaf5582 100644 --- a/functions/build_lists.sh +++ b/functions/build_lists.sh @@ -1,13 +1,13 @@ #!/bin/bash -# This script builds the various formats of list from the raw files +# This script builds the various formats of list from the raw files. readonly RAW='data/raw.txt' readonly RAW_LIGHT='data/raw_light.txt' -# Function 'build_lists' builds the two version of the blocklist in the various formats -# called by the build list functions (see below) build_lists() { - [[ -z "$comment" ]] && comment='#' # Set default comment character to '#' + # Set default comment character to '#' + [[ -z "$comment" ]] && comment='#' + mkdir -p "lists/${directory}" # Loop through the full and light blocklist versions @@ -21,6 +21,7 @@ build_lists() { list_name='scams_light.txt' source_file="$RAW_LIGHT" fi + blocklist_path="lists/${directory}/${list_name}" cat << EOF > "$blocklist_path" # Append header onto blocklist @@ -36,19 +37,20 @@ EOF # Special case for Unbound format [[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path" + # Append formatted domains onto blocklist printf "%s\n" "$(awk -v before="$before" -v after="$after" \ '{print before $0 after}' "$source_file")" >> "$blocklist_path" done } -# Function 'format_file' is a shell wrapper to standardize the format of a file +# Function 'format_file' is a shell wrapper to standardize the format of a file. # $1: file to format format_file() { bash functions/tools.sh format "$1" } -# Build list functions: +# Build list functions are to specify the syntax of the list format. 
 # $syntax: name of list syntax
 # $directory: directory to create list in
 # $comment: character used for comments (default:#)
diff --git a/functions/check_dead.sh b/functions/check_dead.sh
index cad128393..3d573e3a6 100644
--- a/functions/check_dead.sh
+++ b/functions/check_dead.sh
@@ -1,112 +1,167 @@
 #!/bin/bash
-raw_file='data/raw.txt'
-raw_light_file='data/raw_light.txt'
-domain_log='config/domain_log.csv'
-root_domains_file='data/root_domains.txt'
-subdomains_file='data/subdomains.txt'
-subdomains_to_remove_file='config/subdomains.txt'
-wildcards_file='data/wildcards.txt'
-redundant_domains_file='data/redundant_domains.txt'
-dead_domains_file='data/dead_domains.txt'
-time_format=$(date -u +"%H:%M:%S %d-%m-%y")
-
-function main {
-    npm i -g @adguard/dead-domains-linter # Install AdGuard Dead Domains Linter
-    for file in config/* data/*; do # Format files in the config and data directory
+# This script checks for dead and resurrected domains and
+# removes/adds them accordingly.
+
+readonly RAW='data/raw.txt'
+readonly RAW_LIGHT='data/raw_light.txt'
+readonly ROOT_DOMAINS='data/root_domains.txt'
+readonly SUBDOMAINS='data/subdomains.txt'
+readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt'
+readonly WILDCARDS='data/wildcards.txt'
+readonly REDUNDANT_DOMAINS='data/redundant_domains.txt'
+readonly DEAD_DOMAINS='data/dead_domains.txt'
+readonly DOMAIN_LOG='config/domain_log.csv'
+
+main() {
+    # Install AdGuard's Dead Domains Linter
+    npm i -g @adguard/dead-domains-linter
+
+    for file in config/* data/*; do
         format_list "$file"
     done
+
     check_subdomains
     check_redundant
     check_dead
     check_alive
-    cat dead_in_raw.tmp >> "$dead_domains_file" # Collate dead domains (skip alive check)
-    format_list "$dead_domains_file"
     update_light_file
+
+    # Cache dead domains (skip processing dead domains through alive check)
+    cat dead_in_raw.tmp >> "$DEAD_DOMAINS"
+    format_list "$DEAD_DOMAINS"
 }
 
-function check_subdomains {
-    sed 's/^/||/; s/$/^/' "$subdomains_file" > formatted_subdomains_file.tmp # Format subdomains file
-
dead-domains-linter -i formatted_subdomains_file.tmp --export dead.tmp # Find and export dead domains with subdomains - [[ ! -s dead.tmp ]] && return # Return if no dead domains found +check_subdomains() { + sed 's/^/||/; s/$/^/' "$SUBDOMAINS" > formatted_subdomains.tmp + + # Find and export dead domains with subdomains + dead-domains-linter -i formatted_subdomains.tmp --export dead.tmp + [[ ! -s dead.tmp ]] && return + # Remove dead subdomains from subdomains file - comm -23 "$subdomains_file" dead.tmp > subdomains.tmp && mv subdomains.tmp "$subdomains_file" - cat dead.tmp >> "$dead_domains_file" # Collate dead subdomains to filter out from newly retrieved domains - format_list "$dead_domains_file" + comm -23 "$SUBDOMAINS" dead.tmp > subdomains.tmp + mv subdomains.tmp "$SUBDOMAINS" + + # Cache dead subdomains to filter out from newly retrieved domains + cat dead.tmp >> "$DEAD_DOMAINS" + format_list "$DEAD_DOMAINS" + + # Strip dead domains with subdomains to their root domains + while read -r subdomain; do + dead_root_domains="$(sed "s/^${subdomain}\.//" dead.tmp | sort -u)" + done < "$SUBDOMAINS_TO_REMOVE" - while read -r subdomain; do # Loop through common subdomains - dead_root_domains=$(sed "s/^${subdomain}\.//" dead.tmp | sort -u) # Strip to root domains - done < "$subdomains_to_remove_file" # Remove dead root domains from raw file and root domains file - comm -23 "$raw_file" <(printf "%s" "$dead_root_domains") > raw.tmp && mv raw.tmp "$raw_file" - comm -23 "$root_domains_file" <(printf "%s" "$dead_root_domains") > root.tmp && mv root.tmp "$root_domains_file" - log_event "$dead_root_domains" "dead" "raw" + comm -23 "$RAW" <(printf "%s" "$dead_root_domains") > raw.tmp + mv raw.tmp "$RAW" + comm -23 "$ROOT_DOMAINS" <(printf "%s" "$dead_root_domains") > root.tmp + mv root.tmp "$ROOT_DOMAINS" + + log_event "$dead_root_domains" dead raw } -function check_redundant { - sed 's/^/||/; s/$/^/' "$redundant_domains_file" > formatted_redundant_domains_file.tmp # Format 
redundant domains file - dead-domains-linter -i formatted_redundant_domains_file.tmp --export dead.tmp # Find and export dead redundant domains - [[ ! -s dead.tmp ]] && return # Return if no dead domains found +check_redundant() { + sed 's/^/||/; s/$/^/' "$REDUNDANT_DOMAINS" > formatted_redundant_domains.tmp + + # Find and export dead redundant domains + dead-domains-linter -i formatted_redundant_domains.tmp --export dead.tmp + [[ ! -s dead.tmp ]] && return + # Remove dead redundant domains from redundant domains file - comm -23 "$redundant_domains_file" dead.tmp > redundant.tmp && mv redundant.tmp "$redundant_domains_file" - cat dead.tmp >> "$dead_domains_file" # Collate dead redundant domains to filter out from newly retrieved domains - format_list "$dead_domains_file" - - while read -r wildcard; do # Loop through wildcards - # If no matches remaining, consider wildcard as dead - ! grep -q "\.${wildcard}$" "$redundant_domains_file" && printf "%s\n" "$wildcard" >> collated_dead_wildcards.tmp - done < "$wildcards_file" - [[ ! -f collated_dead_wildcards.tmp ]] && return # Return if no unused wildcards found + comm -23 "$REDUNDANT_DOMAINS" dead.tmp > redundant.tmp + mv redundant.tmp "$REDUNDANT_DOMAINS" + + # Cache dead redundant domains to filter out from newly retrieved domains + cat dead.tmp >> "$DEAD_DOMAINS" + format_list "$DEAD_DOMAINS" + + # Find unused wildcard + while read -r wildcard; do + # If no matches, consider wildcard as unused/dead + ! grep -q "\.${wildcard}$" "$REDUNDANT_DOMAINS" && + printf "%s\n" "$wildcard" >> collated_dead_wildcards.tmp + done < "$WILDCARDS" + [[ ! 
-f collated_dead_wildcards.tmp ]] && return
     sort -u collated_dead_wildcards.tmp -o collated_dead_wildcards.tmp
+
     # Remove unused wildcards from raw file and wildcards file
-    comm -23 "$raw_file" collated_dead_wildcards.tmp > raw.tmp && mv raw.tmp "$raw_file"
-    comm -23 "$wildcards_file" collated_dead_wildcards.tmp > wildcards.tmp && mv wildcards.tmp "$wildcards_file"
-    log_event "$(<collated_dead_wildcards.tmp)" "dead" "wildcard"
+    comm -23 "$RAW" collated_dead_wildcards.tmp > raw.tmp
+    mv raw.tmp "$RAW"
+    comm -23 "$WILDCARDS" collated_dead_wildcards.tmp > wildcards.tmp
+    mv wildcards.tmp "$WILDCARDS"
+
+    log_event "$(<collated_dead_wildcards.tmp)" dead wildcard
 }
 
-function check_dead {
-    sed 's/^/||/; s/$/^/' "$raw_file" > formatted_raw_file.tmp # Format raw file
-    dead-domains-linter -i formatted_raw_file.tmp --export dead_in_raw.tmp # Find and export dead domains
-    [[ ! -s dead_in_raw.tmp ]] && return # Return if no dead domains found
+check_dead() {
+    # Exclude wildcards and root domains of subdomains
+    comm -23 "$RAW" <(sort "$ROOT_DOMAINS" "$WILDCARDS") |
+        sed 's/^/||/; s/$/^/' > formatted_raw.tmp
+
+    # Find and export dead domains
+    dead-domains-linter -i formatted_raw.tmp --export dead_in_raw.tmp
+    [[ !
-s dead_in_raw.tmp ]] && return
+
     # Remove dead domains from raw file
-    comm -23 "$raw_file" dead_in_raw.tmp > raw.tmp && mv raw.tmp "$raw_file"
-    log_event "$(<dead_in_raw.tmp)" "dead" "raw"
+    comm -23 "$RAW" dead_in_raw.tmp > raw.tmp && mv raw.tmp "$RAW"
+
+    log_event "$(<dead_in_raw.tmp)" dead raw
 }
 
-function check_alive {
-    sed 's/^/||/; s/$/^/' "$dead_domains_file" > formatted_dead_domains_file.tmp # Format dead domains file
-    dead-domains-linter -i formatted_dead_domains_file.tmp --export dead.tmp # Find dead domains in the dead domains file
-    alive_domains=$(comm -23 <(sort "$dead_domains_file") <(sort dead.tmp)) # Find resurrected domains in dead domains file (note dead domains file is unsorted)
-    [[ -z "$alive_domains" ]] && return # Return if no resurrected domains found
-    cp dead.tmp "$dead_domains_file" # Update dead domains file to exclude resurrected domains
-
-    # Strip away subdomains from alive domains since subdomains are not supposed to be in raw file
-    while read -r subdomain; do # Loop through common subdomains
-        alive_domains=$(printf "%s" "$alive_domains" | sed "s/^${subdomain}\.//" | sort -u)
-    done < "$subdomains_to_remove_file"
-    printf "%s\n" "$alive_domains" >> "$raw_file" # Add resurrected domains to raw file
-    format_list "$dead_domains_file" && format_list "$raw_file"
-    log_event "$alive_domains" "resurrected" "dead_domains_file"
+check_alive() {
+    sed 's/^/||/; s/$/^/' "$DEAD_DOMAINS" > formatted_dead_domains.tmp
+
+    # Find and export dead domains
+    dead-domains-linter -i formatted_dead_domains.tmp --export dead.tmp
+
+    # Find resurrected domains in dead domains file (note dead domains file is unsorted)
+    alive_domains="$(comm -23 <(sort "$DEAD_DOMAINS") <(sort dead.tmp))"
+    [[ -z "$alive_domains" ]] && return
+
+    # Update dead domains file to only include dead domains
+    cp dead.tmp "$DEAD_DOMAINS"
+    format_list "$DEAD_DOMAINS"
+
+    # Strip away subdomains from alive domains as subdomains are not supposed to be in raw file
+    while read -r subdomain; do
+        alive_domains="$(printf "%s" "$alive_domains" | sed "s/^${subdomain}\.//" | sort -u)"
+    done < "$SUBDOMAINS_TO_REMOVE"
+
+    printf "%s\n"
"$alive_domains" >> "$RAW" # Add resurrected domains to raw file
+    format_list "$RAW"
+
+    log_event "$alive_domains" resurrected dead_domains
 }
 
-function update_light_file {
-    comm -12 "$raw_file" "$raw_light_file" > light.tmp && mv light.tmp "$raw_light_file" # Keep only domains found in full raw file
+# Function 'update_light_file' removes any domains from the light raw file that
+# are not found in the full raw file.
+update_light_file() {
+    comm -12 "$RAW" "$RAW_LIGHT" > light.tmp && mv light.tmp "$RAW_LIGHT"
 }
 
-function prune_dead_domains_file {
-    [[ $(wc -l < "$dead_domains_file") -gt 5000 ]] && sed -i '1,100d' "$dead_domains_file" || printf "" # printf to negate exit status 1
+# Function 'prune_dead_domains_file' removes old entries once the file reaches
+# a threshold of entries.
+prune_dead_domains_file() {
+    [[ $(wc -l < "$DEAD_DOMAINS") -gt 5000 ]] && sed -i '1,100d' "$DEAD_DOMAINS"
+    true
 }
 
-function log_event {
-    # Log domain events
-    printf "%s\n" "$1" | awk -v type="$2" -v source="$3" -v time="$time_format" '{print time "," type "," $0 "," source}' >> "$domain_log"
+# Function 'log_event' logs domain processing events into the domain log
+# $1: domains to log stored in a variable.
+# $2: event type (dead, whitelisted, etc.)
+# $3: source
+log_event() {
+    printf "%s\n" "$1" | awk -v type="$2" -v source="$3" -v time="$(date -u +"%H:%M:%S %d-%m-%y")" \
+        '{print time "," type "," $0 "," source}' >> "$DOMAIN_LOG"
 }
 
-function format_list {
-    bash functions/tools.sh "format" "$1"
+# Function 'format_list' is a shell wrapper to standardize the format of a file.
+# $1: file to format
+format_list() {
+    bash functions/tools.sh format "$1"
 }
 
-function cleanup {
+cleanup() {
     find .
-maxdepth 1 -type f -name "*.tmp" -delete prune_dead_domains_file } diff --git a/functions/tools.sh b/functions/tools.sh index fe1967a45..3c55c04e1 100644 --- a/functions/tools.sh +++ b/functions/tools.sh @@ -1,11 +1,9 @@ #!/bin/bash - # Tools.sh is a shell wrapper intended to store commonly used functions. -# 'format' is called to standardize the format of a file. +# Function 'format' is called to standardize the format of a file. format() { local -r file="$1" - [[ ! -f "$file" ]] && return # Applicable to all files: @@ -21,8 +19,7 @@ format() { ;; ('config/parked_terms.txt') # Remove empty lines, convert to lowercase, sort and remove duplicates - sed '/^$/d' "$file" | tr '[:upper:]' '[:lower:]' | - sort -u -o "${file}.tmp" + sed '/^$/d' "$file" | tr '[:upper:]' '[:lower:]' | sort -u -o "${file}.tmp" mv "${file}.tmp" "$file" ;; (*.txt|*.tmp) diff --git a/functions/update_readme.sh b/functions/update_readme.sh index 8723ffde6..f671a923f 100644 --- a/functions/update_readme.sh +++ b/functions/update_readme.sh @@ -1,14 +1,14 @@ #!/bin/bash -# This script updates README.md content and statistics +# This script updates README.md content and statistics. readonly RAW='data/raw.txt' readonly RAW_LIGHT='data/raw_light.txt' readonly SEARCH_TERMS='config/search_terms.csv' readonly SOURCE_LOG='config/SOURCE_LOG.csv' -readonly TODAY -readonly YESTERDAY TODAY="$(date -u +"%d-%m-%y")" YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")" +readonly TODAY +readonly YESTERDAY update_readme() { cat << EOF > README.md @@ -135,15 +135,15 @@ EOF } # Function 'print_stats' prints the various statistics for each source -# $1: source to process (leave blank to process all sources) +# $1: source to process (leave blank to process all sources). 
print_stats() { [[ -n "$1" ]] && source="$1" || source='All sources' printf "%5s |%10s |%8s%% | %s\n" "$(sum "$TODAY" "$1")" \ "$(sum "$YESTERDAY" "$1")" "$(sum_excluded "$1" )" "$source" } -# Function 'sum' is an echo wrapper that sums up the domains retrieved by that source for -# that particular day +# Function 'sum' is an echo wrapper that sums up the domains retrieved by +# that source for that particular day. # $1: day to process # $2: source to process sum() { @@ -154,7 +154,7 @@ sum() { } # Function 'count_excluded' is an echo wrapper that counts the % of excluded domains -# of raw count retrieved from each source +# of raw count retrieved from each source. # $1: source to process count_excluded() { csvgrep -c 2 -m "$1" "$SOURCE_LOG" | csvgrep -c 14 -m yes > source_rows.tmp @@ -173,7 +173,7 @@ count_excluded() { } # Function 'format_file' is a shell wrapper to standardize the format of a file -# $1: file to format +# $1: file to format. format_file() { bash functions/tools.sh format "$1" } diff --git a/functions/validate_raw.sh b/functions/validate_raw.sh index ec869443b..ca79d499e 100644 --- a/functions/validate_raw.sh +++ b/functions/validate_raw.sh @@ -1,5 +1,5 @@ #!/bin/bash -# This script validates the domains in the raw file via a variety of checks +# This script validates the domains in the raw file via a variety of checks. 
readonly RAW='data/raw.txt' readonly RAW_LIGHT='data/raw_light.txt' @@ -12,11 +12,7 @@ readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt' readonly WILDCARDS='data/wildcards.txt' readonly REDUNDANT_DOMAINS='data/redundant_domains.txt' readonly DOMAIN_LOG='config/domain_log.csv' -readonly TIME_FORMAT -TIME_FORMAT="$(date -u +"%H:%M:%S %d-%m-%y")" -# Function 'validate_raw' stores the domains in the raw file in a variable and validates them -# via a variety of checks validate_raw() { domains="$(<"$RAW")" before_count="$(wc -l < "$RAW")" @@ -26,8 +22,11 @@ validate_raw() { while read -r subdomain; do # Loop through common subdomains domains_with_subdomains="$(grep "^${subdomain}\." <<< "$domains")" [[ -z "$domains_with_subdomains" ]] && continue + # Count number of domains with common subdomains - domains_with_subdomains_count="$((domains_with_subdomains_count + $(wc -l <<< "$domains_with_subdomains")))" + domains_with_subdomains_count="$(( + domains_with_subdomains_count + $(wc -l <<< "$domains_with_subdomains") + ))" # Keep only root domains domains="$(printf "%s" "$domains" | sed "s/^${subdomain}\.//" | sort -u)" @@ -40,7 +39,7 @@ validate_raw() { printf "%s\n" "$domains_with_subdomains" | sed "s/^${subdomain}\.//" >> root_domains.tmp awk '{print $0 " (subdomain)"}' <<< "$domains_with_subdomains" >> filter_log.tmp - log_event "$domains_with_subdomains" subdomain + log_event "$domains_with_subdomains" subdomain raw done < "$SUBDOMAINS_TO_REMOVE" format_file subdomains.tmp format_file root_domains.tmp @@ -51,7 +50,7 @@ validate_raw() { if [[ "$whitelisted_count" -gt 0 ]]; then domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$whitelisted_domains"))" awk '{print $0 " (whitelisted)"}' <<< "$whitelisted_domains" >> filter_log.tmp - log_event "$whitelisted_domains" whitelist + log_event "$whitelisted_domains" whitelist raw fi # Remove domains that have whitelisted TLDs @@ -60,7 +59,7 @@ validate_raw() { if [[ "$whitelisted_tld_count" -gt 0 ]]; then 
domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$whitelisted_tld_domains"))" awk '{print $0 " (whitelisted TLD)"}' <<< "$whitelisted_tld_domains" >> filter_log.tmp - log_event "$whitelisted_tld_domains" tld + log_event "$whitelisted_tld_domains" tld raw fi # Remove invalid entries including IP addresses. This excludes punycode TLDs (.xn--*) @@ -69,7 +68,7 @@ validate_raw() { if [[ "$invalid_entries_count" -gt 0 ]]; then domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$invalid_entries"))" awk '{print $0 " (invalid)"}' <<< "$invalid_entries" >> filter_log.tmp - log_event "$invalid_entries" invalid + log_event "$invalid_entries" invalid raw fi # Remove redundant domains @@ -78,6 +77,7 @@ validate_raw() { # Find redundant domains via wildcard matching redundant_domains="$(grep "\.${domain}$" <<< "$domains")" [[ -z "$redundant_domains" ]] && continue + # Count number of redundant domains redundant_count="$((redundant_count + $(wc -l <<< "$redundant_domains")))" @@ -90,7 +90,7 @@ validate_raw() { printf "%s\n" "$domain" >> wildcards.tmp awk '{print $0 " (redundant)"}' <<< "$redundant_domains" >> filter_log.tmp - log_event "$redundant_domains" redundant + log_event "$redundant_domains" redundant raw done <<< "$domains" format_file redundant_domains.tmp format_file wildcards.tmp @@ -101,7 +101,7 @@ validate_raw() { if [[ "$toplist_count" -gt 0 ]]; then awk '{print $0 " (toplist) - \033[1;31mmanual removal required\033[0m"}' \ <<< "$domains_in_toplist" >> filter_log.tmp - log_event "$domains_in_toplist" toplist + log_event "$domains_in_toplist" toplist raw fi # Exit if no filtering done @@ -141,26 +141,29 @@ validate_raw() { printf "%s\n" "$domains" > "$RAW" format_file "$RAW" + # Remove filtered domains from light file comm -12 "$RAW" "$RAW_LIGHT" > light.tmp && mv light.tmp "$RAW_LIGHT" total_whitelisted_count="$((whitelisted_count + whitelisted_tld_count))" after_count="$(wc -l < "$RAW")" printf "\nBefore: %s After: %s Subdomains: %s 
Whitelisted: %s Invalid %s Redundant: %s Toplist: %s\n\n" \ - "$before_count" "$after_count" "$domains_with_subdomains_count" "$total_whitelisted_count" "$invalid_entries_count" "$redundant_count" "$toplist_count" + "$before_count" "$after_count" "$domains_with_subdomains_count" "$total_whitelisted_count" \ + "$invalid_entries_count" "$redundant_count" "$toplist_count" exit 1 } -# Function 'log_event' logs domain processing events into the domain log +# Function 'log_event' logs domain processing events into the domain log. # $1: domains to log stored in a variable # $2: event type (dead, whitelisted, etc.) +# $3: source log_event() { - printf "%s\n" "$1" | awk -v type="$2" -v source=raw -v time="$TIME_FORMAT" \ + printf "%s\n" "$1" | awk -v type="$2" -v source="$3" -v time="$(date -u +"%H:%M:%S %d-%m-%y")" \ '{print time "," type "," $0 "," source}' >> "$DOMAIN_LOG" } -# Function 'format_file' is a shell wrapper to standardize the format of a file +# Function 'format_file' is a shell wrapper to standardize the format of a file. # $1: file to format format_file() { bash functions/tools.sh format "$1"