
Commit

Update check_dead
jarelllama authored Mar 31, 2024
1 parent 048bcca commit 13d742d
Showing 5 changed files with 167 additions and 111 deletions.
14 changes: 8 additions & 6 deletions functions/build_lists.sh
@@ -1,13 +1,13 @@
#!/bin/bash
# This script builds the various formats of list from the raw files
# This script builds the various formats of list from the raw files.

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'

# Function 'build_lists' builds the two versions of the blocklist in the various formats
# called by the build list functions (see below)
build_lists() {
[[ -z "$comment" ]] && comment='#' # Set default comment character to '#'
# Set default comment character to '#'
[[ -z "$comment" ]] && comment='#'

mkdir -p "lists/${directory}"

# Loop through the full and light blocklist versions
@@ -21,6 +21,7 @@ build_lists() {
list_name='scams_light.txt'
source_file="$RAW_LIGHT"
fi

blocklist_path="lists/${directory}/${list_name}"

cat << EOF > "$blocklist_path" # Append header onto blocklist
@@ -36,19 +37,20 @@ EOF

# Special case for Unbound format
[[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path"

# Append formatted domains onto blocklist
printf "%s\n" "$(awk -v before="$before" -v after="$after" \
'{print before $0 after}' "$source_file")" >> "$blocklist_path"
done
}
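
For context, the awk call above is what turns each plain domain into a rule for the target format: it prepends $before and appends $after, values that the build list functions further down set per syntax. A minimal sketch with assumed Adblock Plus-style values (the real pair is defined in the part of this file not shown in the diff):

    # Hypothetical prefix/suffix; the actual pair comes from the calling build list function.
    before='||'
    after='^'
    awk -v before="$before" -v after="$after" '{print before $0 after}' data/raw.txt
    # example.com  ->  ||example.com^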

# Function 'format_file' is a shell wrapper to standardize the format of a file
# Function 'format_file' is a shell wrapper to standardize the format of a file.
# $1: file to format
format_file() {
bash functions/tools.sh format "$1"
}

# Build list functions:
# Build list functions are to specify the syntax of the list format.
# $syntax: name of list syntax
# $directory: directory to create list in
# $comment: character used for comments (default:#)
206 changes: 130 additions & 76 deletions functions/check_dead.sh
@@ -1,112 +1,166 @@
#!/bin/bash
raw_file='data/raw.txt'
raw_light_file='data/raw_light.txt'
domain_log='config/domain_log.csv'
root_domains_file='data/root_domains.txt'
subdomains_file='data/subdomains.txt'
subdomains_to_remove_file='config/subdomains.txt'
wildcards_file='data/wildcards.txt'
redundant_domains_file='data/redundant_domains.txt'
dead_domains_file='data/dead_domains.txt'
time_format=$(date -u +"%H:%M:%S %d-%m-%y")

function main {
npm i -g @adguard/dead-domains-linter # Install AdGuard Dead Domains Linter
for file in config/* data/*; do # Format files in the config and data directory
# This script checks for dead and resurrected domains and
# removes/adds them accordingly.

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'
readonly ROOT_DOMAINS='data/root_domains.txt'
readonly SUBDOMAINS='data/subdomains.txt'
readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt'
readonly WILDCARDS='data/wildcards.txt'
readonly REDUNDANT_DOMAINS='data/redundant_domains.txt'
readonly DEAD_DOMAINS='data/dead_domains.txt'
readonly DOMAIN_LOG='config/domain_log.csv'

main() {
# Install AdGuard's Dead Domains Linter
npm i -g @adguard/dead-domains-linter

for file in config/* data/*; do
format_list "$file"
done

check_subdomains
check_redundant
check_dead
check_alive
cat dead_in_raw.tmp >> "$dead_domains_file" # Collate dead domains (skip alive check)
format_list "$dead_domains_file"
update_light_file

# Cache dead domains (skip processing dead domains through alive check)
cat dead_in_raw.tmp >> "$DEAD_DOMAINS"
format_list "$DEAD_DOMAINS"
}

function check_subdomains {
sed 's/^/||/; s/$/^/' "$subdomains_file" > formatted_subdomains_file.tmp # Format subdomains file
dead-domains-linter -i formatted_subdomains_file.tmp --export dead.tmp # Find and export dead domains with subdomains
[[ ! -s dead.tmp ]] && return # Return if no dead domains found
check_subdomains() {
sed 's/^/||/; s/$/^/' "$SUBDOMAINS" > formatted_subdomains.tmp

# Find and export dead domains with subdomains
dead-domains-linter -i formatted_subdomains.tmp --export dead.tmp
[[ ! -s dead.tmp ]] && return

# Remove dead subdomains from subdomains file
comm -23 "$subdomains_file" dead.tmp > subdomains.tmp && mv subdomains.tmp "$subdomains_file"
cat dead.tmp >> "$dead_domains_file" # Collate dead subdomains to filter out from newly retrieved domains
format_list "$dead_domains_file"
comm -23 "$SUBDOMAINS" dead.tmp > subdomains.tmp
mv subdomains.tmp "$SUBDOMAINS"

# Cache dead subdomains to filter out from newly retrieved domains
cat dead.tmp >> "$DEAD_DOMAINS"
format_list "$DEAD_DOMAINS"

# Strip dead domains with subdomains to their root domains
while read -r subdomain; do
dead_root_domains="$(sed "s/^${subdomain}\.//" dead.tmp | sort -u)"
done < "$SUBDOMAINS_TO_REMOVE"

while read -r subdomain; do # Loop through common subdomains
dead_root_domains=$(sed "s/^${subdomain}\.//" dead.tmp | sort -u) # Strip to root domains
done < "$subdomains_to_remove_file"
# Remove dead root domains from raw file and root domains file
comm -23 "$raw_file" <(printf "%s" "$dead_root_domains") > raw.tmp && mv raw.tmp "$raw_file"
comm -23 "$root_domains_file" <(printf "%s" "$dead_root_domains") > root.tmp && mv root.tmp "$root_domains_file"
log_event "$dead_root_domains" "dead" "raw"
comm -23 "$RAW" <(printf "%s" "$dead_root_domains") > raw.tmp
mv raw.tmp "$RAW"
comm -23 "$ROOT_DOMAINS" <(printf "%s" "$dead_root_domains") > root.tmp
mv root.tmp "$ROOT_DOMAINS"

log_event "$dead_root_domains" dead raw
}
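
Two text transforms do most of the work in check_subdomains: a sed that rewrites each plain domain into the ||domain^ rule form that dead-domains-linter reads, and a second sed that strips a known subdomain label to recover the root domain. A rough sketch with throwaway input, using www only as a plausible entry from config/subdomains.txt:

    printf 'www.example.com\n' | sed 's/^/||/; s/$/^/'   # -> ||www.example.com^
    printf 'www.example.com\n' | sed 's/^www\.//'        # -> example.com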

function check_redundant {
sed 's/^/||/; s/$/^/' "$redundant_domains_file" > formatted_redundant_domains_file.tmp # Format redundant domains file
dead-domains-linter -i formatted_redundant_domains_file.tmp --export dead.tmp # Find and export dead redundant domains
[[ ! -s dead.tmp ]] && return # Return if no dead domains found
check_redundant() {
sed 's/^/||/; s/$/^/' "$REDUNDANT_DOMAINS" > formatted_redundant_domains.tmp

# Find and export dead redundant domains
dead-domains-linter -i formatted_redundant_domains.tmp --export dead.tmp
[[ ! -s dead.tmp ]] && return

# Remove dead redundant domains from redundant domains file
comm -23 "$redundant_domains_file" dead.tmp > redundant.tmp && mv redundant.tmp "$redundant_domains_file"
cat dead.tmp >> "$dead_domains_file" # Collate dead redundant domains to filter out from newly retrieved domains
format_list "$dead_domains_file"

while read -r wildcard; do # Loop through wildcards
# If no matches remaining, consider wildcard as dead
! grep -q "\.${wildcard}$" "$redundant_domains_file" && printf "%s\n" "$wildcard" >> collated_dead_wildcards.tmp
done < "$wildcards_file"
[[ ! -f collated_dead_wildcards.tmp ]] && return # Return if no unused wildcards found
comm -23 "$REDUNDANT_DOMAINS" dead.tmp > redundant.tmp
mv redundant.tmp "$REDUNDANT_DOMAINS"

# Cache dead redundant domains to filter out from newly retrieved domains
cat dead.tmp >> "$DEAD_DOMAINS"
format_list "$DEAD_DOMAINS"

# Find unused wildcard
while read -r wildcard; do
# If no matches, consider wildcard as unused/dead
! grep -q "\.${wildcard}$" "$REDUNDANT_DOMAINS" &&
printf "%s\n" "$wildcard" >> collated_dead_wildcards.tmp
done < "$WILDCARDS"
[[ ! -f collated_dead_wildcards.tmp ]] && return
sort -u collated_dead_wildcards.tmp -o collated_dead_wildcards.tmp

# Remove unused wildcards from raw file and wildcards file
comm -23 "$raw_file" collated_dead_wildcards.tmp > raw.tmp && mv raw.tmp "$raw_file"
comm -23 "$wildcards_file" collated_dead_wildcards.tmp > wildcards.tmp && mv wildcards.tmp "$wildcards_file"
log_event "$(<collated_dead_wildcards.tmp)" "dead" "wildcard"
comm -23 "$RAW" collated_dead_wildcards.tmp > raw.tmp
mv raw.tmp "$RAW"
comm -23 "$WILDCARDS" collated_dead_wildcards.tmp > wildcards.tmp
mv wildcards.tmp "$WILDCARDS"

log_event "$(<collated_dead_wildcards.tmp)" dead wildcard
}
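
The wildcard cleanup above treats a wildcard as still in use only while at least one redundant domain ends in .<wildcard>; the same test written out on its own, with a made-up wildcard value:

    # Made-up wildcard entry; data/redundant_domains.txt is the file the script greps.
    wildcard='scam.example'
    if ! grep -q "\.${wildcard}$" data/redundant_domains.txt; then
        printf '%s\n' "$wildcard" >> collated_dead_wildcards.tmp   # no matches left, so mark it unused
    fi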

function check_dead {
comm -23 "$raw_file" <(sort "$root_domains_file" "$wildcards_file") | # Exclude wildcards and root domains of subdomains
sed 's/^/||/; s/$/^/' > formatted_raw_file.tmp # Format raw file
dead-domains-linter -i formatted_raw_file.tmp --export dead_in_raw.tmp # Find and export dead domains
[[ ! -s dead_in_raw.tmp ]] && return # Return if no dead domains found
check_dead() {
# Exclude wildcards and root domains of subdomains
comm -23 "$RAW" <(sort "$ROOT_DOMAINS" "$WILDCARDS") |
sed 's/^/||/; s/$/^/' > formatted_raw.tmp

# Find and export dead domains
dead-domains-linter -i formatted_raw.tmp --export dead_in_raw.tmp
[[ ! -s dead_in_raw.tmp ]] && return

# Remove dead domains from raw file
comm -23 "$raw_file" dead_in_raw.tmp > raw.tmp && mv raw.tmp "$raw_file"
log_event "$(<dead_in_raw.tmp)" "dead" "raw"
comm -23 "$RAW" dead_in_raw.tmp > raw.tmp && mv raw.tmp "$RAW"

log_event "$(<dead_in_raw.tmp)" dead raw
}
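
comm -23 prints the lines that appear only in its first sorted input, which is how check_dead leaves wildcards and tracked root domains out of the lint run. A toy run with made-up files (both inputs must be sorted, which format_list is assumed to guarantee for the raw file):

    printf 'a.example\nb.example\nc.example\n' > raw.demo.tmp
    printf 'b.example\n' > excluded.demo.tmp
    comm -23 raw.demo.tmp excluded.demo.tmp    # prints a.example and c.example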

function check_alive {
sed 's/^/||/; s/$/^/' "$dead_domains_file" > formatted_dead_domains_file.tmp # Format dead domains file
dead-domains-linter -i formatted_dead_domains_file.tmp --export dead.tmp # Find dead domains in the dead domains file
alive_domains=$(comm -23 <(sort "$dead_domains_file") <(sort dead.tmp)) # Find resurrected domains in dead domains file (note dead domains file is unsorted)
[[ -z "$alive_domains" ]] && return # Return if no resurrected domains found
cp dead.tmp "$dead_domains_file" # Update dead domains file to exclude resurrected domains

# Strip away subdomains from alive domains since subdomains are not supposed to be in raw file
while read -r subdomain; do # Loop through common subdomains
alive_domains=$(printf "%s" "$alive_domains" | sed "s/^${subdomain}\.//" | sort -u)
done < "$subdomains_to_remove_file"
printf "%s\n" "$alive_domains" >> "$raw_file" # Add resurrected domains to raw file
format_list "$dead_domains_file" && format_list "$raw_file"
log_event "$alive_domains" "resurrected" "dead_domains_file"
check_alive() {
sed 's/^/||/; s/$/^/' "$DEAD_DOMAINS" > formatted_dead_domains.tmp

# Find and export dead domains
dead-domains-linter -i formatted_dead_domains.tmp --export dead.tmp

# Find resurrected domains in dead domains file (note dead domains file is unsorted)
alive_domains="$(comm -23 <(sort "$DEAD_DOMAINS") <(sort dead.tmp))"
[[ -z "$alive_domains" ]] && return

# Update dead domains file to only include dead domains
cp dead.tmp "$DEAD_DOMAINS"
format_list "$DEAD_DOMAINS"

# Strip away subdomains from alive domains as subdomains are not supposed to be in raw file
while read -r subdomain; do
alive_domains="$(printf "%s" "$alive_domains" | sed "s/^${subdomain}\.//" | sort -u)"
done < "$SUBDOMAINS_TO_REMOVE"

printf "%s\n" "$alive_domains" >> "$RAW" # Add resurrected domains to raw file
format_list "$RAW"

log_event "$alive_domains" resurrected dead_domains
}
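
comm needs both inputs sorted, which is why check_alive sorts the unsorted dead-domain cache inline; whatever the linter no longer flags is the resurrected set. A toy run with made-up data:

    printf 'b.example\na.example\n' > cached_dead.demo.tmp    # the cache is appended to, so unsorted
    printf 'a.example\n' > still_dead.demo.tmp                # linter still flags only this one
    comm -23 <(sort cached_dead.demo.tmp) <(sort still_dead.demo.tmp)   # -> b.example (resurrected)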

function update_light_file {
comm -12 "$raw_file" "$raw_light_file" > light.tmp && mv light.tmp "$raw_light_file" # Keep only domains found in full raw file
# Function 'update_light_file' removes any domains from the light raw file that
# are not found in the full raw file.
update_light_file() {
comm -12 "$RAW" "$RAW_LIGHT" > light.tmp && mv light.tmp "$RAW_LIGHT"
}
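
The one-liner above and the earlier comm -23 calls differ only in which output columns they suppress; the flag semantics, for reference:

    # comm prints three columns: lines only in file1, lines only in file2, lines in both.
    # -1/-2/-3 suppress those columns, so -12 keeps the common lines (used here)
    # and -23 keeps the file1-only lines (used in the dead checks above).
    comm -12 data/raw.txt data/raw_light.txt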

function prune_dead_domains_file {
[[ $(wc -l < "$dead_domains_file") -gt 5000 ]] && sed -i '1,100d' "$dead_domains_file" || printf "" # printf to negate exit status 1
# Function 'prune_dead_domains_file' removes old entries once the file reaches
# a threshold of entries.
prune_dead_domains_file() {
[[ $(wc -l < "$DEAD_DOMAINS") -gt 5000 ]] && sed -i '1,100d' "$DEAD_DOMAINS"
true
}
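
The prune step keeps the dead-domain cache bounded: once it grows past 5000 lines, the 100 oldest entries are dropped (the cache is append-only, so the first lines are the oldest), and the trailing true only stops a failed threshold test from leaving a non-zero exit status. The same logic written as a plain if:

    if [[ "$(wc -l < data/dead_domains.txt)" -gt 5000 ]]; then
        sed -i '1,100d' data/dead_domains.txt   # delete the 100 oldest cached dead domains
    fi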

function log_event {
# Log domain events
printf "%s\n" "$1" | awk -v type="$2" -v source="$3" -v time="$time_format" '{print time "," type "," $0 "," source}' >> "$domain_log"
# Function 'log_event' logs domain processing events into the domain log
# $1: domains to log stored in a variable.
# $2: event type (dead, whitelisted, etc.)
# $3: source
log_event() {
printf "%s\n" "$1" | awk -v type="$2" -v source="$3" -v time="$(date -u +"%H:%M:%S %d-%m-%y")" \
'{print time "," type "," $0 "," source}' >> "$DOMAIN_LOG"
}
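
log_event emits one CSV row per domain in $1, in time,type,domain,source order. With a made-up domain and timestamp, the awk step produces:

    printf 'scam.example\n' |
        awk -v type=dead -v source=raw -v time='14:30:00 31-03-24' \
            '{print time "," type "," $0 "," source}'
    # -> 14:30:00 31-03-24,dead,scam.example,raw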

function format_list {
bash functions/tools.sh "format" "$1"
# Function 'format_file' is a shell wrapper to standardize the format of a file.
# $1: file to format
format_file() {
bash functions/tools.sh format "$1"
}

function cleanup {
cleanup() {
find . -maxdepth 1 -type f -name "*.tmp" -delete
prune_dead_domains_file
}
7 changes: 2 additions & 5 deletions functions/tools.sh
@@ -1,11 +1,9 @@
#!/bin/bash

# Tools.sh is a shell wrapper intended to store commonly used functions.

# 'format' is called to standardize the format of a file.
# Function 'format' is called to standardize the format of a file.
format() {
local -r file="$1"

[[ ! -f "$file" ]] && return

# Applicable to all files:
@@ -21,8 +19,7 @@ format() {
;;
('config/parked_terms.txt')
# Remove empty lines, convert to lowercase, sort and remove duplicates
sed '/^$/d' "$file" | tr '[:upper:]' '[:lower:]' |
sort -u -o "${file}.tmp"
sed '/^$/d' "$file" | tr '[:upper:]' '[:lower:]' | sort -u -o "${file}.tmp"
mv "${file}.tmp" "$file"
;;
(*.txt|*.tmp)
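
The config/parked_terms.txt branch shown above boils down to a three-stage pipeline: drop empty lines, lowercase, then sort and deduplicate (the script writes the result to a .tmp file and moves it back; this sketch just prints to stdout with throwaway input):

    printf 'Domain For Sale\n\nPARKED\nparked\n' |
        sed '/^$/d' | tr '[:upper:]' '[:lower:]' | sort -u
    # -> domain for sale
    # -> parked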
16 changes: 8 additions & 8 deletions functions/update_readme.sh
@@ -1,14 +1,14 @@
#!/bin/bash
# This script updates README.md content and statistics
# This script updates README.md content and statistics.

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'
readonly SEARCH_TERMS='config/search_terms.csv'
readonly SOURCE_LOG='config/SOURCE_LOG.csv'
readonly TODAY
readonly YESTERDAY
TODAY="$(date -u +"%d-%m-%y")"
YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")"
readonly TODAY
readonly YESTERDAY

update_readme() {
cat << EOF > README.md
@@ -135,15 +135,15 @@ EOF
}

# Function 'print_stats' prints the various statistics for each source
# $1: source to process (leave blank to process all sources)
# $1: source to process (leave blank to process all sources).
print_stats() {
[[ -n "$1" ]] && source="$1" || source='All sources'
printf "%5s |%10s |%8s%% | %s\n" "$(sum "$TODAY" "$1")" \
"$(sum "$YESTERDAY" "$1")" "$(sum_excluded "$1" )" "$source"
}
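
print_stats only lays out numbers it gets from the helper functions into fixed-width columns; with made-up counts (5 today, 10 yesterday, 2% excluded) the printf format renders as:

    printf '%5s |%10s |%8s%% | %s\n' 5 10 2 'All sources'
    # ->     5 |        10 |       2% | All sources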

# Function 'sum' is an echo wrapper that sums up the domains retrieved by that source for
# that particular day
# Function 'sum' is an echo wrapper that sums up the domains retrieved by
# that source for that particular day.
# $1: day to process
# $2: source to process
sum() {
@@ -154,7 +154,7 @@ sum() {
}

# Function 'count_excluded' is an echo wrapper that counts the % of excluded domains
# of raw count retrieved from each source
# of raw count retrieved from each source.
# $1: source to process
count_excluded() {
csvgrep -c 2 -m "$1" "$SOURCE_LOG" | csvgrep -c 14 -m yes > source_rows.tmp
@@ -173,7 +173,7 @@ count_excluded() {
}
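
count_excluded leans on csvkit's csvgrep, where -c picks the column and -m gives the value to match against it, so the line above keeps only rows from one source that are marked excluded in column 14. With a hypothetical source name standing in for $1:

    csvgrep -c 2 -m 'Hypothetical Source' config/SOURCE_LOG.csv |
        csvgrep -c 14 -m yes > source_rows.tmp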

# Function 'format_file' is a shell wrapper to standardize the format of a file
# $1: file to format
# $1: file to format.
format_file() {
bash functions/tools.sh format "$1"
}