
Commit

Update check_dead
jarelllama authored Mar 31, 2024
1 parent 048bcca commit 13d742d
Showing 5 changed files with 167 additions and 111 deletions.
14 changes: 8 additions & 6 deletions functions/build_lists.sh
@@ -1,13 +1,13 @@
#!/bin/bash
# This script builds the various formats of list from the raw files
# This script builds the various formats of list from the raw files.

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'

# Function 'build_lists' builds the two versions of the blocklist in the various formats
# called by the build list functions (see below)
build_lists() {
[[ -z "$comment" ]] && comment='#' # Set default comment character to '#'
# Set default comment character to '#'
[[ -z "$comment" ]] && comment='#'

mkdir -p "lists/${directory}"

# Loop through the full and light blocklist versions
@@ -21,6 +21,7 @@ build_lists() {
list_name='scams_light.txt'
source_file="$RAW_LIGHT"
fi

blocklist_path="lists/${directory}/${list_name}"

cat << EOF > "$blocklist_path" # Append header onto blocklist
@@ -36,19 +37,20 @@ EOF

# Special case for Unbound format
[[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path"

# Append formatted domains onto blocklist
printf "%s\n" "$(awk -v before="$before" -v after="$after" \
'{print before $0 after}' "$source_file")" >> "$blocklist_path"
done
}
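
For context, the awk call above is what turns each plain domain into a rule for the target format: it prepends $before and appends $after, values that the build list functions further down set per syntax. A minimal sketch with assumed Adblock Plus-style values (the real pair is defined in the part of this file not shown in the diff):

    # Hypothetical prefix/suffix; the actual pair comes from the calling build list function.
    before='||'
    after='^'
    awk -v before="$before" -v after="$after" '{print before $0 after}' data/raw.txt
    # example.com  ->  ||example.com^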

# Function 'format_file' is a shell wrapper to standardize the format of a file
# Function 'format_file' is a shell wrapper to standardize the format of a file.
# $1: file to format
format_file() {
bash functions/tools.sh format "$1"
}

# Build list functions:
# Build list functions are to specify the syntax of the list format.
# $syntax: name of list syntax
# $directory: directory to create list in
# $comment: character used for comments (default:#)
206 changes: 130 additions & 76 deletions functions/check_dead.sh
@@ -1,112 +1,166 @@
#!/bin/bash
raw_file='data/raw.txt'
raw_light_file='data/raw_light.txt'
domain_log='config/domain_log.csv'
root_domains_file='data/root_domains.txt'
subdomains_file='data/subdomains.txt'
subdomains_to_remove_file='config/subdomains.txt'
wildcards_file='data/wildcards.txt'
redundant_domains_file='data/redundant_domains.txt'
dead_domains_file='data/dead_domains.txt'
time_format=$(date -u +"%H:%M:%S %d-%m-%y")

function main {
npm i -g @adguard/dead-domains-linter # Install AdGuard Dead Domains Linter
for file in config/* data/*; do # Format files in the config and data directory
# This script checks for dead and resurrected domains and
# removes/adds them accordingly.

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'
readonly ROOT_DOMAINS='data/root_domains.txt'
readonly SUBDOMAINS='data/subdomains.txt'
readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt'
readonly WILDCARDS='data/wildcards.txt'
readonly REDUNDANT_DOMAINS='data/redundant_domains.txt'
readonly DEAD_DOMAINS='data/dead_domains.txt'
readonly DOMAIN_LOG='config/domain_log.csv'

main() {
# Install AdGuard's Dead Domains Linter
npm i -g @adguard/dead-domains-linter

for file in config/* data/*; do
format_list "$file"
done

check_subdomains
check_redundant
check_dead
check_alive
cat dead_in_raw.tmp >> "$dead_domains_file" # Collate dead domains (skip alive check)
format_list "$dead_domains_file"
update_light_file

# Cache dead domains (skip processing dead domains through alive check)
cat dead_in_raw.tmp >> "$DEAD_DOMAINS"
format_list "$DEAD_DOMAINS"
}

function check_subdomains {
sed 's/^/||/; s/$/^/' "$subdomains_file" > formatted_subdomains_file.tmp # Format subdomains file
dead-domains-linter -i formatted_subdomains_file.tmp --export dead.tmp # Find and export dead domains with subdomains
[[ ! -s dead.tmp ]] && return # Return if no dead domains found
check_subdomains() {
sed 's/^/||/; s/$/^/' "$SUBDOMAINS" > formatted_subdomains.tmp

# Find and export dead domains with subdomains
dead-domains-linter -i formatted_subdomains.tmp --export dead.tmp
[[ ! -s dead.tmp ]] && return

# Remove dead subdomains from subdomains file
comm -23 "$subdomains_file" dead.tmp > subdomains.tmp && mv subdomains.tmp "$subdomains_file"
cat dead.tmp >> "$dead_domains_file" # Collate dead subdomains to filter out from newly retrieved domains
format_list "$dead_domains_file"
comm -23 "$SUBDOMAINS" dead.tmp > subdomains.tmp
mv subdomains.tmp "$SUBDOMAINS"

# Cache dead subdomains to filter out from newly retrieved domains
cat dead.tmp >> "$DEAD_DOMAINS"
format_list "$DEAD_DOMAINS"

# Strip dead domains with subdomains to their root domains
while read -r subdomain; do
dead_root_domains="$(sed "s/^${subdomain}\.//" dead.tmp | sort -u)"
done < "$SUBDOMAINS_TO_REMOVE"

while read -r subdomain; do # Loop through common subdomains
dead_root_domains=$(sed "s/^${subdomain}\.//" dead.tmp | sort -u) # Strip to root domains
done < "$subdomains_to_remove_file"
# Remove dead root domains from raw file and root domains file
comm -23 "$raw_file" <(printf "%s" "$dead_root_domains") > raw.tmp && mv raw.tmp "$raw_file"
comm -23 "$root_domains_file" <(printf "%s" "$dead_root_domains") > root.tmp && mv root.tmp "$root_domains_file"
log_event "$dead_root_domains" "dead" "raw"
comm -23 "$RAW" <(printf "%s" "$dead_root_domains") > raw.tmp
mv raw.tmp "$RAW"
comm -23 "$ROOT_DOMAINS" <(printf "%s" "$dead_root_domains") > root.tmp
mv root.tmp "$ROOT_DOMAINS"

log_event "$dead_root_domains" dead raw
}
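
Two text transforms do most of the work in check_subdomains: a sed that rewrites each plain domain into the ||domain^ rule form that dead-domains-linter reads, and a second sed that strips a known subdomain label to recover the root domain. A rough sketch with throwaway input, using www only as a plausible entry from config/subdomains.txt:

    printf 'www.example.com\n' | sed 's/^/||/; s/$/^/'   # -> ||www.example.com^
    printf 'www.example.com\n' | sed 's/^www\.//'        # -> example.com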

function check_redundant {
sed 's/^/||/; s/$/^/' "$redundant_domains_file" > formatted_redundant_domains_file.tmp # Format redundant domains file
dead-domains-linter -i formatted_redundant_domains_file.tmp --export dead.tmp # Find and export dead redundant domains
[[ ! -s dead.tmp ]] && return # Return if no dead domains found
check_redundant() {
sed 's/^/||/; s/$/^/' "$REDUNDANT_DOMAINS" > formatted_redundant_domains.tmp

# Find and export dead redundant domains
dead-domains-linter -i formatted_redundant_domains.tmp --export dead.tmp
[[ ! -s dead.tmp ]] && return

# Remove dead redundant domains from redundant domains file
comm -23 "$redundant_domains_file" dead.tmp > redundant.tmp && mv redundant.tmp "$redundant_domains_file"
cat dead.tmp >> "$dead_domains_file" # Collate dead redundant domains to filter out from newly retrieved domains
format_list "$dead_domains_file"

while read -r wildcard; do # Loop through wildcards
# If no matches remaining, consider wildcard as dead
! grep -q "\.${wildcard}$" "$redundant_domains_file" && printf "%s\n" "$wildcard" >> collated_dead_wildcards.tmp
done < "$wildcards_file"
[[ ! -f collated_dead_wildcards.tmp ]] && return # Return if no unused wildcards found
comm -23 "$REDUNDANT_DOMAINS" dead.tmp > redundant.tmp
mv redundant.tmp "$REDUNDANT_DOMAINS"

# Cache dead redundant domains to filter out from newly retrieved domains
cat dead.tmp >> "$DEAD_DOMAINS"
format_list "$DEAD_DOMAINS"

# Find unused wildcard
while read -r wildcard; do
# If no matches, consider wildcard as unused/dead
! grep -q "\.${wildcard}$" "$REDUNDANT_DOMAINS" &&
printf "%s\n" "$wildcard" >> collated_dead_wildcards.tmp
done < "$WILDCARDS"
[[ ! -f collated_dead_wildcards.tmp ]] && return
sort -u collated_dead_wildcards.tmp -o collated_dead_wildcards.tmp

# Remove unused wildcards from raw file and wildcards file
comm -23 "$raw_file" collated_dead_wildcards.tmp > raw.tmp && mv raw.tmp "$raw_file"
comm -23 "$wildcards_file" collated_dead_wildcards.tmp > wildcards.tmp && mv wildcards.tmp "$wildcards_file"
log_event "$(<collated_dead_wildcards.tmp)" "dead" "wildcard"
comm -23 "$RAW" collated_dead_wildcards.tmp > raw.tmp
mv raw.tmp "$RAW"
comm -23 "$WILDCARDS" collated_dead_wildcards.tmp > wildcards.tmp
mv wildcards.tmp "$WILDCARDS"

log_event "$(<collated_dead_wildcards.tmp)" dead wildcard
}
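
The wildcard cleanup above treats a wildcard as still in use only while at least one redundant domain ends in .<wildcard>; the same test written out on its own, with a made-up wildcard value:

    # Made-up wildcard entry; data/redundant_domains.txt is the file the script greps.
    wildcard='scam.example'
    if ! grep -q "\.${wildcard}$" data/redundant_domains.txt; then
        printf '%s\n' "$wildcard" >> collated_dead_wildcards.tmp   # no matches left, so mark it unused
    fi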

function check_dead {
comm -23 "$raw_file" <(sort "$root_domains_file" "$wildcards_file") | # Exclude wildcards and root domains of subdomains
sed 's/^/||/; s/$/^/' > formatted_raw_file.tmp # Format raw file
dead-domains-linter -i formatted_raw_file.tmp --export dead_in_raw.tmp # Find and export dead domains
[[ ! -s dead_in_raw.tmp ]] && return # Return if no dead domains found
check_dead() {
# Exclude wildcards and root domains of subdomains
comm -23 "$RAW" <(sort "$ROOT_DOMAINS" "$WILDCARDS") |
sed 's/^/||/; s/$/^/' > formatted_raw.tmp

# Find and export dead domains
dead-domains-linter -i formatted_raw.tmp --export dead_in_raw.tmp
[[ ! -s dead_in_raw.tmp ]] && return

# Remove dead domains from raw file
comm -23 "$raw_file" dead_in_raw.tmp > raw.tmp && mv raw.tmp "$raw_file"
log_event "$(<dead_in_raw.tmp)" "dead" "raw"
comm -23 "$RAW" dead_in_raw.tmp > raw.tmp && mv raw.tmp "$RAW"

log_event "$(<dead_in_raw.tmp)" dead raw
}
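
comm -23 prints the lines that appear only in its first sorted input, which is how check_dead leaves wildcards and tracked root domains out of the lint run. A toy run with made-up files (both inputs must be sorted, which format_list is assumed to guarantee for the raw file):

    printf 'a.example\nb.example\nc.example\n' > raw.demo.tmp
    printf 'b.example\n' > excluded.demo.tmp
    comm -23 raw.demo.tmp excluded.demo.tmp    # prints a.example and c.example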

function check_alive {
sed 's/^/||/; s/$/^/' "$dead_domains_file" > formatted_dead_domains_file.tmp # Format dead domains file
dead-domains-linter -i formatted_dead_domains_file.tmp --export dead.tmp # Find dead domains in the dead domains file
alive_domains=$(comm -23 <(sort "$dead_domains_file") <(sort dead.tmp)) # Find resurrected domains in dead domains file (note dead domains file is unsorted)
[[ -z "$alive_domains" ]] && return # Return if no resurrected domains found
cp dead.tmp "$dead_domains_file" # Update dead domains file to exclude resurrected domains

# Strip away subdomains from alive domains since subdomains are not supposed to be in raw file
while read -r subdomain; do # Loop through common subdomains
alive_domains=$(printf "%s" "$alive_domains" | sed "s/^${subdomain}\.//" | sort -u)
done < "$subdomains_to_remove_file"
printf "%s\n" "$alive_domains" >> "$raw_file" # Add resurrected domains to raw file
format_list "$dead_domains_file" && format_list "$raw_file"
log_event "$alive_domains" "resurrected" "dead_domains_file"
check_alive() {
sed 's/^/||/; s/$/^/' "$DEAD_DOMAINS" > formatted_dead_domains.tmp

# Find and export dead domains
dead-domains-linter -i formatted_dead_domains.tmp --export dead.tmp

# Find resurrected domains in dead domains file (note dead domains file is unsorted)
alive_domains="$(comm -23 <(sort "$DEAD_DOMAINS") <(sort dead.tmp))"
[[ -z "$alive_domains" ]] && return

# Update dead domains file to only include dead domains
cp dead.tmp "$DEAD_DOMAINS"
format_list "$DEAD_DOMAINS"

# Strip away subdomains from alive domains as subdomains are not supposed to be in raw file
while read -r subdomain; do
alive_domains="$(printf "%s" "$alive_domains" | sed "s/^${subdomain}\.//" | sort -u)"
done < "$SUBDOMAINS_TO_REMOVE"

printf "%s\n" "$alive_domains" >> "$RAW" # Add resurrected domains to raw file
format_list "$RAW"

log_event "$alive_domains" resurrected dead_domains
}
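
comm needs both inputs sorted, which is why check_alive sorts the unsorted dead-domain cache inline; whatever the linter no longer flags is the resurrected set. A toy run with made-up data:

    printf 'b.example\na.example\n' > cached_dead.demo.tmp    # the cache is appended to, so unsorted
    printf 'a.example\n' > still_dead.demo.tmp                # linter still flags only this one
    comm -23 <(sort cached_dead.demo.tmp) <(sort still_dead.demo.tmp)   # -> b.example (resurrected)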

function update_light_file {
comm -12 "$raw_file" "$raw_light_file" > light.tmp && mv light.tmp "$raw_light_file" # Keep only domains found in full raw file
# Function 'update_light_file' removes any domains from the light raw file that
# are not found in the full raw file.
update_light_file() {
comm -12 "$RAW" "$RAW_LIGHT" > light.tmp && mv light.tmp "$RAW_LIGHT"
}
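
The one-liner above and the earlier comm -23 calls differ only in which output columns they suppress; the flag semantics, for reference:

    # comm prints three columns: lines only in file1, lines only in file2, lines in both.
    # -1/-2/-3 suppress those columns, so -12 keeps the common lines (used here)
    # and -23 keeps the file1-only lines (used in the dead checks above).
    comm -12 data/raw.txt data/raw_light.txt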

function prune_dead_domains_file {
[[ $(wc -l < "$dead_domains_file") -gt 5000 ]] && sed -i '1,100d' "$dead_domains_file" || printf "" # printf to negate exit status 1
# Function 'prune_dead_domains_file' removes old entries once the file reaches
# a threshold of entries.
prune_dead_domains_file() {
[[ $(wc -l < "$DEAD_DOMAINS") -gt 5000 ]] && sed -i '1,100d' "$DEAD_DOMAINS"
true
}
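
The prune step keeps the dead-domain cache bounded: once it grows past 5000 lines, the 100 oldest entries are dropped (the cache is append-only, so the first lines are the oldest), and the trailing true only stops a failed threshold test from leaving a non-zero exit status. The same logic written as a plain if:

    if [[ "$(wc -l < data/dead_domains.txt)" -gt 5000 ]]; then
        sed -i '1,100d' data/dead_domains.txt   # delete the 100 oldest cached dead domains
    fi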

function log_event {
# Log domain events
printf "%s\n" "$1" | awk -v type="$2" -v source="$3" -v time="$time_format" '{print time "," type "," $0 "," source}' >> "$domain_log"
# Function 'log_event' logs domain processing events into the domain log
# $1: domains to log stored in a variable.
# $2: event type (dead, whitelisted, etc.)
# $3: source
log_event() {
printf "%s\n" "$1" | awk -v type="$2" -v source="$3" -v time="$(date -u +"%H:%M:%S %d-%m-%y")" \
'{print time "," type "," $0 "," source}' >> "$DOMAIN_LOG"
}
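
log_event emits one CSV row per domain in $1, in time,type,domain,source order. With a made-up domain and timestamp, the awk step produces:

    printf 'scam.example\n' |
        awk -v type=dead -v source=raw -v time='14:30:00 31-03-24' \
            '{print time "," type "," $0 "," source}'
    # -> 14:30:00 31-03-24,dead,scam.example,raw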

function format_list {
bash functions/tools.sh "format" "$1"
# Function 'format_file' is a shell wrapper to standardize the format of a file.
# $1: file to format
format_file() {
bash functions/tools.sh format "$1"
}

function cleanup {
cleanup() {
find . -maxdepth 1 -type f -name "*.tmp" -delete
prune_dead_domains_file
}
7 changes: 2 additions & 5 deletions functions/tools.sh
@@ -1,11 +1,9 @@
#!/bin/bash

# Tools.sh is a shell wrapper intended to store commonly used functions.

# 'format' is called to standardize the format of a file.
# Function 'format' is called to standardize the format of a file.
format() {
local -r file="$1"

[[ ! -f "$file" ]] && return

# Applicable to all files:
@@ -21,8 +19,7 @@ format() {
;;
('config/parked_terms.txt')
# Remove empty lines, convert to lowercase, sort and remove duplicates
sed '/^$/d' "$file" | tr '[:upper:]' '[:lower:]' |
sort -u -o "${file}.tmp"
sed '/^$/d' "$file" | tr '[:upper:]' '[:lower:]' | sort -u -o "${file}.tmp"
mv "${file}.tmp" "$file"
;;
(*.txt|*.tmp)
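
The config/parked_terms.txt branch shown above boils down to a three-stage pipeline: drop empty lines, lowercase, then sort and deduplicate (the script writes the result to a .tmp file and moves it back; this sketch just prints to stdout with throwaway input):

    printf 'Domain For Sale\n\nPARKED\nparked\n' |
        sed '/^$/d' | tr '[:upper:]' '[:lower:]' | sort -u
    # -> domain for sale
    # -> parked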
16 changes: 8 additions & 8 deletions functions/update_readme.sh
@@ -1,14 +1,14 @@
#!/bin/bash
# This script updates README.md content and statistics
# This script updates README.md content and statistics.

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'
readonly SEARCH_TERMS='config/search_terms.csv'
readonly SOURCE_LOG='config/SOURCE_LOG.csv'
readonly TODAY
readonly YESTERDAY
TODAY="$(date -u +"%d-%m-%y")"
YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")"
readonly TODAY
readonly YESTERDAY

update_readme() {
cat << EOF > README.md
@@ -135,15 +135,15 @@ EOF
}

# Function 'print_stats' prints the various statistics for each source
# $1: source to process (leave blank to process all sources)
# $1: source to process (leave blank to process all sources).
print_stats() {
[[ -n "$1" ]] && source="$1" || source='All sources'
printf "%5s |%10s |%8s%% | %s\n" "$(sum "$TODAY" "$1")" \
"$(sum "$YESTERDAY" "$1")" "$(sum_excluded "$1" )" "$source"
}
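
print_stats only lays out numbers it gets from the helper functions into fixed-width columns; with made-up counts (5 today, 10 yesterday, 2% excluded) the printf format renders as:

    printf '%5s |%10s |%8s%% | %s\n' 5 10 2 'All sources'
    # ->     5 |        10 |       2% | All sources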

# Function 'sum' is an echo wrapper that sums up the domains retrieved by that source for
# that particular day
# Function 'sum' is an echo wrapper that sums up the domains retrieved by
# that source for that particular day.
# $1: day to process
# $2: source to process
sum() {
@@ -154,7 +154,7 @@ sum() {
}

# Function 'count_excluded' is an echo wrapper that counts the % of excluded domains
# of raw count retrieved from each source
# of raw count retrieved from each source.
# $1: source to process
count_excluded() {
csvgrep -c 2 -m "$1" "$SOURCE_LOG" | csvgrep -c 14 -m yes > source_rows.tmp
@@ -173,7 +173,7 @@ count_excluded() {
}
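
count_excluded leans on csvkit's csvgrep, where -c picks the column and -m gives the value to match against it, so the line above keeps only rows from one source that are marked excluded in column 14. With a hypothetical source name standing in for $1:

    csvgrep -c 2 -m 'Hypothetical Source' config/SOURCE_LOG.csv |
        csvgrep -c 14 -m yes > source_rows.tmp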

# Function 'format_file' is a shell wrapper to standardize the format of a file
# $1: file to format
# $1: file to format.
format_file() {
bash functions/tools.sh format "$1"
}