From 048bcca40fb5a9c3477290cf4011a7fd39a65dfc Mon Sep 17 00:00:00 2001 From: Jarell <91372088+jarelllama@users.noreply.github.com> Date: Sun, 31 Mar 2024 13:05:46 +0800 Subject: [PATCH] Tidy build_lists.sh --- functions/build_lists.sh | 104 ++++++++++++++++++++++++++----------- functions/update_readme.sh | 17 +++--- functions/validate_raw.sh | 19 +++---- 3 files changed, 95 insertions(+), 45 deletions(-) diff --git a/functions/build_lists.sh b/functions/build_lists.sh index 120474d65..0a64125ff 100644 --- a/functions/build_lists.sh +++ b/functions/build_lists.sh @@ -1,23 +1,26 @@ #!/bin/bash -raw_file='data/raw.txt' -raw_light_file='data/raw_light.txt' +# This script builds the various formats of list from the raw files -function main { - build_adblock - build_dnsmasq - build_unbound - build_wildcard_asterisk - build_wildcard_domains -} +readonly RAW='data/raw.txt' +readonly RAW_LIGHT='data/raw_light.txt' -function build_lists { - [[ -z "$comment" ]] && comment='#' # Set default comment to '#' - mkdir -p "lists/${directory}" # Create directory if not present +# Function 'build_lists' builds the two versions of the blocklist in the various formats +# called by the build list functions (see below) +build_lists() { + [[ -z "$comment" ]] && comment='#' # Set default comment character to '#' + mkdir -p "lists/${directory}" - # Loop through the two blocklist versions + # Loop through the full and light blocklist versions for i in {1..2}; do - [[ "$i" -eq 1 ]] && { list_name='scams.txt'; version=''; source_file="$raw_file"; } - [[ "$i" -eq 2 ]] && { list_name='scams_light.txt'; version='LIGHT VERSION'; source_file="$raw_light_file"; } + if [[ "$i" -eq 1 ]]; then + version='' + list_name='scams.txt' + source_file="$RAW" + elif [[ "$i" -eq 2 ]]; then + version='LIGHT VERSION' + list_name='scams_light.txt' + source_file="$RAW_LIGHT" + fi blocklist_path="lists/${directory}/${list_name}" cat << EOF > "$blocklist_path" # Append header onto blocklist @@ -31,39 +34,80 @@ 
${comment} Total number of entries: $(wc -l < "$source_file") ${comment} EOF - [[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path" # Special case for Unbound format + # Special case for Unbound format + [[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path" # Append formatted domains onto blocklist - printf "%s\n" "$(awk -v before="$before" -v after="$after" '{print before $0 after}' "$source_file")" >> "$blocklist_path" + printf "%s\n" "$(awk -v before="$before" -v after="$after" \ + '{print before $0 after}' "$source_file")" >> "$blocklist_path" done } -function format_list { - bash functions/tools.sh "format" "$1" +# Function 'format_file' is a shell wrapper to standardize the format of a file +# $1: file to format +format_file() { + bash functions/tools.sh format "$1" } -function build_adblock { - syntax='Adblock Plus' && directory='adblock' && comment='!' && before='||' && after='^' +# Build list functions: +# $syntax: name of list syntax +# $directory: directory to create list in +# $comment: character used for comments (default:#) +# $before: characters to prepend to each domain +# $after: characters to append after each domain + +build_adblock() { + local syntax='Adblock Plus' + local directory='adblock' + local comment='!' + local before='||' + local after='^' build_lists } -function build_dnsmasq { - syntax='Dnsmasq' && directory='dnsmasq' && comment='' && before='local=/' && after='/' +build_dnsmasq() { + local syntax='Dnsmasq' + local directory='dnsmasq' + local comment='' + local before='local=/' + local after='/' build_lists } -function build_unbound { - syntax='Unbound' && directory='unbound' && comment='' && before='local-zone: "' && after='." always_nxdomain' +build_unbound() { + local syntax='Unbound' + local directory='unbound' + local comment='' + local before='local-zone: "' + local after='." 
always_nxdomain' build_lists } -function build_wildcard_asterisk { - syntax='Wildcard Asterisk' && directory='wildcard_asterisk' && comment='' && before='*.' && after='' +build_wildcard_asterisk() { + local syntax='Wildcard Asterisk' + local directory='wildcard_asterisk' + local comment='' + local before='*.' + local after='' build_lists } -function build_wildcard_domains { - syntax='Wildcard Domains' && directory='wildcard_domains' && comment='' && before='' && after='' +build_wildcard_domains() { + local syntax='Wildcard Domains' + local directory='wildcard_domains' + local comment='' + local before='' + local after='' build_lists } -main +# Entry point + +for file in config/* data/*; do + format_file "$file" +done + +build_adblock +build_dnsmasq +build_unbound +build_wildcard_asterisk +build_wildcard_domains \ No newline at end of file diff --git a/functions/update_readme.sh b/functions/update_readme.sh index cc7009145..8723ffde6 100644 --- a/functions/update_readme.sh +++ b/functions/update_readme.sh @@ -1,12 +1,14 @@ #!/bin/bash +# This script updates README.md content and statistics + readonly RAW='data/raw.txt' readonly RAW_LIGHT='data/raw_light.txt' readonly SEARCH_TERMS='config/search_terms.csv' readonly SOURCE_LOG='config/SOURCE_LOG.csv' -TODAY="$(date -u +"%d-%m-%y")" -YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")" readonly TODAY readonly YESTERDAY +TODAY="$(date -u +"%d-%m-%y")" +YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")" update_readme() { cat << EOF > README.md @@ -142,7 +144,8 @@ print_stats() { # Function 'sum' is an echo wrapper that sums up the domains retrieved by that source for # that particular day -# $1: source to count +# $1: day to process +# $2: source to process sum() { # Print dash if no runs for that day found ! 
grep -qF "$1" "$SOURCE_LOG" && { printf "-"; return; } @@ -152,7 +155,7 @@ sum() { # Function 'count_excluded' is an echo wrapper that counts the % of excluded domains # of raw count retrieved from each source -# $1: source to count +# $1: source to process count_excluded() { csvgrep -c 2 -m "$1" "$SOURCE_LOG" | csvgrep -c 14 -m yes > source_rows.tmp @@ -163,10 +166,10 @@ count_excluded() { dead_count="$(csvcut -c 7 source_rows.tmp | awk '{total += $1} END {print total}')" redundant_count="$(csvcut -c 8 source_rows.tmp | awk '{total += $1} END {print total}')" parked_count="$(csvcut -c 9 source_rows.tmp | awk '{total += $1} END {print total}')" + rm source_rows.tmp + excluded_count="$((white_count + dead_count + redundant_count + parked_count))" printf "%s" "$((excluded_count*100/raw_count))" - - rm source_rows.tmp } # Function 'format_file' is a shell wrapper to standardize the format of a file @@ -175,6 +178,8 @@ format_file() { bash functions/tools.sh format "$1" } +# Entry point + command -v csvgrep &> /dev/null || pip install -q csvkit # Install csvkit for file in config/* data/*; do diff --git a/functions/validate_raw.sh b/functions/validate_raw.sh index 1d00fad55..ec869443b 100644 --- a/functions/validate_raw.sh +++ b/functions/validate_raw.sh @@ -1,6 +1,5 @@ #!/bin/bash - -# Validates the domains in the raw file via a variety of checks +# This script validates the domains in the raw file via a variety of checks readonly RAW='data/raw.txt' readonly RAW_LIGHT='data/raw_light.txt' @@ -13,8 +12,8 @@ readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt' readonly WILDCARDS='data/wildcards.txt' readonly REDUNDANT_DOMAINS='data/redundant_domains.txt' readonly DOMAIN_LOG='config/domain_log.csv' -TIME_FORMAT="$(date -u +"%H:%M:%S %d-%m-%y")" readonly TIME_FORMAT +TIME_FORMAT="$(date -u +"%H:%M:%S %d-%m-%y")" # Function 'validate_raw' stores the domains in the raw file in a variable and validates them # via a variety of checks @@ -41,7 +40,7 @@ validate_raw() { printf 
"%s\n" "$domains_with_subdomains" | sed "s/^${subdomain}\.//" >> root_domains.tmp awk '{print $0 " (subdomain)"}' <<< "$domains_with_subdomains" >> filter_log.tmp - log_event "$domains_with_subdomains" "subdomain" + log_event "$domains_with_subdomains" subdomain done < "$SUBDOMAINS_TO_REMOVE" format_file subdomains.tmp format_file root_domains.tmp @@ -52,7 +51,7 @@ validate_raw() { if [[ "$whitelisted_count" -gt 0 ]]; then domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$whitelisted_domains"))" awk '{print $0 " (whitelisted)"}' <<< "$whitelisted_domains" >> filter_log.tmp - log_event "$whitelisted_domains" "whitelist" + log_event "$whitelisted_domains" whitelist fi # Remove domains that have whitelisted TLDs @@ -61,7 +60,7 @@ validate_raw() { if [[ "$whitelisted_tld_count" -gt 0 ]]; then domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$whitelisted_tld_domains"))" awk '{print $0 " (whitelisted TLD)"}' <<< "$whitelisted_tld_domains" >> filter_log.tmp - log_event "$whitelisted_tld_domains" "tld" + log_event "$whitelisted_tld_domains" tld fi # Remove invalid entries including IP addresses. 
This excludes punycode TLDs (.xn--*) @@ -70,7 +69,7 @@ validate_raw() { if [[ "$invalid_entries_count" -gt 0 ]]; then domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$invalid_entries"))" awk '{print $0 " (invalid)"}' <<< "$invalid_entries" >> filter_log.tmp - log_event "$invalid_entries" "invalid" + log_event "$invalid_entries" invalid fi # Remove redundant domains @@ -91,7 +90,7 @@ validate_raw() { printf "%s\n" "$domain" >> wildcards.tmp awk '{print $0 " (redundant)"}' <<< "$redundant_domains" >> filter_log.tmp - log_event "$redundant_domains" "redundant" + log_event "$redundant_domains" redundant done <<< "$domains" format_file redundant_domains.tmp format_file wildcards.tmp @@ -102,7 +101,7 @@ validate_raw() { if [[ "$toplist_count" -gt 0 ]]; then awk '{print $0 " (toplist) - \033[1;31mmanual removal required\033[0m"}' \ <<< "$domains_in_toplist" >> filter_log.tmp - log_event "$domains_in_toplist" "toplist" + log_event "$domains_in_toplist" toplist fi # Exit if no filtering done @@ -167,6 +166,8 @@ format_file() { bash functions/tools.sh format "$1" } +# Entry point + trap 'find . -maxdepth 1 -type f -name "*.tmp" -delete' EXIT for file in config/* data/*; do