From 048bcca40fb5a9c3477290cf4011a7fd39a65dfc Mon Sep 17 00:00:00 2001
From: Jarell <91372088+jarelllama@users.noreply.github.com>
Date: Sun, 31 Mar 2024 13:05:46 +0800
Subject: [PATCH] Tidy build_lists.sh

---
 functions/build_lists.sh   | 104 ++++++++++++++++++++++++++-----------
 functions/update_readme.sh |  17 +++---
 functions/validate_raw.sh  |  19 +++----
 3 files changed, 95 insertions(+), 45 deletions(-)

diff --git a/functions/build_lists.sh b/functions/build_lists.sh
index 120474d65..0a64125ff 100644
--- a/functions/build_lists.sh
+++ b/functions/build_lists.sh
@@ -1,23 +1,26 @@
 #!/bin/bash
-raw_file='data/raw.txt'
-raw_light_file='data/raw_light.txt'
+# This script builds the various formats of list from the raw files
 
-function main {
-    build_adblock
-    build_dnsmasq
-    build_unbound
-    build_wildcard_asterisk
-    build_wildcard_domains
-}
+readonly RAW='data/raw.txt'
+readonly RAW_LIGHT='data/raw_light.txt'
 
-function build_lists {
-    [[ -z "$comment" ]] && comment='#'  # Set default comment to '#'
-    mkdir -p "lists/${directory}"  # Create directory if not present
+# Function 'build_lists' builds the two versions of the blocklist in the various formats.
+# It is called by the build list functions (see below)
+build_lists() {
+    [[ -z "$comment" ]] && comment='#'  # Set default comment character to '#'
+    mkdir -p "lists/${directory}"
 
-    # Loop through the two blocklist versions
+    # Loop through the full and light blocklist versions
     for i in {1..2}; do
-        [[ "$i" -eq 1 ]] && { list_name='scams.txt'; version=''; source_file="$raw_file"; }
-        [[ "$i" -eq 2 ]] && { list_name='scams_light.txt'; version='LIGHT VERSION'; source_file="$raw_light_file"; }
+        if [[ "$i" -eq 1 ]]; then
+            version=''
+            list_name='scams.txt'
+            source_file="$RAW"
+        elif [[ "$i" -eq 2 ]]; then
+            version='LIGHT VERSION'
+            list_name='scams_light.txt'
+            source_file="$RAW_LIGHT"
+        fi
         blocklist_path="lists/${directory}/${list_name}"
 
         cat << EOF > "$blocklist_path"  # Append header onto blocklist
@@ -31,39 +34,80 @@ ${comment} Total number of entries: $(wc -l < "$source_file")
 ${comment}
 EOF
 
-        [[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path"  # Special case for Unbound format
+        # Special case for Unbound format
+        [[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path"
         # Append formatted domains onto blocklist
-        printf "%s\n" "$(awk -v before="$before" -v after="$after" '{print before $0 after}' "$source_file")" >> "$blocklist_path"
+        printf "%s\n" "$(awk -v before="$before" -v after="$after" \
+            '{print before $0 after}' "$source_file")" >> "$blocklist_path"
     done
 }
 
-function format_list {
-    bash functions/tools.sh "format" "$1"
+# Function 'format_file' is a shell wrapper to standardize the format of a file
+# $1: file to format
+format_file() {
+    bash functions/tools.sh format "$1"
 }
 
-function build_adblock {
-    syntax='Adblock Plus' && directory='adblock' && comment='!' && before='||' && after='^'
+# Build list functions:
+# $syntax: name of list syntax
+# $directory: directory to create list in
+# $comment: character used for comments (default:#)
+# $before: characters to prepend to each domain
+# $after: characters to append to each domain
+
+build_adblock() {
+    local syntax='Adblock Plus'
+    local directory='adblock'
+    local comment='!'
+    local before='||'
+    local after='^'
     build_lists
 }
 
-function build_dnsmasq {
-    syntax='Dnsmasq' && directory='dnsmasq' && comment='' && before='local=/' && after='/'
+build_dnsmasq() {
+    local syntax='Dnsmasq'
+    local directory='dnsmasq'
+    local comment=''
+    local before='local=/'
+    local after='/'
     build_lists
 }
 
-function build_unbound {
-    syntax='Unbound' && directory='unbound' && comment='' && before='local-zone: "' && after='." always_nxdomain'
+build_unbound() {
+    local syntax='Unbound'
+    local directory='unbound'
+    local comment=''
+    local before='local-zone: "'
+    local after='." always_nxdomain'
     build_lists
 }
 
-function build_wildcard_asterisk {
-    syntax='Wildcard Asterisk' && directory='wildcard_asterisk' && comment='' && before='*.' && after=''
+build_wildcard_asterisk() {
+    local syntax='Wildcard Asterisk'
+    local directory='wildcard_asterisk'
+    local comment=''
+    local before='*.'
+    local after=''
     build_lists
 }
 
-function build_wildcard_domains {
-    syntax='Wildcard Domains' && directory='wildcard_domains' && comment='' && before='' && after=''
+build_wildcard_domains() {
+    local syntax='Wildcard Domains'
+    local directory='wildcard_domains'
+    local comment=''
+    local before=''
+    local after=''
     build_lists
 }
 
-main
+# Entry point
+
+for file in config/* data/*; do
+    format_file "$file"
+done
+
+build_adblock
+build_dnsmasq
+build_unbound
+build_wildcard_asterisk
+build_wildcard_domains
diff --git a/functions/update_readme.sh b/functions/update_readme.sh
index cc7009145..8723ffde6 100644
--- a/functions/update_readme.sh
+++ b/functions/update_readme.sh
@@ -1,12 +1,14 @@
 #!/bin/bash
+# This script updates README.md content and statistics
+
 readonly RAW='data/raw.txt'
 readonly RAW_LIGHT='data/raw_light.txt'
 readonly SEARCH_TERMS='config/search_terms.csv'
 readonly SOURCE_LOG='config/SOURCE_LOG.csv'
-TODAY="$(date -u +"%d-%m-%y")"
-YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")"
 readonly TODAY
 readonly YESTERDAY
+TODAY="$(date -u +"%d-%m-%y")"
+YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")"
 
 update_readme() {
     cat << EOF > README.md
@@ -142,7 +144,8 @@ print_stats() {
 
 # Function 'sum' is an echo wrapper that sums up the domains retrieved by that source for
 # that particular day
-# $1: source to count
+# $1: day to process
+# $2: source to process
 sum() {
     # Print dash if no runs for that day found
     ! grep -qF "$1" "$SOURCE_LOG" && { printf "-"; return; }
@@ -152,7 +155,7 @@ sum() {
 
 # Function 'count_excluded' is an echo wrapper that counts the % of excluded domains
 # of raw count retrieved from each source
-# $1: source to count
+# $1: source to process
 count_excluded() {
     csvgrep -c 2 -m "$1" "$SOURCE_LOG" | csvgrep -c 14 -m yes > source_rows.tmp
 
@@ -163,10 +166,10 @@ count_excluded() {
     dead_count="$(csvcut -c 7 source_rows.tmp | awk '{total += $1} END {print total}')"
     redundant_count="$(csvcut -c 8 source_rows.tmp | awk '{total += $1} END {print total}')"
     parked_count="$(csvcut -c 9 source_rows.tmp | awk '{total += $1} END {print total}')"
+    rm source_rows.tmp
+
     excluded_count="$((white_count + dead_count + redundant_count + parked_count))"
     printf "%s" "$((excluded_count*100/raw_count))"
-
-    rm source_rows.tmp
 }
 
 # Function 'format_file' is a shell wrapper to standardize the format of a file
@@ -175,6 +178,8 @@ format_file() {
     bash functions/tools.sh format "$1"
 }
 
+# Entry point
+
 command -v csvgrep &> /dev/null || pip install -q csvkit  # Install csvkit
 
 for file in config/* data/*; do
diff --git a/functions/validate_raw.sh b/functions/validate_raw.sh
index 1d00fad55..ec869443b 100644
--- a/functions/validate_raw.sh
+++ b/functions/validate_raw.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
-
-# Validates the domains in the raw file via a variety of checks
+# This script validates the domains in the raw file via a variety of checks
 
 readonly RAW='data/raw.txt'
 readonly RAW_LIGHT='data/raw_light.txt'
@@ -13,8 +12,8 @@ readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt'
 readonly WILDCARDS='data/wildcards.txt'
 readonly REDUNDANT_DOMAINS='data/redundant_domains.txt'
 readonly DOMAIN_LOG='config/domain_log.csv'
-TIME_FORMAT="$(date -u +"%H:%M:%S %d-%m-%y")"
+TIME_FORMAT="$(date -u +"%H:%M:%S %d-%m-%y")"  # Must be assigned before being marked readonly
 readonly TIME_FORMAT
 
 # Function 'validate_raw' stores the domains in the raw file in a variable and validates them
 # via a variety of checks
@@ -41,7 +40,7 @@ validate_raw() {
         printf "%s\n" "$domains_with_subdomains" | sed "s/^${subdomain}\.//" >> root_domains.tmp
 
         awk '{print $0 " (subdomain)"}' <<< "$domains_with_subdomains" >> filter_log.tmp
-        log_event "$domains_with_subdomains" "subdomain"
+        log_event "$domains_with_subdomains" subdomain
     done < "$SUBDOMAINS_TO_REMOVE"
     format_file subdomains.tmp
     format_file root_domains.tmp
@@ -52,7 +51,7 @@ validate_raw() {
     if [[ "$whitelisted_count" -gt 0 ]]; then
         domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$whitelisted_domains"))"
         awk '{print $0 " (whitelisted)"}' <<< "$whitelisted_domains" >> filter_log.tmp
-        log_event "$whitelisted_domains" "whitelist"
+        log_event "$whitelisted_domains" whitelist
     fi
 
     # Remove domains that have whitelisted TLDs
@@ -61,7 +60,7 @@ validate_raw() {
     if [[ "$whitelisted_tld_count" -gt 0 ]]; then
         domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$whitelisted_tld_domains"))"
         awk '{print $0 " (whitelisted TLD)"}' <<< "$whitelisted_tld_domains" >> filter_log.tmp
-        log_event "$whitelisted_tld_domains" "tld"
+        log_event "$whitelisted_tld_domains" tld
     fi
 
     # Remove invalid entries including IP addresses. This excludes punycode TLDs (.xn--*)
@@ -70,7 +69,7 @@ validate_raw() {
     if [[ "$invalid_entries_count" -gt 0 ]]; then
         domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$invalid_entries"))"
         awk '{print $0 " (invalid)"}' <<< "$invalid_entries" >> filter_log.tmp
-        log_event "$invalid_entries" "invalid"
+        log_event "$invalid_entries" invalid
     fi
 
     # Remove redundant domains
@@ -91,7 +90,7 @@ validate_raw() {
         printf "%s\n" "$domain" >> wildcards.tmp
 
         awk '{print $0 " (redundant)"}' <<< "$redundant_domains" >> filter_log.tmp
-        log_event "$redundant_domains" "redundant"
+        log_event "$redundant_domains" redundant
     done <<< "$domains"
     format_file redundant_domains.tmp
     format_file wildcards.tmp
@@ -102,7 +101,7 @@ validate_raw() {
     if [[ "$toplist_count" -gt 0 ]]; then
         awk '{print $0 " (toplist) - \033[1;31mmanual removal required\033[0m"}' \
             <<< "$domains_in_toplist" >> filter_log.tmp
-        log_event "$domains_in_toplist" "toplist"
+        log_event "$domains_in_toplist" toplist
     fi
 
     # Exit if no filtering done
@@ -167,6 +166,8 @@ format_file() {
     bash functions/tools.sh format "$1"
 }
 
+# Entry point
+
 trap 'find . -maxdepth 1 -type f -name "*.tmp" -delete' EXIT
 
 for file in config/* data/*; do