Skip to content

Commit

Permalink
Tidy build_lists.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
jarelllama authored Mar 31, 2024
1 parent ef011cf commit 048bcca
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 45 deletions.
104 changes: 74 additions & 30 deletions functions/build_lists.sh
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
#!/bin/bash
raw_file='data/raw.txt'
raw_light_file='data/raw_light.txt'
# This script builds the various formats of list from the raw files

# Function 'main' runs every list builder in turn, one per supported format.
# The return status is that of the last builder run.
main() {
    local builder
    for builder in \
        build_adblock \
        build_dnsmasq \
        build_unbound \
        build_wildcard_asterisk \
        build_wildcard_domains; do
        "$builder"
    done
}
readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'

function build_lists {
[[ -z "$comment" ]] && comment='#' # Set default comment to '#'
mkdir -p "lists/${directory}" # Create directory if not present
# Function 'build_lists' builds the two versions of the blocklist (full and light) in the
# format configured by the calling build list function (see below)
build_lists() {
[[ -z "$comment" ]] && comment='#' # Set default comment character to '#'
mkdir -p "lists/${directory}"

# Loop through the two blocklist versions
# Loop through the full and light blocklist versions
for i in {1..2}; do
[[ "$i" -eq 1 ]] && { list_name='scams.txt'; version=''; source_file="$raw_file"; }
[[ "$i" -eq 2 ]] && { list_name='scams_light.txt'; version='LIGHT VERSION'; source_file="$raw_light_file"; }
if [[ "$i" -eq 1 ]]; then
version=''
list_name='scams.txt'
source_file="$RAW"
elif [[ "$i" -eq 2 ]]; then
version='LIGHT VERSION'
list_name='scams_light.txt'
source_file="$RAW_LIGHT"
fi
blocklist_path="lists/${directory}/${list_name}"

cat << EOF > "$blocklist_path" # Append header onto blocklist
Expand All @@ -31,39 +34,80 @@ ${comment} Total number of entries: $(wc -l < "$source_file")
${comment}
EOF

[[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path" # Special case for Unbound format
# Special case for Unbound format
[[ "$syntax" == 'Unbound' ]] && printf "server:\n" >> "$blocklist_path"
# Append formatted domains onto blocklist
printf "%s\n" "$(awk -v before="$before" -v after="$after" '{print before $0 after}' "$source_file")" >> "$blocklist_path"
printf "%s\n" "$(awk -v before="$before" -v after="$after" \
'{print before $0 after}' "$source_file")" >> "$blocklist_path"
done
}

function format_list {
bash functions/tools.sh "format" "$1"
# Function 'format_file' standardizes the format of a file by handing it to
# functions/tools.sh with 'format' as the first argument.
# $1: file to format
# Returns the exit status of the tools.sh invocation.
format_file() {
    local target="$1"
    bash functions/tools.sh format "$target"
}

function build_adblock {
syntax='Adblock Plus' && directory='adblock' && comment='!' && before='||' && after='^'
# Build list functions:
# $syntax: name of list syntax
# $directory: directory to create list in
# $comment: character used for comments (default: #)
# $before: characters to prepend to each domain
# $after: characters to append after each domain

# Builds the Adblock Plus lists in lists/adblock: '!' starts a comment and
# each domain is written as '||domain^'.
# The settings are local, so build_lists reads them through dynamic scoping
# and they do not leak into the other build functions.
build_adblock() {
    local syntax='Adblock Plus' directory='adblock' comment='!'
    local before='||' after='^'
    build_lists
}

function build_dnsmasq {
syntax='Dnsmasq' && directory='dnsmasq' && comment='' && before='local=/' && after='/'
# Builds the Dnsmasq lists in lists/dnsmasq: each domain is written as
# 'local=/domain/'. The comment character is left empty so that build_lists
# applies its default of '#'.
build_dnsmasq() {
    local syntax='Dnsmasq' directory='dnsmasq' comment=''
    local before='local=/' after='/'
    build_lists
}

function build_unbound {
syntax='Unbound' && directory='unbound' && comment='' && before='local-zone: "' && after='." always_nxdomain'
# Builds the Unbound lists in lists/unbound: each domain is written as
# 'local-zone: "domain." always_nxdomain'. The comment character is left
# empty so that build_lists applies its default of '#'.
build_unbound() {
    local syntax='Unbound' directory='unbound' comment=''
    local before='local-zone: "' after='." always_nxdomain'
    build_lists
}

function build_wildcard_asterisk {
syntax='Wildcard Asterisk' && directory='wildcard_asterisk' && comment='' && before='*.' && after=''
# Builds the Wildcard Asterisk lists in lists/wildcard_asterisk: each domain
# is written as '*.domain'. The comment character is left empty so that
# build_lists applies its default of '#'.
build_wildcard_asterisk() {
    local syntax='Wildcard Asterisk' directory='wildcard_asterisk' comment=''
    local before='*.' after=''
    build_lists
}

function build_wildcard_domains {
syntax='Wildcard Domains' && directory='wildcard_domains' && comment='' && before='' && after=''
# Builds the Wildcard Domains lists in lists/wildcard_domains: each domain is
# written as-is, with nothing added before or after it. The comment character
# is left empty so that build_lists applies its default of '#'.
build_wildcard_domains() {
    local syntax='Wildcard Domains' directory='wildcard_domains' comment=''
    local before='' after=''
    build_lists
}

main
# Entry point

# Standardize the format of the config and data files before building the lists
for file in config/* data/*; do
    # An unmatched glob is left as the literal pattern (e.g. 'config/*');
    # skip paths that do not exist instead of passing them to format_file
    [[ -e "$file" ]] || continue
    format_file "$file"
done

# Build each supported list format in turn
build_adblock
build_dnsmasq
build_unbound
build_wildcard_asterisk
build_wildcard_domains
17 changes: 11 additions & 6 deletions functions/update_readme.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#!/bin/bash
# This script updates README.md content and statistics

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'
readonly SEARCH_TERMS='config/search_terms.csv'
readonly SOURCE_LOG='config/SOURCE_LOG.csv'
# UTC dates in dd-mm-yy form.
# Assign first and mark readonly afterwards: a readonly variable cannot be
# assigned again, and keeping the assignment separate from 'readonly' stops
# the builtin from masking the exit status of date.
TODAY="$(date -u +"%d-%m-%y")"
YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")"
readonly TODAY YESTERDAY

update_readme() {
cat << EOF > README.md
Expand Down Expand Up @@ -142,7 +144,8 @@ print_stats() {

# Function 'sum' is an echo wrapper that sums up the domains retrieved by that source for
# that particular day
# $1: source to count
# $1: day to process
# $2: source to process
sum() {
# Print dash if no runs for that day found
! grep -qF "$1" "$SOURCE_LOG" && { printf "-"; return; }
Expand All @@ -152,7 +155,7 @@ sum() {

# Function 'count_excluded' is an echo wrapper that counts the % of excluded domains
# of raw count retrieved from each source
# $1: source to count
# $1: source to process
count_excluded() {
csvgrep -c 2 -m "$1" "$SOURCE_LOG" | csvgrep -c 14 -m yes > source_rows.tmp

Expand All @@ -163,10 +166,10 @@ count_excluded() {
dead_count="$(csvcut -c 7 source_rows.tmp | awk '{total += $1} END {print total}')"
redundant_count="$(csvcut -c 8 source_rows.tmp | awk '{total += $1} END {print total}')"
parked_count="$(csvcut -c 9 source_rows.tmp | awk '{total += $1} END {print total}')"
rm source_rows.tmp

excluded_count="$((white_count + dead_count + redundant_count + parked_count))"
printf "%s" "$((excluded_count*100/raw_count))"

rm source_rows.tmp
}

# Function 'format_file' is a shell wrapper to standardize the format of a file
Expand All @@ -175,6 +178,8 @@ format_file() {
bash functions/tools.sh format "$1"
}

# Entry point

command -v csvgrep &> /dev/null || pip install -q csvkit # Install csvkit

for file in config/* data/*; do
Expand Down
19 changes: 10 additions & 9 deletions functions/validate_raw.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/bin/bash

# Validates the domains in the raw file via a variety of checks
# This script validates the domains in the raw file via a variety of checks

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'
Expand All @@ -13,8 +12,8 @@ readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt'
readonly WILDCARDS='data/wildcards.txt'
readonly REDUNDANT_DOMAINS='data/redundant_domains.txt'
readonly DOMAIN_LOG='config/domain_log.csv'
# Current UTC timestamp in 'HH:MM:SS dd-mm-yy' form.
# Assign first and mark readonly afterwards: a readonly variable cannot be
# assigned again, and keeping the assignment separate from 'readonly' stops
# the builtin from masking the exit status of date.
TIME_FORMAT="$(date -u +"%H:%M:%S %d-%m-%y")"
readonly TIME_FORMAT

# Function 'validate_raw' stores the domains in the raw file in a variable and validates them
# via a variety of checks
Expand All @@ -41,7 +40,7 @@ validate_raw() {
printf "%s\n" "$domains_with_subdomains" | sed "s/^${subdomain}\.//" >> root_domains.tmp

awk '{print $0 " (subdomain)"}' <<< "$domains_with_subdomains" >> filter_log.tmp
log_event "$domains_with_subdomains" "subdomain"
log_event "$domains_with_subdomains" subdomain
done < "$SUBDOMAINS_TO_REMOVE"
format_file subdomains.tmp
format_file root_domains.tmp
Expand All @@ -52,7 +51,7 @@ validate_raw() {
if [[ "$whitelisted_count" -gt 0 ]]; then
domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$whitelisted_domains"))"
awk '{print $0 " (whitelisted)"}' <<< "$whitelisted_domains" >> filter_log.tmp
log_event "$whitelisted_domains" "whitelist"
log_event "$whitelisted_domains" whitelist
fi

# Remove domains that have whitelisted TLDs
Expand All @@ -61,7 +60,7 @@ validate_raw() {
if [[ "$whitelisted_tld_count" -gt 0 ]]; then
domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$whitelisted_tld_domains"))"
awk '{print $0 " (whitelisted TLD)"}' <<< "$whitelisted_tld_domains" >> filter_log.tmp
log_event "$whitelisted_tld_domains" "tld"
log_event "$whitelisted_tld_domains" tld
fi

# Remove invalid entries including IP addresses. This excludes punycode TLDs (.xn--*)
Expand All @@ -70,7 +69,7 @@ validate_raw() {
if [[ "$invalid_entries_count" -gt 0 ]]; then
domains="$(comm -23 <(printf "%s" "$domains") <(printf "%s" "$invalid_entries"))"
awk '{print $0 " (invalid)"}' <<< "$invalid_entries" >> filter_log.tmp
log_event "$invalid_entries" "invalid"
log_event "$invalid_entries" invalid
fi

# Remove redundant domains
Expand All @@ -91,7 +90,7 @@ validate_raw() {
printf "%s\n" "$domain" >> wildcards.tmp

awk '{print $0 " (redundant)"}' <<< "$redundant_domains" >> filter_log.tmp
log_event "$redundant_domains" "redundant"
log_event "$redundant_domains" redundant
done <<< "$domains"
format_file redundant_domains.tmp
format_file wildcards.tmp
Expand All @@ -102,7 +101,7 @@ validate_raw() {
if [[ "$toplist_count" -gt 0 ]]; then
awk '{print $0 " (toplist) - \033[1;31mmanual removal required\033[0m"}' \
<<< "$domains_in_toplist" >> filter_log.tmp
log_event "$domains_in_toplist" "toplist"
log_event "$domains_in_toplist" toplist
fi

# Exit if no filtering done
Expand Down Expand Up @@ -167,6 +166,8 @@ format_file() {
bash functions/tools.sh format "$1"
}

# Entry point

trap 'find . -maxdepth 1 -type f -name "*.tmp" -delete' EXIT

for file in config/* data/*; do
Expand Down

0 comments on commit 048bcca

Please sign in to comment.