diff --git a/functions/update_readme.sh b/functions/update_readme.sh index 759d99977..cc7009145 100644 --- a/functions/update_readme.sh +++ b/functions/update_readme.sh @@ -4,19 +4,11 @@ readonly RAW_LIGHT='data/raw_light.txt' readonly SEARCH_TERMS='config/search_terms.csv' readonly SOURCE_LOG='config/SOURCE_LOG.csv' TODAY="$(date -u +"%d-%m-%y")" -readonly TODAY YESTERDAY="$(date -ud "yesterday" +"%d-%m-%y")" +readonly TODAY readonly YESTERDAY -function main { - command -v csvgrep &> /dev/null || pip install -q csvkit # Install csvkit - for file in config/* data/*; do # Format files in the config and data directory - format_list "$file" - done - update_readme -} - -function update_readme { +update_readme() { cat << EOF > README.md # Jarelllama's Scam Blocklist Blocklist for scam site domains automatically retrieved daily from Google Search and public databases. Automated retrieval is done daily at 00:00 UTC. @@ -32,19 +24,19 @@ Blocklist for scam site domains automatically retrieved daily from Google Search [![Build and deploy](https://github.com/jarelllama/Scam-Blocklist/actions/workflows/build_deploy.yml/badge.svg)](https://github.com/jarelllama/Scam-Blocklist/actions/workflows/build_deploy.yml) [![Test functions](https://github.com/jarelllama/Scam-Blocklist/actions/workflows/test_functions.yml/badge.svg)](https://github.com/jarelllama/Scam-Blocklist/actions/workflows/test_functions.yml) \`\`\` -Total domains: "$(wc -l < "$RAW")" +Total domains: $(wc -l < "$RAW") Statistics for each source: Today | Yesterday | Excluded | Source -$(print_stats "Google Search") -$(print_stats "aa419.org") -$(print_stats "dfpi.ca.gov") -$(print_stats "guntab.com") -$(print_stats "petscams.com") -$(print_stats "scam.directory") -$(print_stats "scamadviser.com") -$(print_stats "stopgunscams.com") -$(print_stats "Manual") Entries +$(print_stats 'Google Search') +$(print_stats 'aa419.org') +$(print_stats 'dfpi.ca.gov') +$(print_stats 'guntab.com') +$(print_stats 'petscams.com') +$(print_stats 'scam.directory') +$(print_stats 'scamadviser.com') +$(print_stats 'stopgunscams.com') +$(print_stats 'Manual') Entries $(print_stats) *The Excluded % is of domains not included in the @@ -68,7 +60,7 @@ Targeted at list maintainers, a light version of the blocklist is available in t Sources excluded from the light version are marked in SOURCES.md.

-Total domains: "$(wc -l < "$RAW_LIGHT")" +Total domains: $(wc -l < "$RAW_LIGHT") ## Retrieving scam domains from Google Search @@ -86,9 +78,9 @@ To optimize the number of search queries made, each search term is frequently be #### Statistics for Google Search source \`\`\` -Active search terms: "$(csvgrep -c 2 -m 'y' -i "$SEARCH_TERMS" | tail -n +2 | wc -l)" -Queries made TODAY: "$(csvgrep -c 1 -m "$TODAY" "$SOURCE_LOG" | csvgrep -c 2 -m 'Google Search' | csvcut -c 12 | awk '{total += $1} END {print total}')" -Domains retrieved TODAY: "$(count "$TODAY" "Google Search")" +Active search terms: $(csvgrep -c 2 -m 'y' -i "$SEARCH_TERMS" | tail -n +2 | wc -l) +Queries made TODAY: $(csvgrep -c 1 -m "$TODAY" "$SOURCE_LOG" | csvgrep -c 2 -m 'Google Search' | csvcut -c 12 | awk '{total += $1} END {print total}') +Domains retrieved TODAY: $(count "$TODAY" 'Google Search') \`\`\` #### Regarding other sources @@ -140,34 +132,53 @@ Thanks to the following people for the help, inspiration, and support! EOF } -function print_stats { - [[ -n "$1" ]] && source="$1" || source="All sources" - printf "%5s |%10s |%8s%% | %s\n" "$(count "$TODAY" "$1")" "$(count "$YESTERDAY" "$1")" "$(count_excluded "$1" )" "$source" +# Function 'print_stats' prints the various statistics for each source +# $1: source to process (leave blank to process all sources) +print_stats() { + [[ -n "$1" ]] && source="$1" || source='All sources' + printf "%5s |%10s |%8s%% | %s\n" "$(sum "$TODAY" "$1")" \ + "$(sum "$YESTERDAY" "$1")" "$(sum_excluded "$1" )" "$source" } -function count { - # Sum up all domains retrieved by that source for that day - ! grep -qF "$1" "$SOURCE_LOG" && { printf "-"; return; } # Print dash if no runs for that day found - csvgrep -c 1 -m "$1" "$SOURCE_LOG" | csvgrep -c 2 -m "$2" | csvgrep -c 14 -m 'yes' | csvcut -c 5 | awk '{total += $1} END {print total}' +# Function 'sum' is an echo wrapper that sums up the domains retrieved by that source for +# that particular day +# $1: source to count +sum() { + # Print dash if no runs for that day found + ! grep -qF "$1" "$SOURCE_LOG" && { printf "-"; return; } + csvgrep -c 1 -m "$1" "$SOURCE_LOG" | csvgrep -c 2 -m "$2" | csvgrep -c 14 -m yes | + csvcut -c 5 | awk '{total += $1} END {print total}' } -function count_excluded { - source="$1" - # Count % of excluded domains of raw count retrieved from each source - csvgrep -c 2 -m "$source" "$SOURCE_LOG" | csvgrep -c 14 -m 'yes' > source_rows.tmp +# Function 'count_excluded' is an echo wrapper that counts the % of excluded domains +# of raw count retrieved from each source +# $1: source to count +count_excluded() { + csvgrep -c 2 -m "$1" "$SOURCE_LOG" | csvgrep -c 14 -m yes > source_rows.tmp + raw_count="$(csvcut -c 4 source_rows.tmp | awk '{total += $1} END {print total}')" - [[ "$raw_count" -eq 0 ]] && { printf "0"; return; } # Return if raw count is 0 to avoid divide by zero error + # Return if raw count is 0 to avoid divide by zero error + [[ "$raw_count" -eq 0 ]] && { printf "0"; return; } white_count="$(csvcut -c 6 source_rows.tmp | awk '{total += $1} END {print total}')" dead_count="$(csvcut -c 7 source_rows.tmp | awk '{total += $1} END {print total}')" redundant_count="$(csvcut -c 8 source_rows.tmp | awk '{total += $1} END {print total}')" parked_count="$(csvcut -c 9 source_rows.tmp | awk '{total += $1} END {print total}')" excluded_count="$((white_count + dead_count + redundant_count + parked_count))" - printf "%s" "$((excluded_count*100/raw_count))" # Print % excluded + printf "%s" "$((excluded_count*100/raw_count))" + rm source_rows.tmp } -function format_list { - bash functions/tools.sh "format" "$1" +# Function 'format_file' is a shell wrapper to standardize the format of a file +# $1: file to format +format_file() { + bash functions/tools.sh format "$1" } -main \ No newline at end of file +command -v csvgrep &> /dev/null || pip install -q csvkit # Install csvkit + +for file in config/* data/*; do + format_list "$file" +done + +update_readme diff --git a/functions/validate_raw.sh b/functions/validate_raw.sh index d164c4b90..1d00fad55 100644 --- a/functions/validate_raw.sh +++ b/functions/validate_raw.sh @@ -169,7 +169,6 @@ format_file() { trap 'find . -maxdepth 1 -type f -name "*.tmp" -delete' EXIT -# Format files in the config and data directory for file in config/* data/*; do format_file "$file" done