Optimize check_parked.sh

jarelllama authored Mar 31, 2024
1 parent 069dadd commit d542ac3

Showing 2 changed files with 70 additions and 83 deletions.
19 changes: 10 additions & 9 deletions functions/check_dead.sh
@@ -38,15 +38,15 @@ check_subdomains() {
dead-domains-linter -i formatted_subdomains.tmp --export dead.tmp
[[ ! -s dead.tmp ]] && return

# Remove dead subdomains from subdomains file
# Remove domains from subdomains file
comm -23 "$SUBDOMAINS" dead.tmp > subdomains.tmp
mv subdomains.tmp "$SUBDOMAINS"

# Cache dead subdomains to filter out from newly retrieved domains
# Cache dead domains to filter out from newly retrieved domains
cat dead.tmp >> "$DEAD_DOMAINS"
format_file "$DEAD_DOMAINS"

# Strip dead domains with subdomains to their root domains
# Strip dead domains to their root domains
while read -r subdomain; do
dead_root_domains="$(sed "s/^${subdomain}\.//" dead.tmp | sort -u)"
done < "$SUBDOMAINS_TO_REMOVE"
@@ -67,22 +67,22 @@ check_redundant() {
dead-domains-linter -i formatted_redundant_domains.tmp --export dead.tmp
[[ ! -s dead.tmp ]] && return

# Remove dead redundant domains from redundant domains file
# Remove dead domains from redundant domains file
comm -23 "$REDUNDANT_DOMAINS" dead.tmp > redundant.tmp
mv redundant.tmp "$REDUNDANT_DOMAINS"

# Cache dead redundant domains to filter out from newly retrieved domains
# Cache dead domains to filter out from newly retrieved domains
cat dead.tmp >> "$DEAD_DOMAINS"
format_file "$DEAD_DOMAINS"

# Find unused wildcard
while read -r wildcard; do
# If no matches, consider wildcard as unused/dead
! grep -q "\.${wildcard}$" "$REDUNDANT_DOMAINS" &&
if ! grep -q "\.${wildcard}$" "$REDUNDANT_DOMAINS"; then
printf "%s\n" "$wildcard" >> collated_dead_wildcards.tmp
fi
done < "$WILDCARDS"
[[ ! -f collated_dead_wildcards.tmp ]] && return
sort -u collated_dead_wildcards.tmp -o collated_dead_wildcards.tmp

# Remove unused wildcards from raw file and wildcards file
comm -23 "$RAW" collated_dead_wildcards.tmp > raw.tmp
@@ -114,7 +114,7 @@ check_alive() {
# Find and export dead domains
dead-domains-linter -i formatted_dead_domains.tmp --export dead.tmp

# Find resurrected domains in dead domains file (note dead domains file is unsorted)
# Find resurrected domains in dead domains file (dead domains file is unsorted)
alive_domains="$(comm -23 <(sort "$DEAD_DOMAINS") <(sort dead.tmp))"
[[ -z "$alive_domains" ]] && return

@@ -146,7 +146,7 @@ prune_dead_domains_file() {
true
}

# Function 'log_event' logs domain processing events into the domain log
# Function 'log_event' logs domain processing events into the domain log.
# $1: domains to log stored in a variable.
# $2: event type (dead, whitelisted, etc.)
# $3: source
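
A hypothetical call site matching those documented parameters:

    # Log the contents of dead.tmp as "dead" events from the "raw" source
    dead_domains="$(<dead.tmp)"
    log_event "$dead_domains" "dead" "raw"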
@@ -167,4 +167,5 @@ cleanup() {
}

trap cleanup EXIT

main
134 changes: 60 additions & 74 deletions functions/check_parked.sh
@@ -17,72 +17,85 @@ main() {
add_unparked_domains
update_light_file

# Cache parked domains (skip processing parked domains through unparked check)
# Cache parked domains (done last to skip unparked domains check)
cat parked_domains.tmp >> "$PARKED_DOMAINS"
format_file "$PARKED_DOMAINS"
}

remove_parked_domains() {
printf "\n[start] Analyzing %s entries for parked domains\n" "$(wc -l < "$RAW")"
# Reset split files before run
find . -maxdepth 1 -type f -name "x??" -delete

# Split raw file into 12 equal files
split -d -l $(($(wc -l < "$RAW")/12)) "$RAW"
check_parked "x00" & check_parked "x01" &
check_parked "x02" & check_parked "x03" &
check_parked "x04" & check_parked "x05" &
check_parked "x06" & check_parked "x07" &
check_parked "x08" & check_parked "x09" &
check_parked "x10" & check_parked "x11" &
check_parked "x12" & check_parked "x13"
wait
[[ ! -f parked_domains.tmp ]] && return
printf "\n[start] Analyzing %s entries for parked domains\n" "$(wc -l < "$RAW")"

format_file parked_domains.tmp
# Retrieve parked domains and return if none found
retrieve_parked "$RAW" || return

# Remove parked domains from raw file
comm -23 "$RAW" parked_domains.tmp > raw.tmp && mv raw.tmp "$RAW"

log_event "$(<parked_domains.tmp)" "parked" "raw"

# Reset split files before next run
find . -maxdepth 1 -type f -name "x??" -delete
}
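
The raw-file update above uses the usual temp-file idiom: redirecting comm's output directly onto the file it is still reading would truncate it first, so the result goes to a temp file and then replaces the original. In isolation, with hypothetical names:

    comm -23 raw.txt parked.txt > raw.tmp && mv raw.tmp raw.txt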

add_unparked_domains() {
printf "\n[start] Analyzing %s entries for unparked domains\n" "$(wc -l < "$RAW")"

# Split raw file into 12 equal files
split -d -l $(($(wc -l < "$PARKED_DOMAINS")/12)) "$PARKED_DOMAINS"
check_unparked "x00" & check_unparked "x01" &
check_unparked "x02" & check_unparked "x03" &
check_unparked "x04" & check_unparked "x05" &
check_unparked "x06" & check_unparked "x07" &
check_unparked "x08" & check_unparked "x09" &
check_unparked "x10" & check_unparked "x11" &
check_unparked "x12" & check_unparked "x13"
wait
[[ ! -f unparked_domains.tmp ]] && return
# Reset split files before run
find . -maxdepth 1 -type f -name "x??" -delete

printf "\n[start] Analyzing %s entries for unparked domains\n" "$(wc -l < "$PARKED_DOMAINS")"

format_file unparked_domains.tmp
# Retrieve parked domains and return if none found
retrieve_parked "$PARKED_DOMAINS" || return

# Remove unparked domains from parked domains file (parked domains file is unsorted)
grep -vxFf unparked_domains.tmp "$PARKED_DOMAINS" > parked.tmp
# Get unparked domains
unparked_domains="$(grep -vxFf parked_domains.tmp "$PARKED_DOMAINS")"

# Keep only parked domains in parked domains file
grep -xFf parked_domains.tmp "$PARKED_DOMAINS" > parked.tmp
mv parked.tmp "$PARKED_DOMAINS"

# Add unparked domains to raw file
cat unparked_domains.tmp >> "$RAW"
printf "%s\n" "$unparked_domains" >> "$RAW"
format_file "$RAW"

log_event "$(<unparked_domains.tmp)" "unparked" "parked_domains_file"
log_event "$unparked_domains" "unparked" "parked_domains_file"
}

# Reset split files before next run
find . -maxdepth 1 -type f -name "x??" -delete
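
The two grep calls above partition the parked domains file without requiring sorted input: -x matches whole lines, -F treats the patterns as fixed strings, -f reads them from a file, and -v inverts the match. A sketch with made-up contents:

    printf 'a.com\nb.com\n' > parked_domains.tmp      # still parked
    printf 'a.com\nb.com\nc.com\n' > cached_parked.txt
    grep -vxFf parked_domains.tmp cached_parked.txt   # c.com (now unparked)
    grep -xFf parked_domains.tmp cached_parked.txt    # a.com, b.com (still parked)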
# Function 'retrieve_parked' efficiently checks for parked domains.
# Input:
# $1: list of domains to check
# Output:
# exit status 1 if no parked domains were found
retrieve_parked() {
: > parked_domains.tmp # Truncate parked domains (prevents missing file error)

# Split file into 12 equal chunks (plus a remainder chunk, if any)
split -d -l $(($(wc -l < "$1")/12)) "$1"

# Run checks in parallel
check_parked "x00" & check_parked "x01" &
check_parked "x02" & check_parked "x03" &
check_parked "x04" & check_parked "x05" &
check_parked "x06" & check_parked "x07" &
check_parked "x08" & check_parked "x09" &
check_parked "x10" & check_parked "x11" &
check_parked "x12" & check_parked "x13"
wait

# Return 1 if no parked domains were found
[[ ! -s parked_domains.tmp ]] && return 1

format_file parked_domains.tmp
}
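
retrieve_parked is the split-and-parallelize pattern in miniature: chunk the input, run one background worker per chunk, wait for all of them, then collate. A reduced sketch with a stand-in worker, looping over the x?? chunks rather than the fixed list used above:

    worker() { wc -l < "$1"; }   # stand-in for check_parked
    split -d -l 100 input.txt    # produces x00, x01, ...
    for chunk in x??; do
        worker "$chunk" &
    done
    wait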

# Function 'check_parked' queries sites for parked messages in their HTML.
# Input:
# $1: list of domains to check
# Output:
# parked_domains.tmp (if parked domains were found)
check_parked() {
[[ ! -f "$1" ]] && return

# Track progress for first split file
# Track progress only for first split file
if [[ "$1" == 'x00' ]]; then
local track=true
local count=1
@@ -91,52 +104,25 @@
while read -r domain; do
# Check for parked message in site's HTML
if grep -qiFf "$PARKED_TERMS" \
<<< "$(curl -sL --max-time 2 "http://${domain}/" | tr -d '\0')"; then
<<< "$(curl -sL --max-time 5 "http://${domain}/" | tr -d '\0')"; then
printf "[info] Found parked domain: %s\n" "$domain"
printf "%s\n" "$domain" >> "parked_domains_${1}.tmp"
fi

# Track progress for first split file
if [[ "$track" == true ]]; then
(( count % 100 == 0 )) &&
printf "[info] Analyzed %s%% of domains\n" "$((count * 100 / $(wc -l < "$1")))"
(( count++ ))
# Skip progress tracking if not first split file
[[ "$track" != true ]] && continue

if (( count % 100 == 0 )); then
printf "[info] Analyzed %s%% of domains\n" "$((count * 100 / $(wc -l < "$1")))"
fi

(( count++ ))
done < "$1"

# Collate parked domains
[[ -f "parked_domains_${1}.tmp" ]] &&
if [[ -f "parked_domains_${1}.tmp" ]]; then
cat "parked_domains_${1}.tmp" >> parked_domains.tmp
}
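
The per-domain test above boils down to one fetch and one fixed-string search. The same check for a single domain, with a hypothetical terms file:

    # Hypothetical parked phrases, matched case-insensitively as fixed strings
    printf 'domain is for sale\nparked free\n' > parked_terms.tmp
    domain='example.com'
    if curl -sL --max-time 5 "http://${domain}/" | tr -d '\0' |
        grep -qiFf parked_terms.tmp; then
        printf '[info] Found parked domain: %s\n' "$domain"
    fi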

check_unparked() {
[[ ! -f "$1" ]] && return

# Track progress for first split file
if [[ "$1" == 'x00' ]]; then
local track=true
local count=1
fi

while read -r domain; do
# Check for parked message in site's HTML
if ! grep -qiFf "$PARKED_TERMS" \
<<< "$(curl -sL --max-time 5 "http://${domain}/" | tr -d '\0')"; then
printf "[info] Found unparked domain: %s\n" "$domain"
printf "%s\n" "$domain" >> "unparked_domains_${1}.tmp"
fi

# Track progress for first split file
if [[ "$track" == true ]]; then
(( count % 100 == 0 )) &&
printf "[info] Analyzed %s%% of domains\n" "$((count * 100 / $(wc -l < "$1")))"
(( count++ ))
fi
done < "$1"

# Collate unparked domains
[[ -f "unparked_domains_${1}.tmp" ]] &&
cat "unparked_domains_${1}.tmp" >> unparked_domains.tmp
}

# Function 'update_light_file' removes any domains from the light raw file that
