Optimize check_parked.sh

jarelllama authored Mar 31, 2024
1 parent 069dadd commit d542ac3

Showing 2 changed files with 70 additions and 83 deletions.
19 changes: 10 additions & 9 deletions functions/check_dead.sh
@@ -38,15 +38,15 @@ check_subdomains() {
dead-domains-linter -i formatted_subdomains.tmp --export dead.tmp
[[ ! -s dead.tmp ]] && return

# Remove dead subdomains from subdomains file
# Remove domains from subdomains file
comm -23 "$SUBDOMAINS" dead.tmp > subdomains.tmp
mv subdomains.tmp "$SUBDOMAINS"

# Cache dead subdomains to filter out from newly retrieved domains
# Cache dead domains to filter out from newly retrieved domains
cat dead.tmp >> "$DEAD_DOMAINS"
format_file "$DEAD_DOMAINS"

# Strip dead domains with subdomains to their root domains
# Strip dead domains to their root domains
while read -r subdomain; do
dead_root_domains="$(sed "s/^${subdomain}\.//" dead.tmp | sort -u)"
done < "$SUBDOMAINS_TO_REMOVE"
@@ -67,22 +67,22 @@ check_redundant() {
dead-domains-linter -i formatted_redundant_domains.tmp --export dead.tmp
[[ ! -s dead.tmp ]] && return

# Remove dead redundant domains from redundant domains file
# Remove dead domains from redundant domains file
comm -23 "$REDUNDANT_DOMAINS" dead.tmp > redundant.tmp
mv redundant.tmp "$REDUNDANT_DOMAINS"

# Cache dead redundant domains to filter out from newly retrieved domains
# Cache dead domains to filter out from newly retrieved domains
cat dead.tmp >> "$DEAD_DOMAINS"
format_file "$DEAD_DOMAINS"

# Find unused wildcard
while read -r wildcard; do
# If no matches, consider wildcard as unused/dead
! grep -q "\.${wildcard}$" "$REDUNDANT_DOMAINS" &&
if ! grep -q "\.${wildcard}$" "$REDUNDANT_DOMAINS"; then
printf "%s\n" "$wildcard" >> collated_dead_wildcards.tmp
fi
done < "$WILDCARDS"
[[ ! -f collated_dead_wildcards.tmp ]] && return
sort -u collated_dead_wildcards.tmp -o collated_dead_wildcards.tmp

# Remove unused wildcards from raw file and wildcards file
comm -23 "$RAW" collated_dead_wildcards.tmp > raw.tmp
@@ -114,7 +114,7 @@ check_alive() {
# Find and export dead domains
dead-domains-linter -i formatted_dead_domains.tmp --export dead.tmp

# Find resurrected domains in dead domains file (note dead domains file is unsorted)
# Find resurrected domains in dead domains file (dead domains file is unsorted)
alive_domains="$(comm -23 <(sort "$DEAD_DOMAINS") <(sort dead.tmp))"
[[ -z "$alive_domains" ]] && return

@@ -146,7 +146,7 @@ prune_dead_domains_file() {
true
}

# Function 'log_event' logs domain processing events into the domain log
# Function 'log_event' logs domain processing events into the domain log.
# $1: domains to log stored in a variable.
# $2: event type (dead, whitelisted, etc.)
# $3: source
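
A hypothetical call site matching those documented parameters:

    # Log the contents of dead.tmp as "dead" events from the "raw" source
    dead_domains="$(<dead.tmp)"
    log_event "$dead_domains" "dead" "raw"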
@@ -167,4 +167,5 @@ cleanup() {
}

trap cleanup EXIT

main
134 changes: 60 additions & 74 deletions functions/check_parked.sh
@@ -17,72 +17,85 @@ main() {
add_unparked_domains
update_light_file

# Cache parked domains (skip processing parked domains through unparked check)
# Cache parked domains (done last to skip unparked domains check)
cat parked_domains.tmp >> "$PARKED_DOMAINS"
format_file "$PARKED_DOMAINS"
}

remove_parked_domains() {
printf "\n[start] Analyzing %s entries for parked domains\n" "$(wc -l < "$RAW")"
# Reset split files before run
find . -maxdepth 1 -type f -name "x??" -delete

# Split raw file into 12 equal files
split -d -l $(($(wc -l < "$RAW")/12)) "$RAW"
check_parked "x00" & check_parked "x01" &
check_parked "x02" & check_parked "x03" &
check_parked "x04" & check_parked "x05" &
check_parked "x06" & check_parked "x07" &
check_parked "x08" & check_parked "x09" &
check_parked "x10" & check_parked "x11" &
check_parked "x12" & check_parked "x13"
wait
[[ ! -f parked_domains.tmp ]] && return
printf "\n[start] Analyzing %s entries for parked domains\n" "$(wc -l < "$RAW")"

format_file parked_domains.tmp
# Retrieve parked domains and return if none found
retrieve_parked "$RAW" || return

# Remove parked domains from raw file
comm -23 "$RAW" parked_domains.tmp > raw.tmp && mv raw.tmp "$RAW"

log_event "$(<parked_domains.tmp)" "parked" "raw"

# Reset split files before next run
find . -maxdepth 1 -type f -name "x??" -delete
}
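
The raw-file update above uses the usual temp-file idiom: redirecting comm's output directly onto the file it is still reading would truncate it first, so the result goes to a temp file and then replaces the original. In isolation, with hypothetical names:

    comm -23 raw.txt parked.txt > raw.tmp && mv raw.tmp raw.txt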

add_unparked_domains() {
printf "\n[start] Analyzing %s entries for unparked domains\n" "$(wc -l < "$RAW")"

# Split raw file into 12 equal files
split -d -l $(($(wc -l < "$PARKED_DOMAINS")/12)) "$PARKED_DOMAINS"
check_unparked "x00" & check_unparked "x01" &
check_unparked "x02" & check_unparked "x03" &
check_unparked "x04" & check_unparked "x05" &
check_unparked "x06" & check_unparked "x07" &
check_unparked "x08" & check_unparked "x09" &
check_unparked "x10" & check_unparked "x11" &
check_unparked "x12" & check_unparked "x13"
wait
[[ ! -f unparked_domains.tmp ]] && return
# Reset split files before run
find . -maxdepth 1 -type f -name "x??" -delete

printf "\n[start] Analyzing %s entries for unparked domains\n" "$(wc -l < "$PARKED_DOMAINS")"

format_file unparked_domains.tmp
# Retrieve parked domains and return if none found
retrieve_parked "$PARKED_DOMAINS" || return

# Remove unparked domains from parked domains file (parked domains file is unsorted)
grep -vxFf unparked_domains.tmp "$PARKED_DOMAINS" > parked.tmp
# Get unparked domains
unparked_domains="$(grep -vxFf parked_domains.tmp "$PARKED_DOMAINS")"

# Keep only parked domains in parked domains file
grep -xFf parked_domains.tmp "$PARKED_DOMAINS" > parked.tmp
mv parked.tmp "$PARKED_DOMAINS"

# Add unparked domains to raw file
cat unparked_domains.tmp >> "$RAW"
printf "%s\n" "$unparked_domains" >> "$RAW"
format_file "$RAW"

log_event "$(<unparked_domains.tmp)" "unparked" "parked_domains_file"
log_event "$unparked_domains" "unparked" "parked_domains_file"
}

# Reset split files before next run
find . -maxdepth 1 -type f -name "x??" -delete
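
The two grep calls above partition the parked domains file without requiring sorted input: -x matches whole lines, -F treats the patterns as fixed strings, -f reads them from a file, and -v inverts the match. A sketch with made-up contents:

    printf 'a.com\nb.com\n' > parked_domains.tmp      # still parked
    printf 'a.com\nb.com\nc.com\n' > cached_parked.txt
    grep -vxFf parked_domains.tmp cached_parked.txt   # c.com (now unparked)
    grep -xFf parked_domains.tmp cached_parked.txt    # a.com, b.com (still parked)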
# Function 'retrieve_parked' efficiently checks for parked domains.
# Input:
# $1: list of domains to check
# Output:
# exit status 1 if no parked domains were found
retrieve_parked() {
: > parked_domains.tmp # Truncate parked domains (prevents missing file error)

# Split file into 12 equal chunks (plus a remainder chunk, if any)
split -d -l $(($(wc -l < "$1")/12)) "$1"

# Run checks in parallel
check_parked "x00" & check_parked "x01" &
check_parked "x02" & check_parked "x03" &
check_parked "x04" & check_parked "x05" &
check_parked "x06" & check_parked "x07" &
check_parked "x08" & check_parked "x09" &
check_parked "x10" & check_parked "x11" &
check_parked "x12" & check_parked "x13"
wait

# Return 1 if no parked domains were found
[[ ! -s parked_domains.tmp ]] && return 1

format_file parked_domains.tmp
}
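
retrieve_parked is the split-and-parallelize pattern in miniature: chunk the input, run one background worker per chunk, wait for all of them, then collate. A reduced sketch with a stand-in worker, looping over the x?? chunks rather than the fixed list used above:

    worker() { wc -l < "$1"; }   # stand-in for check_parked
    split -d -l 100 input.txt    # produces x00, x01, ...
    for chunk in x??; do
        worker "$chunk" &
    done
    wait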

# Function 'check_parked' queries sites for parked messages in their HTML.
# Input:
# $1: list of domains to check
# Output:
# parked_domains.tmp (if parked domains were found)
check_parked() {
[[ ! -f "$1" ]] && return

# Track progress for first split file
# Track progress only for first split file
if [[ "$1" == 'x00' ]]; then
local track=true
local count=1
@@ -91,52 +104,25 @@
while read -r domain; do
# Check for parked message in site's HTML
if grep -qiFf "$PARKED_TERMS" \
<<< "$(curl -sL --max-time 2 "http://${domain}/" | tr -d '\0')"; then
<<< "$(curl -sL --max-time 5 "http://${domain}/" | tr -d '\0')"; then
printf "[info] Found parked domain: %s\n" "$domain"
printf "%s\n" "$domain" >> "parked_domains_${1}.tmp"
fi

# Track progress for first split file
if [[ "$track" == true ]]; then
(( count % 100 == 0 )) &&
printf "[info] Analyzed %s%% of domains\n" "$((count * 100 / $(wc -l < "$1")))"
(( count++ ))
# Skip progress tracking if not first split file
[[ "$track" != true ]] && continue

if (( count % 100 == 0 )); then
printf "[info] Analyzed %s%% of domains\n" "$((count * 100 / $(wc -l < "$1")))"
fi

(( count++ ))
done < "$1"

# Collate parked domains
[[ -f "parked_domains_${1}.tmp" ]] &&
if [[ -f "parked_domains_${1}.tmp" ]]; then
cat "parked_domains_${1}.tmp" >> parked_domains.tmp
}
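
The per-domain test above boils down to one fetch and one fixed-string search. The same check for a single domain, with a hypothetical terms file:

    # Hypothetical parked phrases, matched case-insensitively as fixed strings
    printf 'domain is for sale\nparked free\n' > parked_terms.tmp
    domain='example.com'
    if curl -sL --max-time 5 "http://${domain}/" | tr -d '\0' |
        grep -qiFf parked_terms.tmp; then
        printf '[info] Found parked domain: %s\n' "$domain"
    fi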

check_unparked() {
[[ ! -f "$1" ]] && return

# Track progress for first split file
if [[ "$1" == 'x00' ]]; then
local track=true
local count=1
fi

while read -r domain; do
# Check for parked message in site's HTML
if ! grep -qiFf "$PARKED_TERMS" \
<<< "$(curl -sL --max-time 5 "http://${domain}/" | tr -d '\0')"; then
printf "[info] Found unparked domain: %s\n" "$domain"
printf "%s\n" "$domain" >> "unparked_domains_${1}.tmp"
fi

# Track progress for first split file
if [[ "$track" == true ]]; then
(( count % 100 == 0 )) &&
printf "[info] Analyzed %s%% of domains\n" "$((count * 100 / $(wc -l < "$1")))"
(( count++ ))
fi
done < "$1"

# Collate unparked domains
[[ -f "unparked_domains_${1}.tmp" ]] &&
cat "unparked_domains_${1}.tmp" >> unparked_domains.tmp
}

# Function 'update_light_file' removes any domains from the light raw file that
