#! /bin/bash
# For common shell function
. shell_functions/commonShellFunctions.sh
# For Apify environment settings
. shell_functions/settings.sh
# Check if installer script is executed beforehand
check_installer_status
# Delete .a11y_storage and results/current to remove previous scan data
clean_up
# _valid_url: 0 == SUCCESS, 1 == FAIL
_valid_url=1
echo "Welcome to CivicActions' Purple Hats Accessibility Testing Tool!"
DOMAINNAME="" # https://www.example.com
SCANTYPE="" # domain or sitemap
EMAIL="" # Send HTML email to on completion
WAPPALYZER=0 # Set to 1 to enable wappalyzer
EXCLUDEEXT="" # Exclude by file extension (include .)
EXCLUDEMORE="" # Exclude by string separated by ","
EXCLUDEQUERY=0 # Set to 1 to exclude URLs with queries
NUMBER=2000 # NOT WORKING WITH Apify - Maximum number of pages to crawl
OPENBROWSER=1 # By default open a browser after the script is run
usage() { # Function: Print a help message.
echo "Usage: $0 [ -d DOMAINNAME ] [ -s SCANTYPE ] [ -t TIME2WAIT ] [ -e EMAIL ] [ -o OPENBROWSER ] [ -w WAPPALYZER ] [ -x EXCLUDEEXT ] [ -y EXCLUDEMORE ] [ -z EXCLUDEQUERY ] [ -n NUMBER ]" 1>&2
}
exit_abnormal() { # Function: Print usage and exit with error.
usage
exit 1
}
while getopts ":d:s:t:e:o:w:x:y:z:n:" options; do # Loop: Get the next option;
# use silent error checking;
# every option takes an argument.
case "${options}" in
d) # If the option is d,
DOMAINNAME=${OPTARG} # Set $DOMAINNAME to specified value.
;;
s)
SCANTYPE=${OPTARG} # Set $SCANTYPE to specified value.
;;
t)
TIME2WAIT=${OPTARG} # Set $TIME2WAIT between crawls: 0
;;
e)
EMAIL=${OPTARG} # Set $EMAIL to specified value.
;;
o)
OPENBROWSER=${OPTARG} # Disable launching browser: 0
;;
w)
WAPPALYZER=${OPTARG} # Enable wappalyzer: 1
;;
x)
EXCLUDEEXT=${OPTARG} # Set $EXCLUDEEXT to specified file extension.
;;
y)
EXCLUDEMORE=${OPTARG} # Set $EXCLUDEMORE to string in the URL.
;;
z)
EXCLUDEQUERY=${OPTARG} # Exclude queries (like ?q=40) with: 1
;;
n) # If the option is n,
NUMBER=${OPTARG} # Set $NUMBER to specified value.
re_isanum='^[0-9]+$' # Regex: match whole numbers only
if ! [[ $NUMBER =~ $re_isanum ]]; then # if $NUMBER is not a whole number:
echo "Error: NUMBER must be a positive, whole number."
exit_abnormal # Exit abnormally.
elif [ "$NUMBER" -eq 0 ]; then # If it's zero:
echo "Error: NUMBER must be greater than zero."
exit_abnormal # Exit abnormally.
fi
;;
:) # If expected argument omitted:
echo "Error: -${OPTARG} requires an argument."
exit_abnormal # Exit abnormally.
;;
*) # If unknown (any other) option:
exit_abnormal # Exit abnormally.
;;
esac
done
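# Example invocations (illustrative only; the flags map to the getopts
# string above and to the variable defaults at the top of this script):
#   bash ./run.sh -d https://example.com/ -s website -n 500
#   bash ./run.sh -d https://example.com/sitemap.xml -s sitemap -x .pdf -z 1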
echo "$DOMAINNAME was submitted."
if [ -n "$DOMAINNAME" ]; then
page=$DOMAINNAME
if [ "$SCANTYPE" = "sitemap" ]; then
scanType="sitemap"
crawler="crawlSitemap"
else
scanType="website"
crawler="crawlDomain"
fi
# Without a domain argument, prompt the user interactively
else
echo "You can specify a website crawl & URL through the command line with:"
echo " $ bash ./run website https://example.com/"
echo "What would you like to scan today?"
options=("sitemap file containing links" "website" "exit")
# Get information related to scan type as well as the URL for URL validation
select opt in "${options[@]}"
do
case $opt in
"sitemap file containing links")
scanType="sitemap"
crawler=crawlSitemap
prompt_message="Please enter URL to sitemap: "
break;;
"website")
prompt_website
break;;
"exit")
exit;;
*)
echo "Invalid option $REPLY";;
esac
done
# Prompt for URL (Common across all scan types)
read -p "$prompt_message" page
fi
# URL validation
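# check_url (from shell_functions/commonShellFunctions.sh) presumably sets
# $check_url_status to 0 when $page resolves to a valid, reachable URL.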
while [ "$_valid_url" != 0 ]
do
check_url
if [ "$page" = "exit" ]; then
exit
elif [ -n "$check_url_status" ] && [ "$check_url_status" = 0 ]; then
_valid_url=0
else
# Prompt error message to rectify error for URL
if [ "$scanType" = "sitemap" ]; then
sitemap_error
else
website_error
fi
fi
done
# Run the crawler
randomToken=$(date +%s)$(openssl rand -hex 5)
currentDate=$(date '+%Y-%-m-%-d')
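# randomToken is the epoch time plus 10 random hex characters,
# e.g. 1695200000d4f7a1b2c3 (illustrative value).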
echo "Scanning website $page ... "
if [[ $WAPPALYZER == 1 ]]; then
# optional ability to gather information about the frameworks involved
echo "Wappalyzer enabled"
if [ -f "wappalyzer/src/drivers/npm/cli.js" ]; then
echo " and running on $page"
cd wappalyzer
# wappalyzer = $(node "src/drivers/npm/cli.js" "$page" | tee errors.txt)
wappalyzer=$( node "src/drivers/npm/cli.js" "$page")
cd ..
else
echo "Wappalyzer not installed in ./wappalyzer folder."
fi
fi
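# All scan settings are handed to the Node crawler as environment variables,
# which combine.js presumably reads via process.env (e.g. process.env.URL).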
URL="$page" LOGINID="$login_id" LOGINPWD="$login_pwd" IDSEL="$id_selector" PWDSEL="$pwd_selector" SUBMIT="$btn_selector" RANDOMTOKEN="$randomToken" TYPE="$crawler" TIME2WAIT="$TIME2WAIT" WAPPALYZER="$wappalyzer" NUMBER="$NUMBER" EMAIL="$EMAIL" EXCLUDEEXT="$EXCLUDEEXT" EXCLUDEMORE="$EXCLUDEMORE" EXCLUDEQUERY="$EXCLUDEQUERY" node -e 'require("./combine").combineRun()' | tee errors.txt
# Verify that the newly generated directory exists
if [ -d "results/$currentDate/$randomToken" ]; then
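# awk -F/ '{print $3}' takes the third "/"-separated field, i.e. the bare
# hostname of "https://host/path".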
domain=$(echo "$page" | awk -F/ '{print $3}')
# Add symlinks for simpler access
echo "Adding symbolic link to last scan of $domain in run.sh."
ln -sfn "results/$currentDate/$randomToken" "last-scan"
cd results
# Copy over other links only if the .html report was successfully written
if [ -f "$currentDate/$randomToken/reports/report.html" ]; then
echo "Adding symbolic links for date and domain."
ln -sfn "$currentDate/$randomToken"
cd "$currentDate"
ln -sfn "$randomToken" "$domain"
cd ../..
# Compress most files and delete originals.
echo "Compressing files."
cd last-scan
tar -cjf "$domain-$currentDate-all_issues.tar.bz2" "all_issues" 2>/dev/null
rm -fr "all_issues"
cd reports
tar -cjvf "$domain-$currentDate-compiledResults.json.tar.bz2" "compiledResults.json" 2>/dev/null
rm "compiledResults.json"
tar -cjvf "$domain-$currentDate-report.html.tar.bz2" "report.html" 2>/dev/null
tar -cjvf "$domain-$currentDate-allissues.csv.bz2" "allissues.csv" 2>/dev/null
zip "$domain-$currentDate-allissues.csv.zip" "allissues.csv" 2>/dev/null
rm "allissues.csv"
tar -cjvf "$domain-$currentDate-plainLanguage.csv.bz2" "plainLanguage.csv" 2>/dev/null
rm "plainLanguage.csv"
pwd
cp "$domain-$currentDate-allissues.csv.bz2" ../../../../data/
cp "$domain-$currentDate-report.html.tar.bz2" ../../../../data/
cp "$domain-$currentDate-plainLanguage.csv.bz2" ../../../../data/
cd ../..
pwd
# Make directory for domain and store prior scans
echo "Adding domain tracking - $domain."
cd results
ln -sfn "$currentDate/$randomToken" "last-scan"
if [ -d "${domain}_reports" ]; then
echo "Using existing ${domain}_reports directory."
else
echo "Making ${domain}_reports directory."
mkdir "${domain}_reports"
fi
cd "${domain}_reports"
ln -sfn "../$currentDate/$randomToken" "$currentDate"
echo "Renaming reports to make it easier to access from last-scan."
cd ../last-scan/reports
ls count.csv # Sanity check that count.csv was generated
mv report.html "$domain-$currentDate-report.html"
mv count.csv "$domain-$currentDate-count.csv"
mv wcagErrors.csv "$domain-$currentDate-wcagErrors.csv"
ln -sfn "$domain-$currentDate-report.html" report.html
ln -sfn "$domain-$currentDate-count.csv" count.csv
ln -sfn "$domain-$currentDate-wcagErrors.csv" wcagErrors.csv
pwd
# ls ../../../../data/
cp "$domain-$currentDate-count.csv" ../../../../data/
cp "$domain-$currentDate-wcagErrors.csv" ../../../../data/
cd ..
ln -sfn "reports/$domain-$currentDate-report.html" report.html
cd ../..
# Add to Git report
# Todo: This should be something that is explicitly set from the command line
pwd
# This isn't working properly
# git checkout -B new_data_branch
# git add "./data/$domain-$currentDate-count.csv"
# git add "data/$domain-$currentDate-wcagErrors.csv data/$domain-$currentDate-allissues.csv.bz2 data/$domain-$currentDate-report.html.tar.bz2 data/$domain-$currentDate-plainLanguage.csv.bz2"
# git commit -m "updated data"
# git push
# git push --set-upstream origin main
# Open the report in a browser unless disabled with -o 0
if [[ "$OPENBROWSER" == 1 ]]; then
if [[ "$OSTYPE" == "linux-gnu"* ]]; then
echo "Open report in Firefox."
firefox -url "last-scan/report.html"
elif [[ "$OSTYPE" == "darwin"* ]]; then
echo "Open report in default browser"
open "last-scan/report.html"
else
echo "The scan has been completed."
current_dir=$(pwd)
reportPath="$current_dir/results/$currentDate/$randomToken/reports/report.html"
echo "You can find the report in $reportPath."
fi # End $OSTYPE check
fi # End [[ "$OPENBROWSER" == 1 ]] check
else
echo "No report.html file for $domain in run.sh."
fi
# Provide PDF version if available
# NOTE: This should just be done with puppeteer which is already required - https://github.com/puppeteer/puppeteer/
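# A minimal puppeteer-based alternative (sketch only; assumes puppeteer is
# installed alongside the existing Node dependencies):
#   node -e 'const puppeteer = require("puppeteer");
#     (async () => {
#       const browser = await puppeteer.launch();
#       const page = await browser.newPage();
#       await page.goto("file://" + process.cwd() + "/last-scan/reports/report.html", { waitUntil: "networkidle0" });
#       await page.pdf({ path: "last-scan/reports/report.pdf", printBackground: true });
#       await browser.close();
#     })();'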
if command -v wkhtmltopdf &> /dev/null; then
# I should be able to use --print-media-type
wkhtmltopdf -q --enable-javascript --javascript-delay 10000 "last-scan/reports/report.html" "last-scan/reports/$domain-$currentDate-report.pdf"
else
echo "If you want a PDF export, then install wkhtmltopdf - i.e. brew install wkhtmltopdf.";
fi # End command -v wkhtmltopdf
else
echo "WARNING: An unexpected error has occurred. Please try again later."
fi