-
Notifications
You must be signed in to change notification settings - Fork 0
/
cron.sh
102 lines (102 loc) · 3.93 KB
/
cron.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/bin/bash
#
# JournalList.net cron shell script.
#
# Name - cron.sh
# Synopsis - cron.sh [DIRNAME]
# DIRNAME - optional, the directory in which to place results. Default DIRNAME is "Webcrawl-YYYY-MM-DD".
#
# Copyright (c) 2021 Brown Wolf Consulting LLC
# License: Creative Commons Attribution-NonCommercial-ShareAlike license. See: https://creativecommons.org/
#
#-------------------------------------
date
#
# Check for resources.csv file, if it doesn't exist you need to download it before running the script.
# Exit early so the log-reporting steps at the bottom never run with DIRNAME unset.
#
if [ ! -f "resources.csv" ]; then
  echo "Download resources.csv from https://well-known.dev/?q=resource%3Atrust.txt" >&2
  exit 1
fi
#
# Set DIRNAME: the optional first argument, or "Webcrawl-YYYY-MM-DD" for today.
#
if [ $# -eq 1 ]; then
  DIRNAME="$1"
else
  DIRNAME="$(date "+Webcrawl-%Y-%m-%d")"
fi
#
# If the directory doesn't already exist, run the python webcrawler.
#
if [ ! -d "$DIRNAME" ]; then
  #
  # Generate a resources.csv from all of the historic resources.csv files (downloaded from
  # https://well-known.dev/?q=resource%3Atrust.txt#results) and copy it to the Webcrawl
  # directory to preserve the results for that day.
  #
  # The sed expressions blank out the rank prefix and the "202…" scan-date suffix so that
  # otherwise-identical rows from different days deduplicate; "sort -u" keeps one of each.
  #
  echo "rank,domain,resource,status,scanned,simhash" > temp.csv
  cat resources.csv Webcrawl-*/Webcrawl-*-resources.csv | grep -v "rank,domain,resource,status," | sed -e "s/^[0-9]*,/,/" -e "s/,202.*$/,,/" | sort -u >> temp.csv
  mv temp.csv resources.csv
  #
  # Run the webcrawler (expected to create $DIRNAME and its output files).
  #
  python3.12 webcrawler.py
  #
  # Save resources.csv alongside the day's crawl results.
  #
  mv resources.csv "$DIRNAME/$DIRNAME-resources.csv"
fi
#
# Remove duplicate entries: strip the header line, sort-unique the rest, then
# rewrite the file as header + deduplicated rows.
#
tail -n +2 "$DIRNAME/$DIRNAME.csv" | sort -u > "$DIRNAME/temp"
echo "srcurl,attr,refurl" > "$DIRNAME/$DIRNAME.csv"
cat "$DIRNAME/temp" >> "$DIRNAME/$DIRNAME.csv"
rm "$DIRNAME/temp"
#
# Process the results of the webcrawler to generate the symmetric links, association,
# publisher, and vendor .csv files. The dot-command script is fed to sqlite3 via a
# here-doc (no temp.sql needed); stderr is suppressed intentionally because .import
# warns about pre-existing rows.
#
sqlite3 -init init.sql 2> /dev/null <<EOF
.import $DIRNAME/$DIRNAME.csv trust_txt
.import $DIRNAME/$DIRNAME-err.csv http_errors
.read symmetric.sql
.output $DIRNAME/$DIRNAME-symmetric.csv
select distinct * from symmetric_list;
.output $DIRNAME/$DIRNAME-asymmetric.csv
select distinct * from asymmetric_list;
.output $DIRNAME/$DIRNAME-associations.csv
select distinct * from associations_list;
.output $DIRNAME/$DIRNAME-publishers.csv
select distinct * from publishers_list;
.output $DIRNAME/$DIRNAME-vendors.csv
select distinct * from vendors_list;
.output $DIRNAME/$DIRNAME-controlled.csv
select distinct * from controlled_list;
.output $DIRNAME/$DIRNAME-control_dups.csv
select distinct * from control_dups;
.output $DIRNAME/$DIRNAME-controlledby_dups.csv
select distinct * from controlledby_dups;
.output $DIRNAME/$DIRNAME-missctrlby.csv
select * from missctrlby_list;
.output $DIRNAME/$DIRNAME-stats.csv
select * from stats;
.quit
EOF
#
# Process the symmetric.csv file to generate the .graphml files.
#
python3.12 graphml.py "$DIRNAME"
#
# Generate the JSON files for ArangoDB graph database.
#
bash genjson.sh "$DIRNAME"
#
# Output the start time and end time (first and last lines of the crawl log).
#
echo "JournalList trust.txt webcrawl"
head -n 1 "$DIRNAME/$DIRNAME-log.txt"
tail -n 1 "$DIRNAME/$DIRNAME-log.txt"