-
Notifications
You must be signed in to change notification settings - Fork 3
/
quasimodo
executable file
·136 lines (115 loc) · 3.91 KB
/
quasimodo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/bin/bash
# Print the command-line help on stderr, then terminate the script.
# Arguments: $1 - exit status to terminate with
usage() {
  printf '%s\n' \
    "Usage: $0 -s" \
    '-s : singleshot - stops at the first loop if it fails' \
    '-h : this help' >&2
  # shellcheck disable=SC2086 # kept unquoted: a missing $1 must fall back to a plain `exit`
  exit $1
}
# Time limits, in seconds, handed to timeout(1) by the functions below:
# shorttimeout bounds the quick helper commands, runtimeout bounds one
# ./carillon run.
shorttimeout=300
runtimeout=3600

# Command-line flags: -s enables singleshot mode, -h prints the help and
# exits 0, anything else prints the help and exits 1.
singleshot=0
while getopts "sh" opt; do
  if [[ "${opt}" == s ]]; then
    singleshot=1
  elif [[ "${opt}" == h ]]; then
    usage 0
  else
    usage 1
  fi
done
# Write "3" into /proc/sys/vm/drop_caches through sudo.
# Outputs: tee echoes the written value on stdout
# Returns: exit status of the sudo/tee stage
do_the_drop() {
  # TODO move this into a root user's forced-command with an SSH key
  printf '%s\n' 3 | sudo tee /proc/sys/vm/drop_caches
}
# Drop the kernel caches around a sync, journaling every step in
# ./drop_cache.log.
#
# Sequence: log the reason, drop caches, sync (bounded by a timeout), then
# drop caches again to release what the sync made droppable.
#
# Globals:   shorttimeout (read) - limit for reading server.conf
#            runtimeout   (read) - sync limit used when server.conf gives none
# Arguments: $1 - free-text reason, recorded in the log
# Files:     reads "synctimeout=<seconds>" from ./server.conf;
#            appends to ./drop_cache.log
drop_cache() {
  local synctimeout

  echo dropping cache
  printf '[%u]\t[%s]\tdrop cache reason:\t%s\n' "$(date --utc '+%s')" "$(date --rfc-3339=seconds)" "${1}" >>drop_cache.log

  # empty some cache
  do_the_drop
  printf '[%u]\t[%s]\tcache droped\n' "$(date --utc '+%s')" "$(date --rfc-3339=seconds)" >>drop_cache.log

  # sync the buffer to have more to drop
  synctimeout=$(timeout -k 5 -s INT "${shorttimeout}" grep -oP '(?<=^synctimeout=).*$' server.conf)
  # A missing entry (or anything that is not a plain integer) falls back to
  # the run timeout, so the sync below stays bounded and no config text
  # reaches the arithmetic evaluation.
  if [[ ! "${synctimeout}" =~ ^-?[0-9]+$ ]]; then
    synctimeout=${runtimeout}
  fi

  if (( synctimeout > 0 )); then
    if timeout -k 5 -s INT "${synctimeout}" sync; then
      printf '[%u]\t[%s]\tsynced (< %u s)\n' "$(date --utc '+%s')" "$(date --rfc-3339=seconds)" "${synctimeout}" >>drop_cache.log
    else
      printf '[%u]\t[%s]\tsync timed out (>= %u s)\n' "$(date --utc '+%s')" "$(date --rfc-3339=seconds)" "${synctimeout}" >>drop_cache.log
    fi
  else
    # an explicit value <= 0 requests an unbounded sync
    sync
    printf '[%u]\t[%s]\tsynced (long-wait)\n' "$(date --utc '+%s')" "$(date --rfc-3339=seconds)" >>drop_cache.log
  fi

  # drop the extra freed by sync
  do_the_drop
  printf '[%u]\t[%s]\tcache droped\n' "$(date --utc '+%s')" "$(date --rfc-3339=seconds)" >>drop_cache.log
}
# Run one round: refresh the run timeout from server.conf, clear leftover
# rsync processes (or report NFS "not responding" kernel messages), probe
# that the current directory is writable, then launch ./carillon under a
# timeout.
#
# Globals:   runtimeout   (read; replaced when server.conf holds a valid value)
#            shorttimeout (read)
# Files:     ./server.conf ("runtimeout=", "cluster_user=" entries),
#            ./b0rk (write probe, removed on success),
#            <script dir>/status/loop_done (touched after the run)
# Returns:   1 when the write probe fails, otherwise the exit status of the
#            timeout-wrapped ./carillon
ring_carillon() {
  local newtimeout nfsmsg scriptdir cluster_user conf_user retval
  local since_epoch boot_epoch
  local -a remote_batman

  newtimeout=$(timeout -k 5 -s INT "${shorttimeout}" grep -oP '(?<=^runtimeout=).*$' server.conf)
  # Only take over a value that was actually found and is a plain number of
  # seconds; otherwise keep the timeout currently in force.
  if [[ "${newtimeout}" =~ ^[[:space:]]*([0-9]+)[[:space:]]*$ ]]; then
    runtimeout=${BASH_REMATCH[1]}
  elif [[ -n "${newtimeout}" ]]; then
    echo "ignoring invalid runtimeout in server.conf: ${newtimeout}" >&2
  fi

  # kill any remaining rsync (escalating: QUIT, then TERM, then KILL)
  if killall -3 rsync; then
    echo killing stuck rsync processes
    sleep 10
    if killall rsync; then
      sleep 10
      killall -9 rsync
    fi
    # drop the cache (in case it helps getting rsync processes unstuck)
    #drop_cache 'lingering rsync'
  else
    # Select kernel messages whose timestamp field is at or after
    # "runtimeout seconds ago minus boot time" and that report an NFS
    # server as not responding.
    since_epoch=$(date '+%s' --date="-${runtimeout} seconds")
    boot_epoch=$(date '+%s' --date="$(uptime --since)")
    nfsmsg="$(dmesg -Lk | gawk -vT="$(( since_epoch - boot_epoch ))" '(substr($1,2)+0)>=(T+0) && $0~/nfs:.*not responding/')"
    if [[ -n "${nfsmsg}" ]]; then
      echo "Since: $(date --rfc-3339=seconds --date="-${runtimeout} seconds")"
      echo "${nfsmsg}"
      # drop the cache (in case of non-responding server)
      #drop_cache "nfs not responding since ${runtimeout} sec ago"
    fi
  fi

  # check if inode limitation "No space left on device" is still going on
  if timeout -k 5 -s INT "${shorttimeout}" touch b0rk && [[ -f b0rk ]]; then
    rm b0rk
  else
    echo "Aargh: problem writing on storage !!!"
    # TODO use carillon phases
    scriptdir="$(dirname "$(which "$0")")"
    "${scriptdir}/belfry" df
    # The login derived from $USER is the default; a non-empty cluster_user
    # entry in server.conf overrides it.
    cluster_user="${USER%%@*}"
    conf_user=$(timeout -k 5 -s INT "${shorttimeout}" grep -oP '(?<=^cluster_user=).*$' server.conf)
    if [[ "${conf_user}" =~ ^[[:space:]]*([^[:space:]]+)[[:space:]]*$ ]]; then
      cluster_user=${BASH_REMATCH[1]}
    fi
    remote_batman=(ssh -ni "${HOME}/.ssh/id_ed25519_batman" -l "${cluster_user}" euler.ethz.ch --)
    timeout -k 5 -s INT "${shorttimeout}" "${remote_batman[@]}" df
    date -R
    return 1
  fi

  # run the carillon script
  echo "Starting loop for: $runtimeout sec"
  timeout -k 5 -s INT "${runtimeout}" ./carillon
  retval=$?
  timeout -k 5 -s INT "${shorttimeout}" touch "$(dirname "$(which "$0")")/status/loop_done"

  # report NFS status (last matching kernel message only)
  dmesg -LTk | grep -P 'nfs:.*server \S* (OK|not responding)' --colour=always | tail -n 1
  return "${retval}"
}
# --- main driver -----------------------------------------------------------
# Runs ring_carillon once, then again after every 1200 s sleep, until a stop
# file appears in <script dir>/status/ (presumably created by an operator or
# another tool - not visible from this script).

# Resolve the script directory to an absolute path once. A relative result
# (script started as ./quasimodo) would point somewhere else after the
# `cd ~` in the loop below.
script_path="$(which "$0")"
workpath="$(CDPATH= cd -- "$(dirname -- "${script_path}")" 2>/dev/null && pwd)" \
  || workpath="$(dirname -- "${script_path}")"
stopfile="${workpath}/status/stop"

# remove previous abort file
if [[ -e "${stopfile}" ]]; then
  echo "(removing previous stop file)"
  rm "${stopfile}"
fi

echo 'First run...'
# in singleshot mode a failing first run ends the script with status 1
ring_carillon || (( singleshot == 0 )) || exit 1

while sleep 1200; do
  # re-enter directory (in case an NFS crash has rendered the previous CWD handle stale)
  cd ~
  cd "${workpath}" || echo "warning: could not re-enter ${workpath}, continuing from ${PWD}" >&2
  echo 'loop...'
  # obtain a Kerberos ticket (1 h lifetime) from the user's keytab
  /usr/bin/kinit -l 1h -k -t "${HOME}/${USER}.keytab" "${USER%@*}@D.ETHZ.CH"
  ring_carillon
  # check for abort
  if [[ -e "${stopfile}" ]]; then
    rm "${stopfile}"
    exit 0
  fi
  date -R
  klist
done