#!/bin/bash
function usage () {
cat <<EOUSAGE
$(basename $0) [-h] [-i cmap] [-c cmap] [-w series] [-t series] [-e epochs] [-m mem] [-d njobs] [-s sec] [-l lambda] [-n nlayers]
$(basename $0) should be used with arguments. Complete list:
-h Show this help
-i Matplotlib colormap for the raw (instantaneous) SSH (the corresponding pictures must first be generated by create_pics_ssavehw.bash)
-c Matplotlib colormap for the cosine-fitted (wave) SSH (the corresponding pictures must first be generated by create_pics_ssavehw.bash)
-w Series to test the model on (format: wp*8_[ab])
-t Series to train the model on (format: wp*8_[cdefg]) (Note: may also contain test images, since the test images are moved out first.)
-e Total number of epochs for training
-m Memory for each job
-d Number of jobs over which to split the total number of epochs; ideally e mod d = 0
-s How many seconds each epoch takes
-l Parameter lambda controlling the L1 loss
-n Number of layers
EOUSAGE
}
#Cannot be run in login mode; I'm running it in interactive sessions, with 16GB memory
#hw: changed from Nico's version so that this runs exclusively with TensorFlow 2. Suppressing the -v/tfv-related options and replacing code with TF2 counterparts.
echo 'Creating and submitting a new experiment.'
echo 'See master_scalar.bash -h for help on arguments.'
# All arguments are optional, here we assign them
while getopts :hi:c:w:t:e:m:d:s:l:n: opt; do # leading ':' enables silent error handling; 'h' takes no argument, the other options each take one
case $opt in # example syntax: ./master_scalar.bash -i gray -c gray
h) usage; exit 1;;
i) cm_ins=${OPTARG};;
c) cm_cos=${OPTARG};;
w) wp_test=${OPTARG};;
t) wp_train=${OPTARG};;
e) n_epochs=${OPTARG};;
m) n_mem=${OPTARG};;
d) divide=${OPTARG};;
s) epoch_sec=${OPTARG};;
l) LAMBDA=${OPTARG};;
n) n_layers=${OPTARG};;
\?) echo "Invalid option -${OPTARG}" >&2
usage; exit 2;;
esac
done
#Assign the default values for any option not given on the command line
cm_ins=${cm_ins:-scalar}
cm_cos=${cm_cos:-scalar}
wp_test=${wp_test:-wp50*-3}
wp_train=${wp_train:-wp*-3}
n_epochs=${n_epochs:-700}
n_mem=${n_mem:-30}
divide=${divide:-7}
epoch_sec=${epoch_sec:-90}
LAMBDA=${LAMBDA:-1000}
n_layers=${n_layers:-3}
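#A sketch of an example invocation: the 'gray' colormaps come from the example comment above, and the other values simply mirror the defaults, so none of them are required choices:
#  ./master_scalar.bash -i gray -c gray -w 'wp50*-3' -t 'wp*-3' -e 700 -m 30 -d 7 -s 90 -l 1000 -n 3
#Quoting the -w/-t patterns keeps the shell from glob-expanding them before the script sees them; any option left out falls back to the defaults above.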
cat <<EOF
The experiment has the following parameters:
wp_test = ${wp_test}
wp_train = ${wp_train}
n_epochs = ${n_epochs}
LAMBDA = ${LAMBDA}
NLAYERS = ${n_layers}
EOF
echo 'n_mem:' $n_mem
echo 'epoch_sec:' $epoch_sec
nepoch_eachjob=$((n_epochs / divide)) #Note: bash can only divide integers
divn=$(( 3600 / epoch_sec ))
n_hours=$(( (nepoch_eachjob + 10)/ divn )) # for each sub-job. Note that shell divisions are all integer-based.
#n_hours=$(( n_hours + 1 )) # +1 to round up (combined with previous commands, this is equivalent to ceil())
printf -v n_hours "%02d" $n_hours # zero-padding (-v writes the result to the variable instead of stdout)
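#Worked example of the integer arithmetic above, using the default values (a sketch; the actual numbers depend on the options given):
#  nepoch_eachjob = 700 / 7         = 100 epochs per job
#  divn           = 3600 / 90       = 40  epochs per hour
#  n_hours        = (100 + 10) / 40 = 2   -> padded to "02"
#The +10 epochs are presumably a small safety margin on the requested walltime.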
# Graham does not accept jobs longer than 24 hrs; hitting that limit is unlikely, so no failsafe is added
echo 'n_hours for each job:' $n_hours
echo 'total epochs:' $n_epochs
echo 'epoch number in each job:' $nepoch_eachjob
# Prepare virtualenv
module load python/3.8 #Changed to 3.8
#if [ "$tfv" == 2 ]; then
printf "\n\n :X:X:X TensorfFlow v2 :X:X:X \n\n"
tfenvdir=/home/hannn/pix2pix-for-swot/tfV2envre
process_script=process_scalar.py
#p2p_script_train_first=pix2pix_TF2_hw_train_first.py
p2p_script_train=pix2pix_TF2_hw_train_scalar.py
#else
# tfenvdir=/home/hannn/pix2pix-for-swot/tfV1env
# process_script=process.py
# p2p_script=pix2pix.py
#fi
if [ ! -d "$tfenvdir" ]; then
# we need tensorflow for the process.py script
virtualenv --no-download $tfenvdir
source $tfenvdir/bin/activate
pip install --no-index tensorflow_cpu #***check version
# pip install --no-index tensorflow_cpu==1.14.1
else
source $tfenvdir/bin/activate
fi
# # Prepare data
setup=lambda_${LAMBDA}_nlayers_${n_layers}_testing-${wp_test}_${n_epochs}epochs_w_rotflipud #Changed the job name to this form while debugging a memory leak
#Delete some special characters:
setup=$(echo $setup | tr '][*-' '_') #Important for globbing to work properly in Python etc.
setup=$(echo $setup | tr -s '_')
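#Sketch of what the cleaning above does to the name built from the default values (assuming the unquoted '*' in $setup does not match any file in the current directory):
#  raw        : lambda_1000_nlayers_3_testing-wp50*-3_700epochs_w_rotflipud
#  after tr   : lambda_1000_nlayers_3_testing_wp50__3_700epochs_w_rotflipud
#  after tr -s: lambda_1000_nlayers_3_testing_wp50_3_700epochs_w_rotflipud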
echo "Name of this experiment: ${setup}"
picpath=/home/hannn/projects/def-ngrisoua/hannn/ifremer-pics_scalar
codedir=/home/hannn/pix2pix-for-swot/p2p_tf2_hw
outdir=/scratch/hannn/pix2pix-for-swot/$setup
if [ ! -d "$outdir" ]; then #-d tests whether the given directory exists
mkdir $outdir
else
rm -r $outdir # careful if checkpointing
mkdir $outdir
fi
echo ' '
#------------------------------------Comment out the block below if this step has already been done.
echo 'Combining colormaps ' $cm_ins ' and ' $cm_cos
python $codedir/tools/${process_script} \
--input_dir $picpath/$cm_ins/ssh_ins \
--b_dir $picpath/$cm_cos/ssh_cos \
--operation combine \
--output_dir $outdir/combined
#------------------------------------Comment out the block above if this step has already been done.
deactivate # deactivate the tensorflow environment
echo 'Done: process_script. Now dividing into test and train folders'
cd $outdir
cp $codedir/lanceur_scalar.slrm .
cp -r $codedir $outdir/. #added by hw
#cp $codedir/$p2p_script_train_first $outdir/.
cp $codedir/$p2p_script_train $outdir/.
#cp -r $codedir/tools/ $outdir/.
mkdir train test # careful again if checkpointing or testing
mkdir validation #newly added on 2021/05/29
#Note: since we move the test images first, the -t (training images) in the command line can contain test images too.
mv combined/${wp_test}*.npy test/. #Move the test data to test/; probably no need to shuffle
mv combined/${wp_train}*.npy train/. #hw: added so that we can specify what to train on
#randomly select 20 percent of the snapshots in training data for validation:
#Note: we just randomly select the SNAPSHOTS, and include all the top/mid/bot panels in those snapshots.
echo "WARNING: the way we choose validation data only works for panels denoted by top/mid/bot!"
nalltop=$(ls train/*top* | wc -l) #number of filenames containing "top"
nvalidationtop=$(($nalltop/5))
validationfilestop=$(ls train/*top* | shuf -n $nvalidationtop)
for filename in $validationfilestop
do
validationfilestr=${filename%top.npy} #Remove "top.npy" from the back of the string.
#echo $validationfilestr
mv ${validationfilestr}*.npy validation/.
done
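#Sketch of one loop iteration (the snapshot name below is hypothetical; only the top/mid/bot suffix convention is assumed):
#  filename          = train/wp50_3_snap0012_top.npy
#  validationfilestr = train/wp50_3_snap0012_
#  mv then moves train/wp50_3_snap0012_*.npy (the top, mid and bot panels of that snapshot) into validation/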
tar cf data.tar train test validation # archive for $SLURM_TMPDIR # useful in pix2pix.py
echo 'submitting SLURM jobs'
ed -s "lanceur_scalar.slrm" <<< $'g/XXXX/s/XXXX/'${cm_ins}$'/g\nw\nq' #Replacing XXXX with cm_ins
ed -s "lanceur_scalar.slrm" <<< $'g/YYYY/s/YYYY/'${cm_cos}$'/g\nw\nq'
ed -s "lanceur_scalar.slrm" <<< $'g/XYXY/s/XYXY/'${nepoch_eachjob}$'/g\nw\nq'
ed -s "lanceur_scalar.slrm" <<< $'g/ZXZX/s/ZXZX/'${n_hours}$'/g\nw\nq'
ed -s "lanceur_scalar.slrm" <<< $'g/MEMO/s/MEMO/'${n_mem}$'/g\nw\nq' #added for memory control
ed -s "lanceur_scalar.slrm" <<< $'g/ZZZZ/s/ZZZZ/'${p2p_script_train}$'/g\nw\nq'
ed -s "lanceur_scalar.slrm" <<< $'g/NJOBS/s/NJOBS/'${divide}$'/g\nw\nq'
ed -s "lanceur_scalar.slrm" <<< $'g/LLLL/s/LLLL/'${LAMBDA}$'/g\nw\nq'
ed -s "lanceur_scalar.slrm" <<< $'g/NLNL/s/NLNL/'${n_layers}$'/g\nw\nq'
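#Each ed call above rewrites one placeholder in lanceur_scalar.slrm in place, roughly equivalent to
#  sed -i "s/ZXZX/${n_hours}/g" lanceur_scalar.slrm
#so a hypothetical template line like '#SBATCH --time=ZXZX:00:00' would become '#SBATCH --time=02:00:00' with the defaults above.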
sbatch --job-name=$setup lanceur_scalar.slrm
exit