-
Notifications
You must be signed in to change notification settings - Fork 3
/
torchrun.sh
24 lines (20 loc) · 1.05 KB
/
torchrun.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/usr/bin/bash
#
# Start one `srun ... torchrun` per node of the current Slurm allocation and
# wait for all of them.
#
# Usage: torchrun.sh CONFIG [EXTRA_ARGS...]
#   CONFIG      first argument handed to run_training.py
#   EXTRA_ARGS  forwarded unchanged to run_training.py after CONFIG
#
# The node list is the sorted, de-duplicated output of `srun hostname`; the
# first entry is used as --master_addr and each node's index in that list is
# its --node_rank. The rendezvous port is fixed (see `port` below).
#
# Exit status: 0 if every per-node launch succeeded, otherwise the status of
# the first launch that failed (1 if no node could be discovered).

config="$1"
port=12356

# One hostname per line; mapfile avoids word-splitting/globbing the output.
mapfile -t nodes < <(srun hostname | sort -u)
nnodes="${#nodes[@]}"
if (( nnodes == 0 )); then
  # `srun hostname` produced nothing (srun missing, or no allocation).
  echo "error: could not determine node list from 'srun hostname'" >&2
  exit 1
fi
master="${nodes[0]}"

echo '================================================================================'
echo "nodes = ${nodes[*]}"
echo "master = $master"
echo '================================================================================'

pids=()
for i in "${!nodes[@]}"; do
  node="${nodes[$i]}"

  # Build the command once so the logged line and the executed command cannot
  # drift apart. "${@:2}" is quoted so extra arguments containing spaces or
  # glob characters reach run_training.py intact.
  cmd=(
    srun -w "$node" -N1
    torchrun --nproc_per_node=gpu --nnodes="$nnodes" --node_rank="$i"
    --master_addr="$master" --master_port="$port"
    run_training.py "$config" "${@:2}"
  )

  echo '================================================================================'
  echo "$node"
  echo '--------------------------------------------------------------------------------'
  echo "${cmd[*]}" "&"
  "${cmd[@]}" &
  pids+=("$!")
  echo '================================================================================'
done

# A bare `wait` always returns 0; wait on each PID so a failure on any node is
# reflected in this script's exit status.
status=0
for pid in "${pids[@]}"; do
  wait "$pid" || { rc=$?; (( status == 0 )) && status=$rc; }
done
exit "$status"