forked from pytorch/ignite
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_multinode_tests_in_docker.sh
98 lines (75 loc) · 2.18 KB
/
run_multinode_tests_in_docker.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/bin/bash
# Tests configuration, read from optional positional arguments:
#   $1  nnodes          number of node containers; integer >= 2, default 2
#   $2  nproc_per_node  processes per node;        integer >= 1, default 4
#   $3  gpu             a value greater than 0 selects the GPU variant;
#                       integer >= 0, default 0
# A value that is missing, is not made of decimal digits only, or is below
# its minimum is replaced by the default (a message is printed when that
# happens). The results are exported as nnodes, nproc_per_node and gpu.

# Prints $1 normalized to base 10 and returns 0 when $1 consists only of
# decimal digits and is >= $2; otherwise prints nothing and returns 1.
# The digits-only check keeps input such as "1.5" or "2x" away from the
# arithmetic comparison (where it would raise a syntax error), and the 10#
# prefix stops a leading zero ("08") from being parsed as octal.
_int_at_least() {
  [[ "$1" =~ ^[0-9]+$ ]] || return 1
  (( 10#$1 >= $2 )) || return 1
  printf '%s' "$((10#$1))"
}

if arg_value=$(_int_at_least "${1:-}" 2); then
  export nnodes=$arg_value
else
  echo "nnodes setting default to 2"
  export nnodes=2
fi

if arg_value=$(_int_at_least "${2:-}" 1); then
  export nproc_per_node=$arg_value
else
  echo "nproc_per_node setting default to 4"
  export nproc_per_node=4
fi

if arg_value=$(_int_at_least "${3:-}" 0); then
  export gpu=$arg_value
else
  echo "gpu setting default to 0 ( False )"
  export gpu=0
fi
# The script has to be launched from ignite's root folder: a "tests"
# directory must exist in the current working directory. When it does not,
# a hint is printed and the script stops with exit status 1.
[[ -d tests ]] || {
  echo "Ignite tests folder is not found. Please run script from ignite's root folder"
  exit 1
}
# Image used for every test node: pytorch/pytorch:latest plus the Python
# packages installed by the RUN line. The Dockerfile is passed to
# `docker build` on stdin (the "-" argument).
docker_image="pytorchignite/tests:latest"
if ! docker build -t "$docker_image" - <<EOF
FROM pytorch/pytorch:latest
RUN pip install --no-cache-dir mock pytest pytest-xdist scikit-learn scikit-image dill matplotlib clearml
EOF
then
  # Without a fresh image the nodes would run against a missing or stale
  # one, so stop here instead of continuing.
  echo "Failed to build docker image $docker_image" >&2
  exit 1
fi

# "major.minor" version of the image's Python interpreter (for example
# "3.10"); it is used to name the interpreter in the pytest --tx spec below.
docker_python_version=$(docker run --rm -i "$docker_image" \
  python -c 'import sys; print("{}.{}".format(*sys.version_info[:2]), end="")')

# Command executed inside each node container: pytest with nproc_per_node
# popen workers, restricted to tests marked "multinode_distributed" under
# the tests folder.
cmd="pytest --dist=each --tx ${nproc_per_node}*popen//python${docker_python_version} -m multinode_distributed -vvv tests"
# Rendezvous endpoint handed to every node container. "node0" is the name
# the launch loop gives to the first container.
export MASTER_ADDR=node0 MASTER_PORT=9999

# User-defined bridge network that all node containers are attached to.
network=tempnet
docker network create --driver bridge "$network"

# Environment flag passed to each container: the CPU variant by default,
# the GPU variant when gpu is greater than 0. The value is kept as a single
# "-e NAME=1" string because the launch command expands it unquoted.
env_multinode_option="-e MULTINODE_DISTRIB=1"
if [ "$gpu" -gt 0 ]; then
  env_multinode_option="-e GPU_MULTINODE_DISTRIB=1"
fi
# Launch one container per node on the bridge network, run the pytest
# command in each of them, then remove the containers and the network.
# The script finally exits with a non-zero status when any node failed to
# start or reported a non-zero exit code, so callers can detect failures.
last_node=$((nnodes - 1))
# First non-zero status seen from any node; 0 means everything passed.
tests_status=0

for ((i = 0; i <= last_node; i++)); do
  echo "Start Node $i"
  node_name="node$i"
  # Exported so that "--env node_id" below forwards it into the container.
  export node_id=$i

  run_options=()
  # Every node but the last is detached; the last one stays in the
  # foreground so the script blocks until its pytest run has ended.
  if ((i < last_node)); then
    run_options+=(-d)
  fi
  # In GPU mode node i is given GPU device i.
  if [ "$gpu" -gt 0 ]; then
    run_options+=(--gpus "device=$i")
  fi

  # env_multinode_option is expanded unquoted on purpose: it holds the two
  # words "-e NAME=1".
  # shellcheck disable=SC2086
  docker run "${run_options[@]}" \
    -v "$PWD:/workspace" $env_multinode_option \
    --env nnodes \
    --env nproc_per_node \
    --env node_id \
    --env MASTER_ADDR \
    --env MASTER_PORT \
    --name "$node_name" \
    --network "$network" \
    "$docker_image" /bin/bash -c "$cmd" || {
      run_status=$?
      if ((tests_status == 0)); then
        tests_status=$run_status
      fi
    }
done

# Give the detached nodes a moment to finish once the foreground node is done.
sleep 5

for ((i = 0; i <= last_node; i++)); do
  node_name="node$i"

  # A detached node reports its result only through the exit code recorded
  # on the container, so read it before the container is removed. The
  # output is empty when the container does not exist.
  node_exit=$(docker inspect --format '{{.State.ExitCode}}' "$node_name" 2>/dev/null)
  if [[ "$node_exit" =~ ^[0-9]+$ ]] && ((node_exit != 0 && tests_status == 0)); then
    tests_status=$node_exit
  fi

  echo "Removing Node $i"
  # -f also removes a node that is still running; a plain "docker rm" would
  # fail on it and leave both the container and the network behind.
  docker rm -f "$node_name"
done

docker network rm "$network"

exit "$tests_status"