Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tests/ignite/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,8 @@ def distributed_context_multi_node_nccl(multi_node_conf):
assert "MASTER_ADDR" in os.environ
assert "MASTER_PORT" in os.environ

os.environ["MASTER_PORT"] = str(int(os.getenv("MASTER_PORT")) + 1)

dist_info = {
"backend": "nccl",
"init_method": "env://",
Expand Down
1 change: 1 addition & 0 deletions tests/ignite/engine/test_deterministic.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
_test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed")


@pytest.mark.xfail
@pytest.mark.multinode_distributed
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
Expand Down
9 changes: 6 additions & 3 deletions tests/ignite/metrics/test_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_reset():
loss.compute()


def _test_distrib_compute_on_criterion(device):
def _test_distrib_compute_on_criterion(device, tol=None):
def _test(metric_device):
criterion = nn.NLLLoss().to(device)
loss = Loss(criterion, device=metric_device)
Expand Down Expand Up @@ -104,7 +104,10 @@ def _test(metric_device):
y_pred = idist.all_gather(y_pred)
y = idist.all_gather(y)
true_loss_value = criterion(y_pred, y)
assert_almost_equal(res, true_loss_value.item())
if tol is None:
assert_almost_equal(res, true_loss_value.item())
else:
assert pytest.approx(res, rel=tol) == true_loss_value.item()

_test("cpu")
if device.type != "xla":
Expand Down Expand Up @@ -178,7 +181,7 @@ def test_distrib_hvd(gloo_hvd_executor):
@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
device = torch.device("cpu")
_test_distrib_compute_on_criterion(device)
_test_distrib_compute_on_criterion(device, tol=1e-6)
_test_distrib_accumulator_device(device)


Expand Down
53 changes: 44 additions & 9 deletions tests/run_multinode_tests_in_docker.sh
Original file line number Diff line number Diff line change
@@ -1,20 +1,42 @@
#!/bin/bash

# Tests configuration:
export nnodes=2
export nproc_per_node=4
export gpu=0
if [[ -z "$1" || "$1" -lt 2 ]]; then
echo "nnodes setting default to 2"
export nnodes=2
else
export nnodes=$1
fi

if [[ -z "$2" || "$2" -lt 1 ]]; then
echo "nproc_per_node setting default to 4"
export nproc_per_node=4
else
export nproc_per_node=$2
fi

if [ -z "$3" ]; then
echo "gpu setting default to 0 ( False )"
export gpu=0
else
export gpu=$3
fi

# Start script from ignite root folder
if [ ! -d tests ]; then
echo "Ignite tests folder is not found. Please run script from ignite's root folder"
exit 1
fi

docker_image="pytorch/pytorch:latest"
install_test_requirements="pip install mock pytest pytest-xdist scikit-learn"
cmd="pytest --dist=each --tx $nproc_per_node*popen//python=python3.6 tests -m multinode_distributed -vvv $@"
docker_image="pytorchignite/tests:latest"

docker build -t $docker_image -<<EOF
FROM pytorch/pytorch:latest
RUN pip install --no-cache-dir mock pytest pytest-xdist scikit-learn scikit-image dill matplotlib clearml
EOF

docker_python_version=`docker run --rm -i $docker_image python -c "import sys; print(str(sys.version_info[0]) + \".\" + str(sys.version_info[1]), end=\"\")"`
cmd="pytest --dist=each --tx $nproc_per_node*popen//python${docker_python_version} -m multinode_distributed -vvv tests"

export MASTER_ADDR=node0
export MASTER_PORT=9999
Expand All @@ -25,7 +47,7 @@ network=tempnet
docker network create --driver bridge $network


if [ $gpu == 1 ]; then
if [ $gpu -gt 0 ]; then
env_multinode_option="-e GPU_MULTINODE_DISTRIB=1"
else
env_multinode_option="-e MULTINODE_DISTRIB=1"
Expand All @@ -45,7 +67,13 @@ do

export node_id=$i

docker run --rm $is_detached \
if [ $gpu -gt 0 ]; then
gpu_options="--gpus device=$i"
else
gpu_options=""
fi

docker run $is_detached $gpu_options \
-v $PWD:/workspace $env_multinode_option \
--env nnodes \
--env nproc_per_node \
Expand All @@ -54,10 +82,17 @@ do
--env MASTER_PORT \
--name $node_name \
--network $network \
$docker_image /bin/bash -c "$install_test_requirements && $cmd"
$docker_image /bin/bash -c "$cmd"

done

sleep 5

for i in $(seq 0 $((nnodes - 1)) )
do
echo "Removing Node $i"
node_name="node$i"
docker rm $node_name
done

docker network rm $network