# Patch for single_stage_detector/ssd/run_and_time.sh (two hunks).
#
# Hunk 1 (around line 38): adds the setting DGXNGPU next to the other
#   defaulted settings (LOG_INTERVAL, DATASET_DIR, TORCH_HOME). It is written
#   as ${DGXNGPU:-1}, so a value already present in the environment is kept
#   and an unset or empty value becomes 1.
#
# Hunk 2 (around line 76): in the "Mode 2: Single-node Docker" branch, the
#   torchrun command array now passes --nproc_per_node=${DGXNGPU} instead of
#   the hard-coded --nproc_per_node=1. With DGXNGPU left at its default the
#   resulting command line is the same as before this patch.
#
# NOTE(review): DGXNGPU presumably carries the per-node GPU count -- confirm
#   against the launch configuration that sets it.
# NOTE(review): the patch text below appears to have lost its original line
#   breaks (it is a single physical line); restore them before trying to
#   apply it with a patch tool.
diff --git a/single_stage_detector/ssd/run_and_time.sh b/single_stage_detector/ssd/run_and_time.sh index 85d6e110d..cdf55e3ba 100755 --- a/single_stage_detector/ssd/run_and_time.sh +++ b/single_stage_detector/ssd/run_and_time.sh @@ -38,6 +38,7 @@ NUMEPOCHS=${NUMEPOCHS:-30} LOG_INTERVAL=${LOG_INTERVAL:-20} DATASET_DIR=${DATASET_DIR:-"/datasets/open-images-v6-mlperf"} TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"} +DGXNGPU=${DGXNGPU:-1} # Handle MLCube parameters while [ $# -gt 0 ]; do @@ -76,7 +77,7 @@ if [ -n "${SLURM_LOCALID-}" ]; then fi else # Mode 2: Single-node Docker; need to launch tasks with torchrun - CMD=( "torchrun" "--standalone" "--nnodes=1" "--nproc_per_node=1" ) + CMD=( "torchrun" "--standalone" "--nnodes=1" "--nproc_per_node=${DGXNGPU}" ) [ "$MEMBIND" = false ] && CMD+=( "--no_membind" ) fi