|
#!/bin/bash
#
# Starts NUM_NODES Docker containers (node0, node1, ...) on a shared Docker
# network, assigns NUM_GPUS GPUs to each of them, and runs one command per
# container.
#
# Usage: <script> NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND...
#   NUM_NODES     number of node containers to start
#   NUM_GPUS      number of GPUs given to each node
#   DOCKER_IMAGE  image every node container is started from
#   COMMAND...    exactly NUM_NODES commands; COMMAND i is run on node i

set -euox pipefail

if [[ $# -lt 3 ]]; then
  echo "Please provide the number of nodes, the number of GPUs per node, and the Docker image." >&2
  exit 1
fi

NUM_NODES=$1
NUM_GPUS=$2
DOCKER_IMAGE=$3

# Both counts are used in shell arithmetic later on, so reject anything that is
# not a plain decimal integer up front (a leading zero would be read as octal).
count_pattern='^(0|[1-9][0-9]*)$'
if [[ ! "$NUM_NODES" =~ $count_pattern || ! "$NUM_GPUS" =~ $count_pattern ]]; then
  echo "The number of nodes and the number of GPUs per node must be non-negative integers." >&2
  echo "Number of nodes: $NUM_NODES" >&2
  echo "Number of GPUs per node: $NUM_GPUS" >&2
  exit 1
fi

# Everything after the first three arguments is the per-node command list.
shift 3
COMMANDS=("$@")
if (( ${#COMMANDS[@]} != NUM_NODES )); then
  echo "The number of commands must be equal to the number of nodes." >&2
  echo "Number of nodes: $NUM_NODES" >&2
  echo "Number of commands: ${#COMMANDS[@]}" >&2
  exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
  # Quoted so the command is printed verbatim (no word splitting or globbing).
  printf '%s\n' "$command"
done
| 27 | + |
# Creates the Docker network "docker-net" with subnet 192.168.10.0/24.
# The node containers are attached to this network by name.
start_network() {
  local -r net_name='docker-net'
  local -r net_subnet='192.168.10.0/24'
  docker network create "--subnet=${net_subnet}" "${net_name}"
}
| 31 | + |
# Starts one detached, idle container per node.
#
# Node i is named "node<i>", gets the static address 192.168.10.<10 + i> on
# docker-net, and is given the GPU indices i*NUM_GPUS .. (i+1)*NUM_GPUS - 1.
# Each container just runs `tail -f /dev/null` so it stays alive until stopped;
# --rm removes it once it stops.
#
# Globals: NUM_NODES, NUM_GPUS, DOCKER_IMAGE (read)
start_nodes() {
  local node node_gpu gpu_devices
  for ((node = 0; node < NUM_NODES; node++)); do
    # Build the comma-separated list of GPU indices owned by this node.
    gpu_devices=''
    for ((node_gpu = 0; node_gpu < NUM_GPUS; node_gpu++)); do
      gpu_devices+="${gpu_devices:+,}$((node * NUM_GPUS + node_gpu))"
    done
    # The double quotes are deliberately part of the value handed to --gpus:
    # Docker documents '"device=0,2"' as the form for a multi-device list.
    gpu_devices="\"device=${gpu_devices}\""
    docker run -d --gpus "$gpu_devices" --name "node${node}" \
      --network docker-net --ip "192.168.10.$((10 + node))" \
      --rm "$DOCKER_IMAGE" tail -f /dev/null
  done
}
| 47 | + |
# Runs COMMANDS[i] inside container node<i> via `docker exec ... /bin/bash -c`.
#
# Every node except the last is exec'd detached (-d); the last node's command
# runs in the foreground, so this function returns only when that command
# finishes and its exit status becomes the function's status.
#
# Globals: NUM_NODES, NUM_GPUS, COMMANDS (read)
# Outputs: one "Running node<i> ..." line per node on stdout
run_nodes() {
  local node node_gpu gpu_devices
  local -r last_node=$((NUM_NODES - 1))
  for ((node = 0; node < NUM_NODES; node++)); do
    # Recompute the GPU list for this node; it is used only for the log line.
    gpu_devices=''
    for ((node_gpu = 0; node_gpu < NUM_GPUS; node_gpu++)); do
      gpu_devices+="${gpu_devices:+,}$((node * NUM_GPUS + node_gpu))"
    done
    gpu_devices="\"device=${gpu_devices}\""
    echo "Running node${node} with GPU devices: ${gpu_devices}"
    if ((node < last_node)); then
      docker exec -d "node${node}" /bin/bash -c "${COMMANDS[node]}"
    else
      docker exec "node${node}" /bin/bash -c "${COMMANDS[node]}"
    fi
  done
}
# Tears down everything the script created: stops node0 .. node<NUM_NODES-1>
# and removes the docker-net network. Installed as the script's EXIT trap.
#
# Each step is best-effort. The script runs under `set -e`, and the trap also
# fires after a partial start-up where some containers or the network do not
# exist; a failing `docker stop` must not abort the handler and leave the
# remaining containers and the network behind.
#
# Globals: NUM_NODES (read)
# Outputs: a warning on stderr for every teardown step that fails
cleanup() {
  local node
  for ((node = 0; node < NUM_NODES; node++)); do
    docker stop "node${node}" \
      || echo "Warning: failed to stop node${node}" >&2
  done
  docker network rm docker-net \
    || echo "Warning: failed to remove network docker-net" >&2
}
# Register the teardown handler before anything is created, so it runs on
# every exit path of the script.
trap cleanup EXIT

# Bring the setup up and run the workload, strictly in this order:
# network first, then the idle node containers, then the per-node commands.
for stage in start_network start_nodes run_nodes; do
  "${stage}"
done
| 77 | + |
0 commit comments