Skip to content

Commit 5194dc5

Browse files
DavidGOrtega, restyled-commits, 0x2b3bfa0
authored
Runner Fix nvidia setup and restart (#607)
* Runner Fix nvidia setup and restart
* Restyled by shfmt
* fix tests
* sh nitpick
* revert go.mod
* fix tests after nitpick
* dialWithDeadline
* Apply suggestions from code review
* Apply suggestions from code review
* Update golden tests
Co-authored-by: Restyled.io <[email protected]>
Co-authored-by: Helio Machado <[email protected]>
1 parent 27d0daa commit 5194dc5

File tree

7 files changed

+116
-66
lines changed

7 files changed

+116
-66
lines changed

environment/setup.sh

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ if [ ! -f "$FILE" ]; then
1717
sudo usermod -aG docker ubuntu
1818
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock
1919

20+
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
21+
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
22+
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
23+
2024
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
2125
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
2226
sudo apt update && sudo apt-get install -y terraform
@@ -25,16 +29,14 @@ if [ ! -f "$FILE" ]; then
2529
sudo apt update && sudo apt-get install -y nodejs
2630

2731
sudo apt install -y ubuntu-drivers-common
28-
sudo ubuntu-drivers autoinstall
29-
30-
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
31-
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
32-
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
33-
34-
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
35-
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
36-
sudo apt update && sudo apt install -y nvidia-docker2
37-
sudo systemctl restart docker
32+
if ubuntu-drivers devices | grep -q NVIDIA; then
33+
sudo ubuntu-drivers install
34+
35+
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
36+
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
37+
sudo apt update && sudo apt install -y nvidia-docker2
38+
sudo systemctl restart docker
39+
fi
3840

3941
echo OK | sudo tee "$FILE"
4042
fi

iterative/resource_runner.go

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ func resourceRunner() *schema.Resource {
3131
DeleteContext: resourceRunnerDelete,
3232
ReadContext: resourceMachineRead,
3333
Timeouts: &schema.ResourceTimeout{
34-
Create: schema.DefaultTimeout(10 * time.Minute),
34+
Create: schema.DefaultTimeout(20 * time.Minute),
3535
Update: schema.DefaultTimeout(10 * time.Minute),
3636
Delete: schema.DefaultTimeout(10 * time.Minute),
3737
},
@@ -247,27 +247,30 @@ func resourceRunnerCreate(ctx context.Context, d *schema.ResourceData, m interfa
247247

248248
var logError error
249249
var logEvents string
250-
err = resource.Retry(d.Timeout(schema.TimeoutCreate), func() *resource.RetryError {
251-
switch cloud := d.Get("cloud").(string); cloud {
250+
cloud := d.Get("cloud").(string)
251+
ip := d.Get("instance_ip").(string)
252+
err = resource.Retry(d.Timeout(schema.TimeoutCreate)-time.Minute, func() *resource.RetryError {
253+
254+
switch cloud {
252255
case "kubernetes":
253256
logEvents, logError = resourceMachineLogs(ctx, d, m)
254257
default:
255258
logEvents, logError = utils.RunCommand("journalctl --unit cml --no-pager",
256259
2*time.Second,
257-
net.JoinHostPort(d.Get("instance_ip").(string), "22"),
260+
net.JoinHostPort(ip, "22"),
258261
"ubuntu",
259262
d.Get("ssh_private").(string))
260263
}
261264

262-
log.Printf("[DEBUG] Collected log events: %#v", logEvents)
263-
log.Printf("[DEBUG] Connection errors: %#v", logError)
264-
265265
if logError != nil {
266-
return resource.RetryableError(fmt.Errorf("Waiting for the machine to accept connections... %s", logError))
267-
} else if utils.HasStatus(logEvents, "terminated") {
268-
return resource.NonRetryableError(fmt.Errorf("Failed to launch the runner!"))
269-
} else if utils.HasStatus(logEvents, "ready") {
270-
return nil
266+
log.Printf("[DEBUG] Connection errors: %#v", logError)
267+
} else {
268+
log.Printf("[DEBUG] Collected log events: %#v", logEvents)
269+
if utils.HasStatus(logEvents, "terminated") {
270+
return resource.NonRetryableError(fmt.Errorf("Failed to launch the runner!"))
271+
} else if utils.HasStatus(logEvents, "ready") {
272+
return nil
273+
}
271274
}
272275

273276
return resource.RetryableError(fmt.Errorf("Waiting for the runner to be ready..."))
@@ -374,9 +377,11 @@ EOF'
374377
{{- if .cloud}}
375378
sudo systemctl daemon-reload
376379
sudo systemctl enable cml.service
377-
{{- if .instance_gpu}}
378-
nvidia-smi &>/dev/null || reboot
379-
{{- end}}
380+
381+
if ubuntu-drivers devices | grep -q NVIDIA; then
382+
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
383+
fi
384+
380385
sudo systemctl start cml.service
381386
{{- end}}
382387

iterative/testdata/script_template_cloud_aws.golden

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ if [ ! -f "$FILE" ]; then
1818
sudo usermod -aG docker ubuntu
1919
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock
2020

21+
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
22+
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
23+
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
24+
2125
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
2226
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
2327
sudo apt update && sudo apt-get install -y terraform
@@ -26,16 +30,14 @@ if [ ! -f "$FILE" ]; then
2630
sudo apt update && sudo apt-get install -y nodejs
2731

2832
sudo apt install -y ubuntu-drivers-common
29-
sudo ubuntu-drivers autoinstall
33+
if ubuntu-drivers devices | grep -q NVIDIA; then
34+
sudo ubuntu-drivers install
3035

31-
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
32-
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
33-
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
34-
35-
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
36-
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
37-
sudo apt update && sudo apt install -y nvidia-docker2
38-
sudo systemctl restart docker
36+
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
37+
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
38+
sudo apt update && sudo apt install -y nvidia-docker2
39+
sudo systemctl restart docker
40+
fi
3941

4042
echo OK | sudo tee "$FILE"
4143
fi
@@ -73,5 +75,9 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
7375
EOF'
7476
sudo systemctl daemon-reload
7577
sudo systemctl enable cml.service
76-
nvidia-smi &>/dev/null || reboot
78+
79+
if ubuntu-drivers devices | grep -q NVIDIA; then
80+
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
81+
fi
82+
7783
sudo systemctl start cml.service

iterative/testdata/script_template_cloud_azure.golden

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ if [ ! -f "$FILE" ]; then
1818
sudo usermod -aG docker ubuntu
1919
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock
2020

21+
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
22+
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
23+
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
24+
2125
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
2226
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
2327
sudo apt update && sudo apt-get install -y terraform
@@ -26,16 +30,14 @@ if [ ! -f "$FILE" ]; then
2630
sudo apt update && sudo apt-get install -y nodejs
2731

2832
sudo apt install -y ubuntu-drivers-common
29-
sudo ubuntu-drivers autoinstall
33+
if ubuntu-drivers devices | grep -q NVIDIA; then
34+
sudo ubuntu-drivers install
3035

31-
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
32-
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
33-
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
34-
35-
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
36-
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
37-
sudo apt update && sudo apt install -y nvidia-docker2
38-
sudo systemctl restart docker
36+
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
37+
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
38+
sudo apt update && sudo apt install -y nvidia-docker2
39+
sudo systemctl restart docker
40+
fi
3941

4042
echo OK | sudo tee "$FILE"
4143
fi
@@ -74,5 +76,9 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
7476
EOF'
7577
sudo systemctl daemon-reload
7678
sudo systemctl enable cml.service
77-
nvidia-smi &>/dev/null || reboot
79+
80+
if ubuntu-drivers devices | grep -q NVIDIA; then
81+
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
82+
fi
83+
7884
sudo systemctl start cml.service

iterative/testdata/script_template_cloud_gcp.golden

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ if [ ! -f "$FILE" ]; then
1818
sudo usermod -aG docker ubuntu
1919
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock
2020

21+
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
22+
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
23+
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
24+
2125
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
2226
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
2327
sudo apt update && sudo apt-get install -y terraform
@@ -26,16 +30,14 @@ if [ ! -f "$FILE" ]; then
2630
sudo apt update && sudo apt-get install -y nodejs
2731

2832
sudo apt install -y ubuntu-drivers-common
29-
sudo ubuntu-drivers autoinstall
33+
if ubuntu-drivers devices | grep -q NVIDIA; then
34+
sudo ubuntu-drivers install
3035

31-
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
32-
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
33-
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
34-
35-
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
36-
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
37-
sudo apt update && sudo apt install -y nvidia-docker2
38-
sudo systemctl restart docker
36+
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
37+
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
38+
sudo apt update && sudo apt install -y nvidia-docker2
39+
sudo systemctl restart docker
40+
fi
3941

4042
echo OK | sudo tee "$FILE"
4143
fi
@@ -72,5 +74,9 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
7274
EOF'
7375
sudo systemctl daemon-reload
7476
sudo systemctl enable cml.service
75-
nvidia-smi &>/dev/null || reboot
77+
78+
if ubuntu-drivers devices | grep -q NVIDIA; then
79+
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
80+
fi
81+
7682
sudo systemctl start cml.service

iterative/testdata/script_template_cloud_invalid.golden

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ if [ ! -f "$FILE" ]; then
1818
sudo usermod -aG docker ubuntu
1919
sudo setfacl --modify user:ubuntu:rw /var/run/docker.sock
2020

21+
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
22+
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
23+
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
24+
2125
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add -
2226
sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main"
2327
sudo apt update && sudo apt-get install -y terraform
@@ -26,16 +30,14 @@ if [ ! -f "$FILE" ]; then
2630
sudo apt update && sudo apt-get install -y nodejs
2731

2832
sudo apt install -y ubuntu-drivers-common
29-
sudo ubuntu-drivers autoinstall
33+
if ubuntu-drivers devices | grep -q NVIDIA; then
34+
sudo ubuntu-drivers install
3035

31-
get_ecr_helper="curl https://amazon-ecr-credential-helper-releases.s3.us-east-2.amazonaws.com/0.5.0/linux-amd64/docker-credential-ecr-login --output /usr/bin/docker-credential-ecr-login"
32-
chmod_ecr_help="chmod 755 /usr/bin/docker-credential-ecr-login"
33-
sudo systemd-run --same-dir --no-block --service-type=exec bash -c "$get_ecr_help && $chmod_ecr_help"
34-
35-
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
36-
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
37-
sudo apt update && sudo apt install -y nvidia-docker2
38-
sudo systemctl restart docker
36+
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
37+
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu18.04/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
38+
sudo apt update && sudo apt install -y nvidia-docker2
39+
sudo systemctl restart docker
40+
fi
3941

4042
echo OK | sudo tee "$FILE"
4143
fi
@@ -70,5 +72,9 @@ sudo bash -c 'cat << EOF > /etc/systemd/system/cml.service
7072
EOF'
7173
sudo systemctl daemon-reload
7274
sudo systemctl enable cml.service
73-
nvidia-smi &>/dev/null || reboot
75+
76+
if ubuntu-drivers devices | grep -q NVIDIA; then
77+
(sudo modprobe nvidia && sudo nvidia-smi) || sudo reboot
78+
fi
79+
7480
sudo systemctl start cml.service

iterative/utils/ssh.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"crypto/x509"
77
"encoding/pem"
88
"fmt"
9+
"net"
910
"strings"
1011
"time"
1112

@@ -64,7 +65,7 @@ func RunCommand(command string, timeout time.Duration, hostAddress string, userN
6465
Timeout: timeout,
6566
}
6667

67-
client, err := ssh.Dial("tcp", hostAddress, configuration)
68+
client, err := dialWithDeadline("tcp", hostAddress, configuration)
6869
if err != nil {
6970
return "", err
7071
}
@@ -83,3 +84,21 @@ func RunCommand(command string, timeout time.Duration, hostAddress string, userN
8384

8485
return string(output), nil
8586
}
87+
88+
func dialWithDeadline(network string, addr string, config *ssh.ClientConfig) (*ssh.Client, error) {
89+
conn, err := net.DialTimeout(network, addr, config.Timeout)
90+
if err != nil {
91+
return nil, err
92+
}
93+
if config.Timeout > 0 {
94+
conn.SetReadDeadline(time.Now().Add(config.Timeout))
95+
}
96+
c, chans, reqs, err := ssh.NewClientConn(conn, addr, config)
97+
if err != nil {
98+
return nil, err
99+
}
100+
if config.Timeout > 0 {
101+
conn.SetReadDeadline(time.Time{})
102+
}
103+
return ssh.NewClient(c, chans, reqs), nil
104+
}

0 commit comments

Comments
 (0)