JuliaParallel · DilumAluthge · May 1, 2025 · Apr 29, 2025
diff --git a/test/error_path_intentionally_fail.jl b/test/error_path_intentionally_fail.jl
@@ -0,0 +1,38 @@
+mktempdir() do tmpdir # temporary directory for the fake `srun`; removed when the block exits
+    fake_bindir = joinpath(tmpdir, "bin")
+    fake_srun = joinpath(tmpdir, "bin", "srun")
+    mkpath(fake_bindir)
+    open(fake_srun, "w") do io # bash script: print to both streams, then exit with status 1
+      println(io, "#!/usr/bin/env bash")
+      println(io, "set -euf -o pipefail")
+      # (to trace the fake srun while debugging, also write a `set -x` line here)
+      println(io, "echo [stdout] fake-srun: INTENTIONALLY ERROR-ING")
+      println(io, "echo [stderr] fake-srun: INTENTIONALLY ERROR-ING >&2")
+      println(io, "exit 1")
+    end
+    chmod(fake_srun, 0o700) # owner-only rwx, so the script is executable (chmod +x)
+    directory_separator = Sys.iswindows() ? ';' : ':' # separator between PATH entries
+    new_env = Dict{String, String}() # overrides that `withenv` applies only around the test
+    new_env["SLURM_NTASKS"] = "8" # presumably consumed by SlurmManager -- TODO confirm
+    new_env["SLURM_JOB_ID"] = "1234"
+    if haskey(ENV, "PATH")
+      old_path = ENV["PATH"]
+      new_env["PATH"] = fake_bindir * directory_separator * old_path # fake bin dir comes first
+    else
+      new_env["PATH"] = fake_bindir
+    end
+
+    @info "with old PATH" Sys.which("srun")
+    withenv(new_env...) do
+      @info "with new PATH" Sys.which("srun")
+      # Feature-detect the exception type instead of comparing version numbers.
+      if isdefined(Base, :TaskFailedException)
+        T_expected = TaskFailedException
+      else
+        T_expected = Base.IOError # expected on a Julia that lacks TaskFailedException
+      end
+
+      mgr = SlurmClusterManager.SlurmManager()
+      @test_throws T_expected Distributed.addprocs(mgr)
+    end
+  end
diff --git a/test/error_path_manager_timeout.jl b/test/error_path_manager_timeout.jl
@@ -0,0 +1,54 @@
+mktempdir() do tmpdir # temporary directory for the fake `srun`; removed when the block exits
+    fake_bindir = joinpath(tmpdir, "bin")
+    fake_srun = joinpath(tmpdir, "bin", "srun")
+    mkpath(fake_bindir)
+    open(fake_srun, "w") do io # bash script: log to stderr, sleep, then fail
+      println(io, "#!/usr/bin/env bash")
+      println(io, "set -euf -o pipefail")
+      # (to trace the fake srun while debugging, also write a `set -x` line here)
+
+      # we only print this to stderr; don't print to stdout, or we won't hit the desired error path
+      # (we'll hit a different error path instead, not the one we want to test)
+      println(io, "echo [stderr] fake-srun: sleeping for 15 seconds... >&2")
+
+      # Sleep for 15 seconds -- much longer than the launch_timeout of 2.0 passed below:
+      println(io, "sleep 15")
+
+      println(io, "echo [stdout] fake-srun: INTENTIONALLY ERROR-ING")
+      println(io, "echo [stderr] fake-srun: INTENTIONALLY ERROR-ING >&2")
+      println(io, "exit 1")
+    end
+    chmod(fake_srun, 0o700) # owner-only rwx, so the script is executable (chmod +x)
+    directory_separator = Sys.iswindows() ? ';' : ':' # separator between PATH entries
+    new_env = Dict{String, String}() # overrides that `withenv` applies only around the test
+    new_env["SLURM_NTASKS"] = "8" # presumably consumed by SlurmManager -- TODO confirm
+    new_env["SLURM_JOB_ID"] = "1234"
+    if haskey(ENV, "PATH")
+      old_path = ENV["PATH"]
+      new_env["PATH"] = fake_bindir * directory_separator * old_path # fake bin dir comes first
+    else
+      new_env["PATH"] = fake_bindir
+    end
+
+    @info "with old PATH" Sys.which("srun")
+    withenv(new_env...) do
+      @info "with new PATH" Sys.which("srun")
+
+      # The root cause is the same either way; only the outer wrapper type can differ.
+      expected_inner_ex_INSTANCE = ErrorException("launch_timeout exceeded")
+      if isdefined(Base, :TaskFailedException) # feature-detect instead of a version check
+        expected_outer_ex_T = TaskFailedException
+      else
+        expected_outer_ex_T = ErrorException
+      end
+
+      mgr = SlurmClusterManager.SlurmManager(; launch_timeout = 2.0)
+      test_result = @test_throws expected_outer_ex_T Distributed.addprocs(mgr)
+
+      cfg = ConfigForTestingTaskFailedException(;
+        expected_outer_ex_T=expected_outer_ex_T,
+        expected_inner_ex_INSTANCE=expected_inner_ex_INSTANCE,
+      )
+      test_task_failed_exception(test_result, cfg) # also checks the unwrapped root cause
+    end
+  end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -6,7 +6,7 @@ import Distributed
 import Test
 
 # Bring some names into scope, just for convenience:
-using Test: @testset, @test, @test_logs
+using Test: @testset, @test, @test_throws, @test_logs, @test_skip, @test_broken
 
 const original_JULIA_DEBUG = strip(get(ENV, "JULIA_DEBUG", ""))
 if isempty(original_JULIA_DEBUG)
@@ -73,3 +73,14 @@ end # testset "SlurmClusterManager.jl"
     )
   end
 end
+
+include("util.jl")
+
+@testset "Test some unhappy paths (error paths)" begin
+  for (name, script) in (
+    "intentionally fail" => "error_path_intentionally_fail.jl",
+    "manager's launch timeout" => "error_path_manager_timeout.jl",
+  ) # one nested testset per error-path script, run in this order
+    @testset "$(name)" begin include(script) end
+  end
+end
diff --git a/test/util.jl b/test/util.jl
@@ -0,0 +1,27 @@
+extract_test_result_value(test_result::Test.Pass) = getproperty(test_result, :value) # value stored on the passing result
+
+# Base case: an exception with no dedicated unwrapping method is already the root cause.
+recursively_unwrap_ex(ex::Exception) = ex
+
+@static if isdefined(Base, :TaskFailedException) # feature-detect instead of a version check
+  # Unwrap a task failure: recurse into the exception stored on the failed task.
+  function recursively_unwrap_ex(outer_ex::TaskFailedException)
+    return recursively_unwrap_ex(outer_ex.task.exception)
+  end
+end
+
+Base.@kwdef struct ConfigForTestingTaskFailedException # keyword-constructed expectations for test_task_failed_exception
+  expected_outer_ex_T # type the observed (outermost) exception is checked against with `isa`
+  expected_inner_ex_INSTANCE # exception the fully unwrapped error must match in type and via `==`
+end
+
+function test_task_failed_exception(test_result::Test.Pass, cfg::ConfigForTestingTaskFailedException)
+  # Value stored on the passing result; treated below as the exception the caller observed.
+  outer_ex = test_result.value
+  @test isa(outer_ex, cfg.expected_outer_ex_T)
+  # Unwrap down to the root cause, then compare it on both type and equality. Returns nothing.
+  inner_ex = recursively_unwrap_ex(outer_ex)
+  @test isa(inner_ex, typeof(cfg.expected_inner_ex_INSTANCE))
+  @test inner_ex == cfg.expected_inner_ex_INSTANCE
+  return nothing
+end