Skip to content

Commit 0019d02

Browse files
jan-janssen, pyiron-runner, pre-commit-ci[bot]
authored
Define error log file in resource_dict (#736)
* Define error log file in resource_dict * fix tests * Format black * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add docstrings * Add HDF test * test error function * new test --------- Co-authored-by: pyiron-runner <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent a807fd9 commit 0019d02

File tree

19 files changed

+145
-2
lines changed

19 files changed

+145
-2
lines changed

.github/workflows/pipeline.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ jobs:
402402
run: echo -e "channels:\n - conda-forge\n" > .condarc
403403
- uses: conda-incubator/setup-miniconda@v3
404404
with:
405-
python-version: '3.9'
405+
python-version: '3.10'
406406
miniforge-version: latest
407407
condarc-file: .condarc
408408
environment-file: .ci_support/environment-old.yml

executorlib/backend/cache_parallel.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import cloudpickle
66

7+
from executorlib.standalone.error import backend_write_error_file
78
from executorlib.task_scheduler.file.backend import (
89
backend_load_file,
910
backend_write_file,
@@ -53,6 +54,10 @@ def main() -> None:
5354
output={"error": error},
5455
runtime=time.time() - time_start,
5556
)
57+
backend_write_error_file(
58+
error=error,
59+
apply_dict=apply_dict,
60+
)
5661
else:
5762
if mpi_rank_zero:
5863
backend_write_file(

executorlib/backend/interactive_parallel.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import cloudpickle
77
import zmq
88

9+
from executorlib.standalone.error import backend_write_error_file
910
from executorlib.standalone.interactive.backend import call_funct, parse_arguments
1011
from executorlib.standalone.interactive.communication import (
1112
interface_connect,
@@ -82,6 +83,10 @@ def main() -> None:
8283
socket=socket,
8384
result_dict={"error": error},
8485
)
86+
backend_write_error_file(
87+
error=error,
88+
apply_dict=input_dict,
89+
)
8590
else:
8691
# Send output
8792
if mpi_rank_zero:

executorlib/backend/interactive_serial.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from os.path import abspath
33
from typing import Optional
44

5+
from executorlib.standalone.error import backend_write_error_file
56
from executorlib.standalone.interactive.backend import call_funct, parse_arguments
67
from executorlib.standalone.interactive.communication import (
78
interface_connect,
@@ -58,6 +59,10 @@ def main(argument_lst: Optional[list[str]] = None):
5859
socket=socket,
5960
result_dict={"error": error},
6061
)
62+
backend_write_error_file(
63+
error=error,
64+
apply_dict=input_dict,
65+
)
6166
else:
6267
# Send output
6368
interface_send(socket=socket, result_dict={"result": output})

executorlib/executor/flux.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ class FluxJobExecutor(BaseExecutor):
4141
Defaults to None.
4242
- exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing
4343
compute nodes. Defaults to False.
44+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
45+
by the Python functions submitted to the Executor.
4446
flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
4547
flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
4648
flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
@@ -126,6 +128,8 @@ def __init__(
126128
Defaults to None.
127129
- exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing
128130
compute nodes. Defaults to False.
131+
- error_log_file (str): Name of the error log file to use for storing exceptions
132+
raised by the Python functions submitted to the Executor.
129133
flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
130134
flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
131135
flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.
@@ -229,6 +233,8 @@ class FluxClusterExecutor(BaseExecutor):
229233
- openmpi_oversubscribe (bool): adds the `--oversubscribe` command line flag (OpenMPI and
230234
SLURM only) - default False
231235
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only)
236+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
237+
by the Python functions submitted to the Executor.
232238
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
233239
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
234240
context of an HPC cluster this is essential to be able to communicate to an
@@ -308,6 +314,8 @@ def __init__(
308314
and SLURM only) - default False
309315
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM
310316
only)
317+
- error_log_file (str): Name of the error log file to use for storing exceptions
318+
raised by the Python functions submitted to the Executor.
311319
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
312320
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
313321
context of an HPC cluster this is essential to be able to communicate to an
@@ -424,6 +432,8 @@ def create_flux_executor(
424432
Defaults to None.
425433
- exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing
426434
compute nodes. Defaults to False.
435+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
436+
by the Python functions submitted to the Executor.
427437
flux_executor (flux.job.FluxExecutor): Flux Python interface to submit the workers to flux
428438
flux_executor_pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None (Flux only)
429439
flux_executor_nesting (bool): Provide hierarchically nested Flux job scheduler inside the submitted function.

executorlib/executor/single.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ class SingleNodeExecutor(BaseExecutor):
3939
- openmpi_oversubscribe (bool): adds the `--oversubscribe` command line flag (OpenMPI and
4040
SLURM only) - default False
4141
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only)
42+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
43+
by the Python functions submitted to the Executor.
4244
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
4345
context of an HPC cluster this is essential to be able to communicate to an
4446
Executor running on a different compute node within the same allocation. And
@@ -116,6 +118,8 @@ def __init__(
116118
and SLURM only) - default False
117119
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM
118120
only)
121+
- error_log_file (str): Name of the error log file to use for storing exceptions
122+
raised by the Python functions submitted to the Executor.
119123
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
120124
context of an HPC cluster this is essential to be able to communicate to an
121125
Executor running on a different compute node within the same allocation. And
@@ -202,6 +206,8 @@ class TestClusterExecutor(BaseExecutor):
202206
- threads_per_core (int): number of OpenMP threads to be used for each function call
203207
- gpus_per_core (int): number of GPUs per worker - defaults to 0
204208
- cwd (str/None): current working directory where the parallel python task is executed
209+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
210+
by the Python functions submitted to the Executor.
205211
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
206212
context of an HPC cluster this is essential to be able to communicate to an
207213
Executor running on a different compute node within the same allocation. And
@@ -273,6 +279,8 @@ def __init__(
273279
- threads_per_core (int): number of OpenMP threads to be used for each function call
274280
- gpus_per_core (int): number of GPUs per worker - defaults to 0
275281
- cwd (str/None): current working directory where the parallel python task is executed
282+
- error_log_file (str): Name of the error log file to use for storing exceptions
283+
raised by the Python functions submitted to the Executor.
276284
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
277285
context of an HPC cluster this is essential to be able to communicate to an
278286
Executor running on a different compute node within the same allocation. And
@@ -381,6 +389,8 @@ def create_single_node_executor(
381389
and SLURM only) - default False
382390
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM
383391
only)
392+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
393+
by the Python functions submitted to the Executor.
384394
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
385395
context of an HPC cluster this is essential to be able to communicate to an
386396
Executor running on a different compute node within the same allocation. And

executorlib/executor/slurm.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ class SlurmClusterExecutor(BaseExecutor):
4141
- openmpi_oversubscribe (bool): adds the `--oversubscribe` command line flag (OpenMPI and
4242
SLURM only) - default False
4343
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM only)
44+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
45+
by the Python functions submitted to the Executor.
4446
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
4547
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
4648
context of an HPC cluster this is essential to be able to communicate to an
@@ -120,6 +122,8 @@ def __init__(
120122
and SLURM only) - default False
121123
- slurm_cmd_args (list): Additional command line arguments for the srun call (SLURM
122124
only)
125+
- error_log_file (str): Name of the error log file to use for storing exceptions
126+
raised by the Python functions submitted to the Executor.
123127
pysqa_config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend).
124128
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
125129
context of an HPC cluster this is essential to be able to communicate to an
@@ -226,6 +230,8 @@ class SlurmJobExecutor(BaseExecutor):
226230
Defaults to None.
227231
- exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing
228232
compute nodes. Defaults to False.
233+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
234+
by the Python functions submitted to the Executor.
229235
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
230236
context of an HPC cluster this is essential to be able to communicate to an
231237
Executor running on a different compute node within the same allocation. And
@@ -307,6 +313,8 @@ def __init__(
307313
Defaults to None.
308314
- exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing
309315
compute nodes. Defaults to False.
316+
- error_log_file (str): Name of the error log file to use for storing exceptions
317+
raised by the Python functions submitted to the Executor.
310318
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
311319
context of an HPC cluster this is essential to be able to communicate to an
312320
Executor running on a different compute node within the same allocation. And
@@ -408,6 +416,8 @@ def create_slurm_executor(
408416
Defaults to None.
409417
- exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing
410418
compute nodes. Defaults to False.
419+
- error_log_file (str): Name of the error log file to use for storing exceptions raised
420+
by the Python functions submitted to the Executor.
411421
hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the
412422
context of an HPC cluster this is essential to be able to communicate to an
413423
Executor running on a different compute node within the same allocation. And

executorlib/standalone/cache.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
"error": "error",
1111
"runtime": "runtime",
1212
"queue_id": "queue_id",
13+
"error_log_file": "error_log_file",
1314
}
1415

1516

executorlib/standalone/error.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import traceback
2+
3+
4+
def backend_write_error_file(error: Exception, apply_dict: dict) -> None:
    """
    Append a report about a failed function call to the configured error log file.

    Nothing is written unless apply_dict holds a non-None "error_log_file" entry.
    Otherwise the function, its positional and keyword arguments, and the formatted
    traceback of the error are appended to that file.

    Args:
        error (Exception): The exception raised by the submitted function.
        apply_dict (dict): Task dictionary. The keys "error_log_file", "fn", "args" and
                           "kwargs" are read; missing "fn"/"args"/"kwargs" entries are
                           reported as None.

    Returns:
        None
    """
    error_log_file = apply_dict.get("error_log_file")
    if error_log_file is None:
        return
    # Use .get() for the call description: this runs inside an exception handler, and
    # a KeyError raised here would hide the error that is actually being reported.
    header = [
        "function: " + str(apply_dict.get("fn")) + "\n",
        "args: " + str(apply_dict.get("args")) + "\n",
        "kwargs: " + str(apply_dict.get("kwargs")) + "\n",
    ]
    # The explicit (type, value, traceback) form is accepted by every Python 3 release,
    # whereas passing only the exception object requires Python >= 3.10.
    trace = traceback.format_exception(type(error), error, error.__traceback__)
    # Fixed encoding so that a non-ASCII repr of the arguments cannot fail on platforms
    # whose locale encoding is not UTF-8.
    with open(error_log_file, "a", encoding="utf-8") as f:
        f.writelines(header + trace)

executorlib/task_scheduler/file/backend.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import time
33
from typing import Any
44

5+
from executorlib.standalone.error import backend_write_error_file
56
from executorlib.task_scheduler.file.hdf import dump, load
67
from executorlib.task_scheduler.file.shared import FutureItem
78

@@ -77,6 +78,10 @@ def backend_execute_task_in_file(file_name: str) -> None:
7778
}
7879
except Exception as error:
7980
result = {"error": error}
81+
backend_write_error_file(
82+
error=error,
83+
apply_dict=apply_dict,
84+
)
8085

8186
backend_write_file(
8287
file_name=file_name,

0 commit comments

Comments
 (0)