| 1 | +"""Parallel workflow execution via OAR http://oar.imag.fr |
| 2 | +""" |
| 3 | + |
| 4 | +import os |
| 5 | +import stat |
| 6 | +from time import sleep |
| 7 | +import subprocess |
| 8 | +import json |
| 9 | + |
| 10 | +from .base import (SGELikeBatchManagerBase, logger, iflogger, logging) |
| 11 | + |
| 12 | +from nipype.interfaces.base import CommandLine |
| 13 | + |
| 14 | + |
class OARPlugin(SGELikeBatchManagerBase):
    """Execute using OAR

    The plugin_args input to run can be used to control the OAR execution.
    Currently supported options are:

    - template : template to use for batch job submission
    - oarsub_args : arguments to be prepended to the job execution
      script in the oarsub call
    - max_jobname_len: maximum length of the job name. Default 15.

    """
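
    # Usage sketch (illustrative, not part of the original module): the
    # options above are passed through Workflow.run's plugin_args. It assumes
    # the plugin is registered under the name 'OAR'; the resource string is a
    # made-up example value.
    #
    #     workflow.run(plugin='OAR',
    #                  plugin_args={'oarsub_args': '-l nodes=1,walltime=01:00:00',
    #                               'max_jobname_len': 15})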

    # Additional class variables
    _max_jobname_len = 15
    _oarsub_args = ''

    def __init__(self, **kwargs):
        template = """
# oarsub -J
        """
        self._retry_timeout = 2
        self._max_tries = 2
        self._max_jobname_len = 15
        if 'plugin_args' in kwargs and kwargs['plugin_args']:
            if 'retry_timeout' in kwargs['plugin_args']:
                self._retry_timeout = kwargs['plugin_args']['retry_timeout']
            if 'max_tries' in kwargs['plugin_args']:
                self._max_tries = kwargs['plugin_args']['max_tries']
            if 'max_jobname_len' in kwargs['plugin_args']:
                self._max_jobname_len = \
                    kwargs['plugin_args']['max_jobname_len']
        super(OARPlugin, self).__init__(template, **kwargs)

    def _is_pending(self, taskid):
        # subprocess.Popen requires taskid to be a string
        proc = subprocess.Popen(
            ['oarstat', '-J', '-s',
             '-j', taskid],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        o, e = proc.communicate()
        # oarstat -J is expected to emit JSON keyed by the job id; the job is
        # considered pending as long as its state mentions neither an error
        # nor termination.
        parsed_result = json.loads(o)[taskid].lower()
        is_pending = (
            ('error' not in parsed_result) and
            ('terminated' not in parsed_result)
        )
        return is_pending

    def _submit_batchtask(self, scriptfile, node):
        cmd = CommandLine('oarsub', environ=dict(os.environ),
                          terminal_output='allatonce')
        path = os.path.dirname(scriptfile)
        oarsubargs = ''
        if self._oarsub_args:
            oarsubargs = self._oarsub_args
        if 'oarsub_args' in node.plugin_args:
            if (
                'overwrite' in node.plugin_args and
                node.plugin_args['overwrite']
            ):
                oarsubargs = node.plugin_args['oarsub_args']
            else:
                oarsubargs += (" " + node.plugin_args['oarsub_args'])

        if node._hierarchy:
            jobname = '.'.join((os.environ['LOGNAME'],
                                node._hierarchy,
                                node._id))
        else:
            jobname = '.'.join((os.environ['LOGNAME'],
                                node._id))
        jobnameitems = jobname.split('.')
        jobnameitems.reverse()
        jobname = '.'.join(jobnameitems)
        jobname = jobname[0:self._max_jobname_len]

        # Default the stdout/stderr files and force JSON output (-J) unless
        # the user already supplied these options.
        if '-O' not in oarsubargs:
            oarsubargs = '%s -O %s' % (
                oarsubargs,
                os.path.join(path, jobname + '.stdout')
            )
        if '-E' not in oarsubargs:
            oarsubargs = '%s -E %s' % (
                oarsubargs,
                os.path.join(path, jobname + '.stderr')
            )
        if '-J' not in oarsubargs:
            oarsubargs = '%s -J' % (oarsubargs)

        os.chmod(scriptfile, stat.S_IEXEC | stat.S_IREAD | stat.S_IWRITE)
        cmd.inputs.args = '%s -n %s -S %s' % (
            oarsubargs,
            jobname,
            scriptfile
        )

        oldlevel = iflogger.level
        iflogger.setLevel(logging.getLevelName('CRITICAL'))
        tries = 0
        while True:
            try:
                result = cmd.run()
            except Exception as e:
                if tries < self._max_tries:
                    tries += 1
                    # sleep for retry_timeout seconds and try again
                    sleep(self._retry_timeout)
                else:
                    iflogger.setLevel(oldlevel)
                    raise RuntimeError('\n'.join(
                        ('Could not submit OAR task for node %s' % node._id,
                         str(e))))
            else:
                break
        iflogger.setLevel(oldlevel)
        # retrieve the OAR taskid from the JSON block printed by oarsub -J

        o = ''
        add = False
        for line in result.runtime.stdout.splitlines():
            if line.strip().startswith('{'):
                add = True
            if add:
                o += line + '\n'
            if line.strip().startswith('}'):
                break
        taskid = json.loads(o)['job_id']
        self._pending[taskid] = node.output_dir()
        logger.debug('submitted OAR task: %s for node %s' % (taskid, node._id))
        return taskid