diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7b15d44..cfbfb2a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,6 +9,18 @@ repos:
       exclude: meta.yaml
     - id: debug-statements
     - id: end-of-file-fixer
+- repo: https://github.com/pre-commit/pygrep-hooks
+  rev: v1.7.0  # Use the ref you want to point at
+  hooks:
+    - id: python-check-blanket-noqa
+    - id: python-check-mock-methods
+    - id: python-no-eval
+    - id: python-no-log-warn
+    - id: python-use-type-annotations
+    - id: rst-backticks
+    - id: rst-directive-colons
+    - id: rst-inline-touching-normal
+    - id: text-unicode-replacement-char
 - repo: https://github.com/asottile/pyupgrade
   rev: v2.7.4
   hooks:
diff --git a/CHANGES.rst b/CHANGES.rst
index a143a12..df0f778 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -11,6 +11,7 @@ all releases are available on `Anaconda.org
 ------------------
 
 - :gh:`5` fixes the CI and other smaller issues.
+- :gh:`8` aligns pytask-parallel with task priorities in pytask v0.0.11.
 - :gh:`9` enables --max-failures. Closes :gh:`7`.
diff --git a/README.rst b/README.rst
index 395d5d7..e7e6924 100644
--- a/README.rst
+++ b/README.rst
@@ -18,8 +18,8 @@
 pytask-parallel
 ===============
 
-Parallelize the execution of tasks with `pytask-parallel` which is a plugin for `pytask
-`_.
+Parallelize the execution of tasks with ``pytask-parallel`` which is a plugin for
+`pytask `_.
 
 
 Installation
diff --git a/src/pytask_parallel/__init__.py b/src/pytask_parallel/__init__.py
index 81f0fde..4289ec0 100644
--- a/src/pytask_parallel/__init__.py
+++ b/src/pytask_parallel/__init__.py
@@ -1 +1,2 @@
+"""The entry-point for pytask-parallel."""
 __version__ = "0.0.4"
diff --git a/src/pytask_parallel/execute.py b/src/pytask_parallel/execute.py
index 97fbcef..031df03 100644
--- a/src/pytask_parallel/execute.py
+++ b/src/pytask_parallel/execute.py
@@ -3,11 +3,9 @@
 import time
 
 import cloudpickle
-import networkx as nx
 from _pytask.config import hookimpl
 from _pytask.report import ExecutionReport
 from pytask_parallel.backends import PARALLEL_BACKENDS
-from pytask_parallel.scheduler import TopologicalSorter
 
 
 @hookimpl
@@ -19,23 +17,6 @@ def pytask_post_parse(config):
     config["pm"].register(DefaultBackendNameSpace)
 
 
-@hookimpl(tryfirst=True)
-def pytask_execute_create_scheduler(session):
-    """Create the scheduler."""
-    if session.config["n_workers"] > 1:
-        task_names = {task.name for task in session.tasks}
-        task_dict = {
-            name: nx.ancestors(session.dag, name) & task_names for name in task_names
-        }
-        scheduler = TopologicalSorter(task_dict)
-
-        # Forbid to add further nodes and check for cycles. The latter should have been
-        # taken care of while setting up the DAG.
-        scheduler.prepare()
-
-        return scheduler
-
-
 @hookimpl(tryfirst=True)
 def pytask_execute_build(session):
     """Execute tasks with a parallel backend.
@@ -61,7 +42,12 @@ def pytask_execute_build(session):
 
             while session.scheduler.is_active():
                 newly_collected_reports = []
-                ready_tasks = list(session.scheduler.get_ready())
+                n_new_tasks = session.config["n_workers"] - len(running_tasks)
+
+                if n_new_tasks >= 1:
+                    ready_tasks = list(session.scheduler.get_ready(n_new_tasks))
+                else:
+                    ready_tasks = []
 
                 for task_name in ready_tasks:
                     task = session.dag.nodes[task_name]["task"]
@@ -132,6 +118,8 @@ def pytask_execute_build(session):
 
 
 class ProcessesNameSpace:
+    """The name space for hooks related to processes."""
+
     @hookimpl(tryfirst=True)
     def pytask_execute_task(session, task):  # noqa: N805
         """Execute a task.
@@ -156,6 +144,8 @@ def unserialize_and_execute_task(bytes_):
 
 
 class DefaultBackendNameSpace:
+    """The name space for hooks related to threads."""
+
     @hookimpl(tryfirst=True)
     def pytask_execute_task(session, task):  # noqa: N805
         """Execute a task.
diff --git a/src/pytask_parallel/scheduler.py b/src/pytask_parallel/scheduler.py
deleted file mode 100644
index c63b243..0000000
--- a/src/pytask_parallel/scheduler.py
+++ /dev/null
@@ -1,271 +0,0 @@
-"""This module contains the code for the topological sorter.
-
-The topological sorter was implemented in Python 3.9 and this is straight copied from
-the Python source. Maybe a backport will make this module unnecessary.
-
-.. [1]: https://github.com/python/cpython/blob/master/Lib/graphlib.py
-
-"""
-__all__ = ["TopologicalSorter", "CycleError"]
-
-_NODE_OUT = -1
-_NODE_DONE = -2
-
-
-class _NodeInfo:
-    __slots__ = "node", "npredecessors", "successors"
-
-    def __init__(self, node):
-        # The node this class is augmenting.
-        self.node = node
-
-        # Number of predecessors, generally >= 0. When this value falls to 0,
-        # and is returned by get_ready(), this is set to _NODE_OUT and when the
-        # node is marked done by a call to done(), set to _NODE_DONE.
-        self.npredecessors = 0
-
-        # List of successor nodes. The list can contain duplicated elements as
-        # long as they're all reflected in the successor's npredecessors attribute).
-        self.successors = []
-
-
-class CycleError(ValueError):
-    """Subclass of ValueError raised by TopologicalSorter if cycles exist in the graph
-
-    If multiple cycles exist, only one undefined choice among them will be reported
-    and included in the exception. The detected cycle can be accessed via the second
-    element in the *args* attribute of the exception instance and consists in a list
-    of nodes, such that each node is, in the graph, an immediate predecessor of the
-    next node in the list. In the reported list, the first and the last node will be
-    the same, to make it clear that it is cyclic.
-
-    """
-
-    pass
-
-
-class TopologicalSorter:
-    """Provides functionality to topologically sort a graph of hashable nodes"""
-
-    def __init__(self, graph=None):
-        self._node2info = {}
-        self._ready_nodes = None
-        self._npassedout = 0
-        self._nfinished = 0
-
-        if graph is not None:
-            for node, predecessors in graph.items():
-                self.add(node, *predecessors)
-
-    def _get_nodeinfo(self, node):
-        result = self._node2info.get(node)
-        if result is None:
-            self._node2info[node] = result = _NodeInfo(node)
-        return result
-
-    def add(self, node, *predecessors):
-        """Add a new node and its predecessors to the graph.
-
-        Both the *node* and all elements in *predecessors* must be hashable.
-
-        If called multiple times with the same node argument, the set of dependencies
-        will be the union of all dependencies passed in.
-
-        It is possible to add a node with no dependencies (*predecessors* is not
-        provided) as well as provide a dependency twice. If a node that has not been
-        provided before is included among *predecessors* it will be automatically added
-        to the graph with no predecessors of its own.
-
-        Raises
-        ------
-        ValueError
-            If called after "prepare".
- - """ - if self._ready_nodes is not None: - raise ValueError("Nodes cannot be added after a call to prepare()") - - # Create the node -> predecessor edges - nodeinfo = self._get_nodeinfo(node) - nodeinfo.npredecessors += len(predecessors) - - # Create the predecessor -> node edges - for pred in predecessors: - pred_info = self._get_nodeinfo(pred) - pred_info.successors.append(node) - - def prepare(self): - """Mark the graph as finished and check for cycles in the graph. - - If any cycle is detected, "CycleError" will be raised, but "get_ready" can - still be used to obtain as many nodes as possible until cycles block more - progress. After a call to this function, the graph cannot be modified and - therefore no more nodes can be added using "add". - - """ - if self._ready_nodes is not None: - raise ValueError("cannot prepare() more than once") - - self._ready_nodes = [ - i.node for i in self._node2info.values() if i.npredecessors == 0 - ] - # ready_nodes is set before we look for cycles on purpose: - # if the user wants to catch the CycleError, that's fine, - # they can continue using the instance to grab as many - # nodes as possible before cycles block more progress - cycle = self._find_cycle() - if cycle: - raise CycleError(f"nodes are in a cycle {cycle}") - - def get_ready(self): - """Return a tuple of all the nodes that are ready. - - Initially it returns all nodes with no predecessors; once those are marked as - processed by calling "done", further calls will return all new nodes that have - all their predecessors already processed. Once no more progress can be made, - empty tuples are returned. - - Raises - ------ - ValueError - If called without calling "prepare" previously. - - """ - if self._ready_nodes is None: - raise ValueError("prepare() must be called first") - - # Get the nodes that are ready and mark them - result = tuple(self._ready_nodes) - n2i = self._node2info - for node in result: - n2i[node].npredecessors = _NODE_OUT - - # Clean the list of nodes that are ready and update - # the counter of nodes that we have returned. - self._ready_nodes.clear() - self._npassedout += len(result) - - return result - - def is_active(self): - """Return True if more progress can be made and ``False`` otherwise. - - Progress can be made if cycles do not block the resolution and either there - are still nodes ready that haven't yet been returned by "get_ready" or the - number of nodes marked "done" is less than the number that have been returned - by "get_ready". - - Raises - ------ - ValueError - If called without calling "prepare" previously. - - """ - if self._ready_nodes is None: - raise ValueError("prepare() must be called first") - return self._nfinished < self._npassedout or bool(self._ready_nodes) - - def __bool__(self): - return self.is_active() - - def done(self, *nodes): - """Marks a set of nodes returned by "get_ready" as processed. - - This method unblocks any successor of each node in *nodes* for being returned - in the future by a a call to "get_ready" - - Raises :exec:`ValueError` if any node in *nodes* has already been marked as - processed by a previous call to this method, if a node was not added to the - graph by using "add" or if called without calling "prepare" previously or if - node has not yet been returned by "get_ready". 
- - """ - if self._ready_nodes is None: - raise ValueError("prepare() must be called first") - - n2i = self._node2info - - for node in nodes: - - # Check if we know about this node (it was added previously using add() - nodeinfo = n2i.get(node) - if nodeinfo is None: - raise ValueError(f"node {node!r} was not added using add()") - - # If the node has not being returned (marked as ready) previously, inform - # the user. - stat = nodeinfo.npredecessors - if stat != _NODE_OUT: - if stat >= 0: - raise ValueError( - f"node {node!r} was not passed out (still not ready)" - ) - elif stat == _NODE_DONE: - raise ValueError(f"node {node!r} was already marked done") - else: - raise ValueError(f"node {node!r}: unknown status {stat}") - - # Mark the node as processed - nodeinfo.npredecessors = _NODE_DONE - - # Go to all the successors and reduce the number of predecessors, collecting - # all the ones that are ready to be returned in the next get_ready() call. - for successor in nodeinfo.successors: - successor_info = n2i[successor] - successor_info.npredecessors -= 1 - if successor_info.npredecessors == 0: - self._ready_nodes.append(successor) - self._nfinished += 1 - - def _find_cycle(self): - n2i = self._node2info - stack = [] - itstack = [] - seen = set() - node2stacki = {} - - for node in n2i: - if node in seen: - continue - - while True: - if node in seen: - # If we have seen already the node and is in the - # current stack we have found a cycle. - if node in node2stacki: - return stack[node2stacki[node] :] + [node] - # else go on to get next successor - else: - seen.add(node) - itstack.append(iter(n2i[node].successors).__next__) - node2stacki[node] = len(stack) - stack.append(node) - - # Backtrack to the topmost stack entry with - # at least another successor. - while stack: - try: - node = itstack[-1]() - break - except StopIteration: - del node2stacki[stack.pop()] - itstack.pop() - else: - break - return None - - def static_order(self): - """Returns an iterable of nodes in a topological order. - - The particular order that is returned may depend on the specific order in which - the items were inserted in the graph. - - Using this method does not require to call "prepare" or "done". If any cycle is - detected, :exc:`CycleError` will be raised. 
- - """ - self.prepare() - while self.is_active(): - node_group = self.get_ready() - yield from node_group - self.done(*node_group) diff --git a/tests/test_execute.py b/tests/test_execute.py index a5d6ff4..493c73f 100644 --- a/tests/test_execute.py +++ b/tests/test_execute.py @@ -2,6 +2,7 @@ import textwrap from time import time +import attr import pytest from pytask import cli from pytask import main @@ -10,9 +11,9 @@ from pytask_parallel.execute import ProcessesNameSpace +@attr.s class DummyTask: - def __init__(self, function): - self.function = function + function = attr.ib() def execute(self): self.function() @@ -170,10 +171,14 @@ def task_2(produces): def test_stop_execution_when_max_failures_is_reached(tmp_path, parallel_backend): source = """ import time + import pytask - def task_3(): raise NotImplmentedError + def task_1(): time.sleep(1) def task_2(): time.sleep(2); raise NotImplementedError - def task_1(): pass + + @pytask.mark.try_last + def task_3(): + time.sleep(3) """ tmp_path.joinpath("task_dummy.py").write_text(textwrap.dedent(source)) @@ -188,3 +193,45 @@ def task_1(): pass assert len(session.tasks) == 3 assert len(session.execution_reports) == 2 + + +@pytest.mark.end_to_end +@pytest.mark.parametrize("parallel_backend", PARALLEL_BACKENDS) +def test_task_priorities(tmp_path, parallel_backend): + source = """ + import pytask + import time + + @pytask.mark.try_first + def task_0(): + time.sleep(1) + + def task_1(): + time.sleep(1) + + @pytask.mark.try_last + def task_2(): + time.sleep(1) + + @pytask.mark.try_first + def task_3(): + time.sleep(1) + + def task_4(): + time.sleep(1) + + @pytask.mark.try_last + def task_5(): + time.sleep(1) + """ + tmp_path.joinpath("task_dummy.py").write_text(textwrap.dedent(source)) + + session = main( + {"paths": tmp_path, "parallel_backend": parallel_backend, "n_workers": 2} + ) + + assert session.exit_code == 0 + first_task_name = session.execution_reports[0].task.name + assert first_task_name.endswith("task_0") or first_task_name.endswith("task_3") + last_task_name = session.execution_reports[-1].task.name + assert last_task_name.endswith("task_2") or last_task_name.endswith("task_5")