Skip to content

Add support for dask. #86

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ releases are available on [PyPI](https://pypi.org/project/pytask-parallel) and
## 0.5.0 - 2024-xx-xx

- {pull}`85` simplifies code since loky is a dependency.
- {pull}`86` adds support for dask.
- {pull}`88` updates handling `Traceback`.
- {pull}`89` restructures the package.
- {pull}`92` redirects stdout and stderr from processes and loky and shows them in error
Expand Down
6 changes: 6 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ dependencies:
- loky
- optree

# Additional dependencies
- universal_pathlib <0.2
- s3fs>=2023.4.0
- coiled
- distributed

# Misc
- tox
- ipywidgets
Expand Down
9 changes: 6 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,12 @@ name = "Tobias Raabe"
email = "[email protected]"

[project.optional-dependencies]
dask = ["dask[complete]", "distributed"]
test = [
"nbmake",
"pytest",
"pytest-cov",
"pytask-parallel[all]",
"nbmake",
"pytest",
"pytest-cov",
]

[project.readme]
Expand Down Expand Up @@ -112,6 +114,7 @@ force-single-line = true
convention = "numpy"

[tool.pytest.ini_options]
addopts = ["--nbmake"]
# Do not add src since it messes with the loading of pytask-parallel as a plugin.
testpaths = ["tests"]
markers = [
Expand Down
59 changes: 48 additions & 11 deletions src/pytask_parallel/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import warnings
from concurrent.futures import Executor
from concurrent.futures import Future
from concurrent.futures import ProcessPoolExecutor
Expand All @@ -12,6 +13,7 @@
from typing import ClassVar

import cloudpickle
from attrs import define
from loky import get_reusable_executor

__all__ = ["ParallelBackend", "ParallelBackendRegistry", "registry"]
Expand All @@ -27,7 +29,7 @@ def _deserialize_and_run_with_cloudpickle(fn: bytes, kwargs: bytes) -> Any:
class _CloudpickleProcessPoolExecutor(ProcessPoolExecutor):
"""Patches the standard executor to serialize functions with cloudpickle."""

# The type signature is wrong for version above Py3.7. Fix when 3.7 is deprecated.
# The type signature is wrong for Python >3.8. Fix when support is dropped.
def submit( # type: ignore[override]
self,
fn: Callable[..., Any],
Expand All @@ -42,15 +44,54 @@ def submit( # type: ignore[override]
)


def _get_dask_executor(n_workers: int) -> Executor:
    """Get an executor from a dask client.

    Reuses the currently active :class:`distributed.Client` if one exists;
    otherwise starts a local cluster with ``n_workers`` workers. When an
    existing cluster is reused, its worker count wins over the requested one
    and a warning is emitted if the two disagree.
    """
    _rich_traceback_omit = True
    from pytask import import_optional_dependency

    distributed = import_optional_dependency("distributed")

    client = None
    try:
        # Prefer a client the user has already created in this session.
        client = distributed.Client.current()
    except ValueError:
        pass

    if client is None:
        # No active client yet: spin up a local cluster sized to the request.
        local_cluster = distributed.LocalCluster(n_workers=n_workers)
        client = distributed.Client(local_cluster)
    elif client.cluster:
        # An existing cluster's size takes precedence over ``n_workers``.
        active_workers = len(client.cluster.workers)
        if active_workers != n_workers:
            warnings.warn(
                "The number of workers in the dask cluster "
                f"({active_workers}) does not match the number of workers "
                f"requested ({n_workers}). The requested number of workers will be "
                "ignored.",
                stacklevel=1,
            )
    return client.get_executor()


def _get_loky_executor(n_workers: int) -> Executor:
    """Return loky's reusable process executor sized to ``n_workers``."""
    executor: Executor = get_reusable_executor(max_workers=n_workers)
    return executor


def _get_process_pool_executor(n_workers: int) -> Executor:
    """Return a cloudpickle-serializing process pool with ``n_workers`` processes."""
    pool: Executor = _CloudpickleProcessPoolExecutor(max_workers=n_workers)
    return pool


def _get_thread_pool_executor(n_workers: int) -> Executor:
"""Get a thread pool executor."""
return ThreadPoolExecutor(max_workers=n_workers)


class ParallelBackend(Enum):
    """Choices for parallel backends.

    The string values are what users supply in the configuration; each member
    maps to an executor factory registered in the module-level ``registry``.
    """

    # User-registered backend; the factory is supplied externally via the registry.
    CUSTOM = "custom"
    # dask.distributed client/cluster executor.
    DASK = "dask"
    # loky's reusable process-pool executor.
    LOKY = "loky"
    # Stdlib process pool patched to serialize with cloudpickle.
    PROCESSES = "processes"
    # Stdlib thread-pool executor.
    THREADS = "threads"


@define
class ParallelBackendRegistry:
"""Registry for parallel backends."""

Expand All @@ -68,23 +109,19 @@ def get_parallel_backend(self, kind: ParallelBackend, n_workers: int) -> Executo
try:
return self.registry[kind](n_workers=n_workers)
except KeyError:
msg = f"No registered parallel backend found for kind {kind}."
msg = f"No registered parallel backend found for kind {kind.value!r}."
raise ValueError(msg) from None
except Exception as e: # noqa: BLE001
msg = f"Could not instantiate parallel backend {kind.value}."
msg = f"Could not instantiate parallel backend {kind.value!r}."
raise ValueError(msg) from e


registry = ParallelBackendRegistry()


registry.register_parallel_backend(ParallelBackend.DASK, _get_dask_executor)
registry.register_parallel_backend(ParallelBackend.LOKY, _get_loky_executor)
registry.register_parallel_backend(
ParallelBackend.PROCESSES,
lambda n_workers: _CloudpickleProcessPoolExecutor(max_workers=n_workers),
)
registry.register_parallel_backend(
ParallelBackend.THREADS, lambda n_workers: ThreadPoolExecutor(max_workers=n_workers)
)
registry.register_parallel_backend(
ParallelBackend.LOKY, lambda n_workers: get_reusable_executor(max_workers=n_workers)
ParallelBackend.PROCESSES, _get_process_pool_executor
)
registry.register_parallel_backend(ParallelBackend.THREADS, _get_thread_pool_executor)
25 changes: 14 additions & 11 deletions src/pytask_parallel/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
from pytask import hookimpl

from pytask_parallel import custom
from pytask_parallel import dask
from pytask_parallel import execute
from pytask_parallel import logging
from pytask_parallel import processes
from pytask_parallel import threads
from pytask_parallel.backends import ParallelBackend
Expand Down Expand Up @@ -37,19 +39,20 @@ def pytask_parse_config(config: dict[str, Any]) -> None:
@hookimpl(trylast=True)
def pytask_post_parse(config: dict[str, Any]) -> None:
"""Register the parallel backend if debugging is not enabled."""
# Deactivate parallel execution if debugger, trace or dry-run is used.
if config["pdb"] or config["trace"] or config["dry_run"]:
config["n_workers"] = 1
return

# Register parallel execute hook.
if config["n_workers"] > 1 or config["parallel_backend"] == ParallelBackend.CUSTOM:
config["pm"].register(execute)
# Register parallel execute and logging hook.
config["pm"].register(logging)
config["pm"].register(execute)

# Register parallel backends.
if config["n_workers"] > 1:
if config["parallel_backend"] == ParallelBackend.THREADS:
config["pm"].register(threads)
else:
config["pm"].register(processes)

if config["parallel_backend"] == ParallelBackend.CUSTOM:
if config["parallel_backend"] == ParallelBackend.THREADS:
config["pm"].register(threads)
elif config["parallel_backend"] == ParallelBackend.DASK:
config["pm"].register(dask)
elif config["parallel_backend"] == ParallelBackend.CUSTOM:
config["pm"].register(custom)
else:
config["pm"].register(processes)
Loading