diff --git a/docs/source/changes.md b/docs/source/changes.md index 6b13c8ec..375d7a6e 100644 --- a/docs/source/changes.md +++ b/docs/source/changes.md @@ -5,9 +5,9 @@ chronological order. Releases follow [semantic versioning](https://semver.org/) releases are available on [PyPI](https://pypi.org/project/pytask) and [Anaconda.org](https://anaconda.org/conda-forge/pytask). -## 0.5.1 - 2024-xx-xx +## 0.5.1 - 2024-07-19 -- {pull}`616` redesigns the guide on "Scaling Tasks". +- {pull}`616` and {pull}`632` redesign the guide on "Scaling Tasks". - {pull}`617` fixes an interaction with provisional nodes and `@mark.persist`. - {pull}`618` ensures that `root_dir` of `DirectoryNode` is created before the task is executed. diff --git a/docs/source/how_to_guides/bp_complex_task_repetitions.md b/docs/source/how_to_guides/bp_complex_task_repetitions.md index 68e44569..46c0ff3b 100644 --- a/docs/source/how_to_guides/bp_complex_task_repetitions.md +++ b/docs/source/how_to_guides/bp_complex_task_repetitions.md @@ -32,16 +32,23 @@ are growing over time and you run into these problems. ## Solution The main idea for the solution is quickly explained. We will, first, formalize -dimensions into objects and, secondly, combine them in one object such that we only have -to iterate over instances of this object in a single loop. - -We will start by defining the dimensions using {class}`~typing.NamedTuple` or +dimensions into objects using {class}`~typing.NamedTuple` or {func}`~dataclasses.dataclass`. -Then, we will define the object that holds both pieces of information together and for -the lack of a better name, we will call it an experiment. +Secondly, we will combine dimensions in multi-dimensional objects such that we only have +to iterate over instances of this object in a single loop. Here and for the lack of a +better name, we will call the object an experiment. + +Lastly, we will also use the {class}`~pytask.DataCatalog` to not be bothered with +defining paths. 
-```{literalinclude} ../../../docs_src/how_to_guides/bp_complex_task_repetitions/experiment.py +```{seealso} +If you have not learned about the {class}`~pytask.DataCatalog` yet, start with the +{doc}`tutorial <../tutorials/using_a_data_catalog>` and continue with the +{doc}`how-to guide <the_data_catalog>`. +``` + +```{literalinclude} ../../../docs_src/how_to_guides/bp_complex_task_repetitions/config.py --- caption: config.py --- @@ -49,10 +56,11 @@ There are some things to be said. -- The names on each dimension need to be unique and ensure that by combining them for - the name of the experiment, we get a unique and descriptive id. +- The `.name` attributes on each dimension need to return unique names and to ensure + that by combining them for the name of the experiment, we get a unique and descriptive + id. +- Dimensions might need more attributes than just a name, like paths, keys for the data + catalog, or other arguments for the task. Next, we will use these newly defined data structures and see how our tasks change when we use them. @@ -63,21 +71,55 @@ caption: task_example.py --- ``` -As you see, we replaced +As you see, we lost a level of indentation and we moved all the generation of names and +paths to the dimensions and multi-dimensional objects. -## Using the `DataCatalog` +## Adding another level -## Adding another dimension +Extending a dimension by another level is usually quickly done. For example, if we have +another model that we want to fit to the data, we extend `MODELS` which will +automatically lead to all downstream tasks being created. -## Adding another level +```{code-block} python +--- +caption: config.py +--- +... +MODELS = [Model("ols"), Model("logit"), Model("linear_prob"), Model("new_model")] +... 
+``` + +Of course, you might need to alter `task_fit_model` because the task needs to handle the +new model as well as the others. Here is where it pays off if you are using high-level +interfaces in your code that handle all of the models with a simple +`fitted_model = fit_model(data=data, model_name=model_name)` call and also return fitted +models that are similar objects. ## Executing a subset -## Grouping and aggregating +What if you want to execute a subset of tasks, for example, all tasks related to a model +or a dataset? + +When you are using the `.name` attributes of the dimensions and multi-dimensional +objects like in the example above, you ensure that the names of dimensions are included +in all downstream tasks. + +Thus, you can simply call pytask with the following expression to execute all tasks +related to the logit model. + +```console +pytask -k logit +``` + +```{seealso} +Expressions and markers for selecting tasks are explained in +{doc}`../tutorials/selecting_tasks`. +``` ## Extending repetitions -Some parametrized tasks are costly to run - costly in terms of computing power, memory, -or time. Users often extend repetitions triggering all repetitions to be rerun. Thus, -use the {func}`@pytask.mark.persist <pytask.mark.persist>` decorator, which is explained -in more detail in this {doc}`tutorial <../tutorials/making_tasks_persist>`. +Some repeated tasks are costly to run - costly in terms of computing power, memory, or +runtime. If you change a task module, you might accidentally trigger all other tasks in +the module to be rerun. Use the {func}`@pytask.mark.persist <pytask.mark.persist>` +decorator, which is explained in more detail in this +{doc}`tutorial <../tutorials/making_tasks_persist>`. 
diff --git a/docs_src/how_to_guides/bp_complex_task_repetitions/config.py b/docs_src/how_to_guides/bp_complex_task_repetitions/config.py new file mode 100644 index 00000000..f22041ff --- /dev/null +++ b/docs_src/how_to_guides/bp_complex_task_repetitions/config.py @@ -0,0 +1,41 @@ +from pathlib import Path +from typing import NamedTuple + +from pytask import DataCatalog + +SRC = Path(__file__).parent +BLD = SRC / "bld" + +data_catalog = DataCatalog() + + +class Dataset(NamedTuple): + name: str + + @property + def path(self) -> Path: + return SRC / f"{self.name}.pkl" + + +class Model(NamedTuple): + name: str + + +DATASETS = [Dataset("a"), Dataset("b"), Dataset("c")] +MODELS = [Model("ols"), Model("logit"), Model("linear_prob")] + + +class Experiment(NamedTuple): + dataset: Dataset + model: Model + + @property + def name(self) -> str: + return f"{self.model.name}-{self.dataset.name}" + + @property + def fitted_model_name(self) -> str: + return f"{self.name}-fitted-model" + + +EXPERIMENTS = [Experiment(dataset, model) for dataset in DATASETS for model in MODELS] diff --git a/docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py b/docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py index 741d2c19..930b9658 100644 --- a/docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py +++ b/docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py @@ -1,8 +1,8 @@ -from pathlib import Path from typing import Annotated +from typing import Any from myproject.config import EXPERIMENTS -from pytask import Product +from myproject.config import data_catalog from pytask import task for experiment in EXPERIMENTS: @@ -10,5 +10,4 @@ @task(id=experiment.name) def task_fit_model( path_to_data: experiment.dataset.path, - path_to_model: Annotated[Path, Product] = experiment.path, - ) -> None: ... + ) -> Annotated[Any, data_catalog[experiment.fitted_model_name]]: ... 
diff --git a/pyproject.toml b/pyproject.toml index 3104f2a7..025f574c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ test = [ "aiohttp", # For HTTPPath tests. "coiled", ] -typing = ["mypy>=1.9.0", "nbqa[mypy]>=1.8.5"] +typing = ["mypy>=1.9.0,<1.11", "nbqa[mypy]>=1.8.5"] [project.urls] Changelog = "https://pytask-dev.readthedocs.io/en/stable/changes.html" @@ -186,6 +186,7 @@ disallow_untyped_defs = true no_implicit_optional = true warn_redundant_casts = true warn_unused_ignores = true +disable_error_code = ["import-untyped"] [[tool.mypy.overrides]] module = "tests.*"