diff --git a/.gitignore b/.gitignore index 01b8ee7e..d23644a4 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ tests/test_jupyter/*.txt .pytest_cache .ruff_cache .venv +docs/jupyter_execute diff --git a/docs/source/changes.md b/docs/source/changes.md index fc439ba7..b29d9f55 100644 --- a/docs/source/changes.md +++ b/docs/source/changes.md @@ -47,6 +47,7 @@ releases are available on [PyPI](https://pypi.org/project/pytask) and - {pull}`603` fixes an example in the documentation about capturing warnings. - {pull}`604` fixes some examples with `PythonNode`s in the documentation. - {pull}`605` improves checks and CI. +- {pull}`606` improves the documentation for data catalogs. - {pull}`609` allows a pending status for tasks. Useful for async backends implemented in pytask-parallel. - {pull}`611` removes the initial task execution status from diff --git a/docs/source/conf.py b/docs/source/conf.py index 84b0d6be..22848175 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -51,8 +51,7 @@ "sphinx_copybutton", "sphinx_click", "sphinx_toolbox.more_autodoc.autoprotocol", - "nbsphinx", - "myst_parser", + "myst_nb", "sphinx_design", ] diff --git a/docs/source/how_to_guides/bp_scaling_tasks.md b/docs/source/how_to_guides/bp_scaling_tasks.md index 4342d6ef..fa7cb5e9 100644 --- a/docs/source/how_to_guides/bp_scaling_tasks.md +++ b/docs/source/how_to_guides/bp_scaling_tasks.md @@ -39,9 +39,6 @@ my_project │ ├────config.py │ └────task_estimate_models.py │ -│ -├───setup.py -│ ├───.pytask │ └────... │ diff --git a/docs/source/how_to_guides/the_data_catalog.md b/docs/source/how_to_guides/the_data_catalog.md index 6d8b825b..74e3db7c 100644 --- a/docs/source/how_to_guides/the_data_catalog.md +++ b/docs/source/how_to_guides/the_data_catalog.md @@ -1,9 +1,8 @@ # The `DataCatalog` - Revisited -An introduction to the data catalog can be found in the -[tutorial](../tutorials/using_a_data_catalog.md). - -This guide explains some details that were left out of the tutorial. 
+This guide explains more details about the {class}`~pytask.DataCatalog` that were left +out of the [tutorial](../tutorials/using_a_data_catalog.md). Please, read the tutorial +for a basic understanding. ## Changing the default node @@ -15,23 +14,24 @@ For example, use the {class}`~pytask.PythonNode` as the default. ```python from pytask import PythonNode +from pytask import DataCatalog data_catalog = DataCatalog(default_node=PythonNode) ``` -Or, learn to write your own node by reading {doc}`writing_custom_nodes`. +Or, learn to write your node by reading {doc}`writing_custom_nodes`. -Here, is an example for a `PickleNode` that uses cloudpickle instead of the normal -`pickle` module. +Here, is an example for a {class}`~pytask.PickleNode` that uses cloudpickle instead of +the normal {mod}`pickle` module. ```{literalinclude} ../../../docs_src/how_to_guides/the_data_catalog.py ``` ## Changing the name and the default path -By default, the data catalogs store their data in a directory `.pytask/data_catalogs`. -If you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the +By default, data catalogs store their data in a directory `.pytask/data_catalogs`. If +you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the `.pytask` folder is in the same folder as the configuration file. The default name for a catalog is `"default"` and so you will find its data in @@ -39,15 +39,23 @@ The default name for a catalog is `"default"` and so you will find its data in `"data_management"`, you will find the data in `.pytask/data_catalogs/data_management`. ```python +from pytask import DataCatalog + + data_catalog = DataCatalog(name="data_management") ``` +```{note} +The name of a data catalog is restricted to letters, numbers, hyphens and underscores. +``` + You can also change the path where the data catalogs will be stored by changing the `path` attribute. 
Here, we store the data catalog's data next to the module where the data catalog is defined in `.data`. ```python from pathlib import Path +from pytask import DataCatalog data_catalog = DataCatalog(path=Path(__file__).parent / ".data") @@ -55,14 +63,15 @@ data_catalog = DataCatalog(path=Path(__file__).parent / ".data") ## Multiple data catalogs -You can use multiple data catalogs when you want to separate your datasets across -multiple catalogs or when you want to use the same names multiple times (although it is -not recommended!). +You can use multiple data catalogs when you want to separate your datasets or to avoid +name collisions of data catalog entries. Make sure you assign different names to the data catalogs so that their data is stored in different directories. ```python +from pytask import DataCatalog + # Stored in .pytask/data_catalog/a data_catalog_a = DataCatalog(name="a") @@ -71,3 +80,53 @@ data_catalog_b = DataCatalog(name="b") ``` Or, use different paths as explained above. + +## Nested data catalogs + +Name collisions can also occur when you are using multiple levels of repetitions, for +example, when you are fitting multiple models to multiple data sets. + +You can structure your data catalogs like this. + +```python +from pytask import DataCatalog + + +MODEL_NAMES = ("ols", "logistic_regression") +DATA_NAMES = ("data_1", "data_2") + + +nested_data_catalogs = { + model_name: { + data_name: DataCatalog(name=f"{model_name}-{data_name}") + for data_name in DATA_NAMES + } + for model_name in MODEL_NAMES +} +``` + +The task could look like this. 
+
+```python
+from pathlib import Path
+from typing import Any
+from pytask import task
+from typing_extensions import Annotated
+
+from my_project.config import DATA_NAMES
+from my_project.config import MODEL_NAMES
+from my_project.config import nested_data_catalogs
+
+for model_name in MODEL_NAMES:
+    for data_name in DATA_NAMES:
+
+        @task
+        def fit_model(
+            path: Path = Path("...", data_name)
+        ) -> Annotated[
+            Any, nested_data_catalogs[model_name][data_name]["fitted_model"]
+        ]:
+            data = ...
+            fitted_model = ...
+            return fitted_model
+```
diff --git a/docs/source/reference_guides/api.md b/docs/source/reference_guides/api.md
index 4ec74a8e..56204c28 100644
--- a/docs/source/reference_guides/api.md
+++ b/docs/source/reference_guides/api.md
@@ -228,7 +228,9 @@ Task are currently represented by the following classes:
 
 ```{eval-rst}
 .. autoclass:: pytask.Task
+   :members:
 .. autoclass:: pytask.TaskWithoutPath
+   :members:
 ```
 
 Currently, there are no different types of tasks since changing the `.function`
@@ -325,6 +327,9 @@ resolution and execution.
 
    An indicator to mark arguments of tasks as products.
 
+   >>> from pathlib import Path
+   >>> from pytask import Product
+   >>> from typing_extensions import Annotated
    >>> def task_example(path: Annotated[Path, Product]) -> None:
    ...     path.write_text("Hello, World!")
 
diff --git a/docs/source/tutorials/using_a_data_catalog.md b/docs/source/tutorials/using_a_data_catalog.md
index d704421d..d24d64d0 100644
--- a/docs/source/tutorials/using_a_data_catalog.md
+++ b/docs/source/tutorials/using_a_data_catalog.md
@@ -10,14 +10,14 @@ Two things will quickly become a nuisance in bigger projects.
    they are just intermediate representations.
 
 As a solution, pytask offers a {class}`~pytask.DataCatalog` which is a purely optional
-feature. The tutorial focuses on the main features. To learn about all features, read
-the [how-to guide](../how_to_guides/the_data_catalog.md).
+feature. The tutorial focuses on the main features. To learn about all the features,
+read the [how-to guide](../how_to_guides/the_data_catalog.md).
 
 Let us focus on the previous example and see how the {class}`~pytask.DataCatalog` helps
 us.
 
-The project structure is the same as in the previous example with the exception of the
-`.pytask` folder and the missing `data.pkl` in `bld`.
+The project structure is the same as in the previous example, except for the `.pytask`
+folder and the missing `data.pkl` in `bld`.
 
 ```text
 my_project
@@ -44,15 +44,51 @@ At first, we define the data catalog in `config.py`.
 
 ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_1.py
 ```
 
-## `task_data_preparation`
+## `task_create_random_data`
 
-Next, we will use the data catalog to save the product of the task in
-`task_data_preparation.py`.
+Next, we look at the module `task_data_preparation.py` and its task
+`task_create_random_data`. The task creates a dataframe with simulated data that should
+be stored on the disk.
 
-Instead of using a path, we set the location of the product in the data catalog with
-`data_catalog["data"]`. If the key does not exist, the data catalog will automatically
-create a {class}`~pytask.PickleNode` that allows you to save any Python object to a
-`pickle` file. The `pickle` file is stored within the `.pytask` folder.
+In the previous tutorial, we learned to use {class}`~pathlib.Path`s to define products
+of our tasks. Here we see again the signature of the task function.
+ +`````{tab-set} + +````{tab-item} Python 3.10+ +:sync: python310plus + +```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_py310.py +:lines: 10-12 +``` +```` + +````{tab-item} Python 3.8+ +:sync: python38plus + +```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_py38.py +:lines: 10-12 +``` +```` + +````{tab-item} produces +:sync: produces + +```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_produces.py +:lines: 8 +``` +```` +````` + +When we want to use the data catalog, we replace `BLD / "data.pkl"` with an entry of the +data catalog like `data_catalog["data"]`. If there is yet no entry with the name +`"data"`, the data catalog will automatically create a {class}`~pytask.PickleNode`. The +node allows you to save any Python object to a `pickle` file. + +You probably noticed that we did not need to define a path. That is because the data +catalog takes care of that and stores the `pickle` file in the `.pytask` folder. + +Using `data_catalog["data"]` is thus equivalent to using `PickleNode(path=Path(...))`. The following tabs show you how to use the data catalog given the interface you prefer. @@ -125,10 +161,6 @@ Following one of the interfaces gives you immediate access to the ````{tab-item} Python 3.10+ :sync: python310plus -Use `data_catalog["data"]` as an default argument to access the -{class}`~pytask.PickleNode` within the task. When you are done transforming your -{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. - ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_3_py310.py :emphasize-lines: 12 ``` @@ -138,10 +170,6 @@ Use `data_catalog["data"]` as an default argument to access the ````{tab-item} Python 3.8+ :sync: python38plus -Use `data_catalog["data"]` as an default argument to access the -{class}`~pytask.PickleNode` within the task. 
When you are done transforming your -{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. - ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_3_py38.py :emphasize-lines: 12 ``` @@ -160,7 +188,8 @@ In most projects, you have other data sets that you would like to access via the catalog. To add them, call the {meth}`~pytask.DataCatalog.add` method and supply a name and a path. -Let's add `file.csv` to the data catalog. +Let's add `file.csv` with the name `"csv"` to the data catalog and use it to create +`data["transformed_csv"]`. ```text my_project @@ -174,8 +203,6 @@ my_project │ ├────task_data_preparation.py │ └────task_plot_data.py │ -├───setup.py -│ ├───.pytask │ └────... │ @@ -184,13 +211,24 @@ my_project └────plot.png ``` -The path can be absolute or relative to the module of the data catalog. +We can use a relative or an absolute path to define the location of the file. A relative +path means the location is relative to the module of the data catalog. ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_4.py ``` -You can now use the data catalog as in previous example and use the -{class}`~~pathlib.Path` in the task. +You can now use the data catalog as in the previous example and use the +{class}`~pathlib.Path` in the task. + +```{note} +Note that the value of `data_catalog["csv"]` inside the task becomes a +{class}`~pathlib.Path`. It is because a {class}`~pathlib.Path` in +{meth}`~pytask.DataCatalog.add` is not parsed to a {class}`~pytask.PickleNode` but a +{class}`~pytask.PathNode`. + +Read {doc}`../how_to_guides/writing_custom_nodes` for more information about +different node types which is not relevant now. +``` `````{tab-set} @@ -224,9 +262,14 @@ You can now use the data catalog as in previous example and use the ## Developing with the `DataCatalog` -You can also use the data catalog in a Jupyter notebook or in the terminal in the Python -interpreter. 
Simply import the data catalog, select a node and call the -{meth}`~pytask.PNode.load` method of a node to access its value. +You can also use the data catalog in a Jupyter Notebook or the terminal in the Python +interpreter. This can be super helpful when you develop tasks interactively in a Jupyter +Notebook. + +Simply import the data catalog, select a node and call the {meth}`~pytask.PNode.load` +method of a node to access its value. + +Here is an example with a terminal. ```pycon >>> from myproject.config import data_catalog diff --git a/docs_src/tutorials/using_a_data_catalog_4.py b/docs_src/tutorials/using_a_data_catalog_4.py index 125bbdf5..fd49e88d 100644 --- a/docs_src/tutorials/using_a_data_catalog_4.py +++ b/docs_src/tutorials/using_a_data_catalog_4.py @@ -10,4 +10,3 @@ # Use either a relative or a absolute path. data_catalog.add("csv", Path("file.csv")) -data_catalog.add("transformed_csv", BLD / "file.pkl") diff --git a/pyproject.toml b/pyproject.toml index f864f715..8a11a25c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ docs = [ "ipython", "matplotlib", "myst-parser", - "nbsphinx", + "myst-nb", "sphinx", "sphinx-click", "sphinx-copybutton", @@ -92,6 +92,10 @@ build-backend = "hatchling.build" managed = true dev-dependencies = ["tox-uv>=1.7.0"] +[tool.rye.scripts] +clean-docs = { cmd = "rm -rf docs/build" } +build-docs = { cmd = "sphinx-build -b html docs/source docs/build" } + [tool.hatch.build.hooks.vcs] version-file = "src/_pytask/_version.py" diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index d2993e60..8a9a08cd 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -48,12 +48,10 @@ class DataCatalog: A default node for loading and saving values. By default, :class:`~pytask.PickleNode` is used to serialize any Python object with the :mod:`pickle` module. - entries - A collection of entries in the catalog. 
Entries can be :class:`~pytask.PNode` or - a :class:`DataCatalog` itself for nesting catalogs. name - The name of the data catalog. Use it when you are working with multiple data - catalogs that store data under the same keys. + The name of the data catalog which can only contain letters, numbers, hyphens + and underscores. Use it when you are working with multiple data catalogs to + store data in different locations. path A path where automatically created files are stored. By default, it will be ``.pytask/data_catalogs/default``. @@ -61,13 +59,13 @@ class DataCatalog: """ default_node: type[PNode] = PickleNode - entries: dict[str, PNode | PProvisionalNode] = field(factory=dict) name: str = field(default="default") path: Path | None = None + _entries: dict[str, PNode | PProvisionalNode] = field(factory=dict) + _instance_path: Path = field(factory=_get_parent_path_of_data_catalog_module) _session_config: dict[str, Any] = field( factory=lambda *x: {"check_casing_of_paths": True} # noqa: ARG005 ) - _instance_path: Path = field(factory=_get_parent_path_of_data_catalog_module) @name.validator def _check(self, attribute: str, value: str) -> None: # noqa: ARG002 @@ -91,24 +89,19 @@ def __attrs_post_init__(self) -> None: self.path.mkdir(parents=True, exist_ok=True) - self._initialize() - - def _initialize(self) -> None: - """Initialize the data catalog with persisted nodes from previous runs.""" - for path in self.path.glob("*-node.pkl"): # type: ignore[union-attr] + # Initialize the data catalog with persisted nodes from previous runs. 
+        for path in self.path.glob("*-node.pkl"):
             node = pickle.loads(path.read_bytes())  # noqa: S301
-            self.entries[node.name] = node
+            self._entries[node.name] = node
 
     def __getitem__(self, name: str) -> PNode | PProvisionalNode:
         """Allow to access entries with the squared brackets syntax."""
-        if name not in self.entries:
+        if name not in self._entries:
             self.add(name)
-        return self.entries[name]
+        return self._entries[name]
 
-    def add(self, name: str, node: PNode | PProvisionalNode | None = None) -> None:
+    def add(self, name: str, node: PNode | PProvisionalNode | Any = None) -> None:
         """Add an entry to the data catalog."""
-        assert isinstance(self.path, Path)
-
         if not isinstance(name, str):
             msg = "The name of a catalog entry must be a string."
             raise TypeError(msg)
@@ -116,16 +109,16 @@ def add(self, name: str, node: PNode | PProvisionalNode | None = None) -> None:
         if node is None:
             filename = hashlib.sha256(name.encode()).hexdigest()
             if isinstance(self.default_node, PPathNode):
-                self.entries[name] = self.default_node(
+                self._entries[name] = self.default_node(
                     name=name, path=self.path / f"{filename}.pkl"
                 )
             else:
-                self.entries[name] = self.default_node(name=name)  # type: ignore[call-arg]
-            self.path.joinpath(f"{filename}-node.pkl").write_bytes(
-                pickle.dumps(self.entries[name])
+                self._entries[name] = self.default_node(name=name)  # type: ignore[call-arg]
+            self.path.joinpath(f"{filename}-node.pkl").write_bytes(  # type: ignore[union-attr]
+                pickle.dumps(self._entries[name])
             )
         elif isinstance(node, (PNode, PProvisionalNode)):
-            self.entries[name] = node
+            self._entries[name] = node
         else:
             # Acquire the latest pluginmanager.
             session = Session(config=self._session_config, hook=storage.get().hook)
@@ -139,4 +132,4 @@ def add(self, name: str, node: PNode | PProvisionalNode | None = None) -> None:
         if collected_node is None:  # pragma: no cover
             msg = f"{node!r} cannot be parsed."
             raise NodeNotCollectedError(msg)
-        self.entries[name] = collected_node
+        self._entries[name] = collected_node
diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py
index 24b4d207..4c678d9d 100644
--- a/src/_pytask/nodes.py
+++ b/src/_pytask/nodes.py
@@ -226,6 +226,7 @@ class PythonNode(PNode):
     own hashing function. For example, from the :mod:`deepdiff` library.
 
     >>> from deepdiff import DeepHash
+    >>> from pytask import PythonNode
     >>> node = PythonNode(name="node", value={"a": 1}, hash=lambda x: DeepHash(x)[x])
 
     .. warning:: Hashing big objects can require some time.
diff --git a/src/_pytask/outcomes.py b/src/_pytask/outcomes.py
index 524f0975..f39151b6 100644
--- a/src/_pytask/outcomes.py
+++ b/src/_pytask/outcomes.py
@@ -174,7 +174,7 @@ def count_outcomes(
 
     Examples
     --------
-    >>> from _pytask.outcomes import CollectionOutcome, TaskOutcome
+    >>> from _pytask.outcomes import CollectionOutcome, TaskOutcome, count_outcomes
     >>> count_outcomes([], CollectionOutcome)
     {<CollectionOutcome.SUCCESS: 1>: 0, <CollectionOutcome.FAIL: 2>: 0}
 
diff --git a/tox.ini b/tox.ini
index 295636f5..22051f4f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,7 +9,7 @@ package = editable
 [testenv:docs]
 extras = docs, test
 commands =
-    - sphinx-build -n -T -b html -d {envtmpdir}/doctrees docs/source docs/build/html
+    sphinx-build -n -T -b html -d {envtmpdir}/doctrees docs/source docs/build/html
     - sphinx-build -n -T -b doctest -d {envtmpdir}/doctrees docs/source docs/build/html
 
 [testenv:typing]