From 1d966b40dca71bf1e97212e241769e80df487687 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Wed, 24 Apr 2024 23:19:18 +0200 Subject: [PATCH 1/9] Improve documentation on data catalogs. --- .gitignore | 1 + docs/source/conf.py | 3 +- docs/source/how_to_guides/bp_scaling_tasks.md | 3 - docs/source/how_to_guides/the_data_catalog.md | 17 ++-- docs/source/tutorials/using_a_data_catalog.md | 99 +++++++++++++------ docs_src/tutorials/using_a_data_catalog_4.py | 1 - pyproject.toml | 6 +- src/_pytask/data_catalog.py | 6 +- 8 files changed, 88 insertions(+), 48 deletions(-) diff --git a/.gitignore b/.gitignore index 01b8ee7e..d23644a4 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ tests/test_jupyter/*.txt .pytest_cache .ruff_cache .venv +docs/jupyter_execute diff --git a/docs/source/conf.py b/docs/source/conf.py index 84b0d6be..22848175 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -51,8 +51,7 @@ "sphinx_copybutton", "sphinx_click", "sphinx_toolbox.more_autodoc.autoprotocol", - "nbsphinx", - "myst_parser", + "myst_nb", "sphinx_design", ] diff --git a/docs/source/how_to_guides/bp_scaling_tasks.md b/docs/source/how_to_guides/bp_scaling_tasks.md index 4342d6ef..fa7cb5e9 100644 --- a/docs/source/how_to_guides/bp_scaling_tasks.md +++ b/docs/source/how_to_guides/bp_scaling_tasks.md @@ -39,9 +39,6 @@ my_project │ ├────config.py │ └────task_estimate_models.py │ -│ -├───setup.py -│ ├───.pytask │ └────... │ diff --git a/docs/source/how_to_guides/the_data_catalog.md b/docs/source/how_to_guides/the_data_catalog.md index 6d8b825b..f7f592ca 100644 --- a/docs/source/how_to_guides/the_data_catalog.md +++ b/docs/source/how_to_guides/the_data_catalog.md @@ -1,9 +1,8 @@ # The `DataCatalog` - Revisited -An introduction to the data catalog can be found in the -[tutorial](../tutorials/using_a_data_catalog.md). - -This guide explains some details that were left out of the tutorial. 
+This guide explains more details about the {class}`~pytask.DataCatalog` that were left +out of the [tutorial](../tutorials/using_a_data_catalog.md). Please read the tutorial +for a basic understanding. ## Changing the default node @@ -20,18 +19,18 @@ from pytask import PythonNode data_catalog = DataCatalog(default_node=PythonNode) ``` -Or, learn to write your own node by reading {doc}`writing_custom_nodes`. +Or, learn to write your node by reading {doc}`writing_custom_nodes`. -Here, is an example for a `PickleNode` that uses cloudpickle instead of the normal -`pickle` module. +Here is an example of a {class}`~pytask.PickleNode` that uses cloudpickle instead of +the normal {mod}`pickle` module. ```{literalinclude} ../../../docs_src/how_to_guides/the_data_catalog.py ``` ## Changing the name and the default path -By default, the data catalogs store their data in a directory `.pytask/data_catalogs`. -If you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the +By default, data catalogs store their data in a directory `.pytask/data_catalogs`. If +you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the `.pytask` folder is in the same folder as the configuration file. The default name for a catalog is `"default"` and so you will find its data in diff --git a/docs/source/tutorials/using_a_data_catalog.md b/docs/source/tutorials/using_a_data_catalog.md index d704421d..d24d64d0 100644 --- a/docs/source/tutorials/using_a_data_catalog.md +++ b/docs/source/tutorials/using_a_data_catalog.md @@ -10,14 +10,14 @@ Two things will quickly become a nuisance in bigger projects. they are just intermediate representations. As a solution, pytask offers a {class}`~pytask.DataCatalog` which is a purely optional -feature. The tutorial focuses on the main features. To learn about all features, read -the [how-to guide](../how_to_guides/the_data_catalog.md). +feature. The tutorial focuses on the main features. 
To learn about all the features, +read the [how-to guide](../how_to_guides/the_data_catalog.md). Let us focus on the previous example and see how the {class}`~pytask.DataCatalog` helps us. -The project structure is the same as in the previous example with the exception of the -`.pytask` folder and the missing `data.pkl` in `bld`. +The project structure is the same as in the previous example except for the `.pytask` +folder and the missing `data.pkl` in `bld`. ```text my_project @@ -44,15 +44,51 @@ At first, we define the data catalog in `config.py`. ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_1.py ``` -## `task_data_preparation` +## `task_create_random_data` -Next, we will use the data catalog to save the product of the task in -`task_data_preparation.py`. +Next, we look at the module `task_data_preparation.py` and its task +`task_create_random_data`. The task creates a dataframe with simulated data that should +be stored on the disk. -Instead of using a path, we set the location of the product in the data catalog with -`data_catalog["data"]`. If the key does not exist, the data catalog will automatically -create a {class}`~pytask.PickleNode` that allows you to save any Python object to a -`pickle` file. The `pickle` file is stored within the `.pytask` folder. +In the previous tutorial, we learned to use {class}`~pathlib.Path`s to define products +of our tasks. Here we see again the signature of the task function. 
+ +`````{tab-set} + +````{tab-item} Python 3.10+ +:sync: python310plus + +```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_py310.py +:lines: 10-12 +``` +```` + +````{tab-item} Python 3.8+ +:sync: python38plus + +```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_py38.py +:lines: 10-12 +``` +```` + +````{tab-item} produces +:sync: produces + +```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_products_produces.py +:lines: 8 +``` +```` +````` + +When we want to use the data catalog, we replace `BLD / "data.pkl"` with an entry of the +data catalog like `data_catalog["data"]`. If there is yet no entry with the name +`"data"`, the data catalog will automatically create a {class}`~pytask.PickleNode`. The +node allows you to save any Python object to a `pickle` file. + +You probably noticed that we did not need to define a path. That is because the data +catalog takes care of that and stores the `pickle` file in the `.pytask` folder. + +Using `data_catalog["data"]` is thus equivalent to using `PickleNode(path=Path(...))`. The following tabs show you how to use the data catalog given the interface you prefer. @@ -125,10 +161,6 @@ Following one of the interfaces gives you immediate access to the ````{tab-item} Python 3.10+ :sync: python310plus -Use `data_catalog["data"]` as an default argument to access the -{class}`~pytask.PickleNode` within the task. When you are done transforming your -{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. - ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_3_py310.py :emphasize-lines: 12 ``` @@ -138,10 +170,6 @@ Use `data_catalog["data"]` as an default argument to access the ````{tab-item} Python 3.8+ :sync: python38plus -Use `data_catalog["data"]` as an default argument to access the -{class}`~pytask.PickleNode` within the task. 
When you are done transforming your -{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. - ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_3_py38.py :emphasize-lines: 12 ``` @@ -160,7 +188,8 @@ In most projects, you have other data sets that you would like to access via the catalog. To add them, call the {meth}`~pytask.DataCatalog.add` method and supply a name and a path. -Let's add `file.csv` to the data catalog. +Let's add `file.csv` with the name `"csv"` to the data catalog and use it to create +`data_catalog["transformed_csv"]`. ```text my_project @@ -174,8 +203,6 @@ my_project │ ├────task_data_preparation.py │ └────task_plot_data.py │ -├───setup.py -│ ├───.pytask │ └────... │ @@ -184,13 +211,24 @@ my_project └────plot.png ``` -The path can be absolute or relative to the module of the data catalog. +We can use a relative or an absolute path to define the location of the file. A relative +path means the location is relative to the module of the data catalog. ```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_4.py ``` -You can now use the data catalog as in previous example and use the -{class}`~~pathlib.Path` in the task. +You can now use the data catalog as in the previous example and use the +{class}`~pathlib.Path` in the task. + +```{note} +Note that the value of `data_catalog["csv"]` inside the task becomes a +{class}`~pathlib.Path`. This is because a {class}`~pathlib.Path` in +{meth}`~pytask.DataCatalog.add` is not parsed to a {class}`~pytask.PickleNode` but a +{class}`~pytask.PathNode`. + +Read {doc}`../how_to_guides/writing_custom_nodes` for more information about +different node types, which you do not need to know for now. +``` `````{tab-set} @@ -224,9 +262,14 @@ You can now use the data catalog as in previous example and use the ## Developing with the `DataCatalog` -You can also use the data catalog in a Jupyter notebook or in the terminal in the Python -interpreter. 
Simply import the data catalog, select a node and call the -{meth}`~pytask.PNode.load` method of a node to access its value. +You can also use the data catalog in a Jupyter Notebook or the terminal in the Python +interpreter. This can be super helpful when you develop tasks interactively in a Jupyter +Notebook. + +Simply import the data catalog, select a node and call the {meth}`~pytask.PNode.load` +method of a node to access its value. + +Here is an example with a terminal. ```pycon >>> from myproject.config import data_catalog diff --git a/docs_src/tutorials/using_a_data_catalog_4.py b/docs_src/tutorials/using_a_data_catalog_4.py index 125bbdf5..fd49e88d 100644 --- a/docs_src/tutorials/using_a_data_catalog_4.py +++ b/docs_src/tutorials/using_a_data_catalog_4.py @@ -10,4 +10,3 @@ # Use either a relative or a absolute path. data_catalog.add("csv", Path("file.csv")) -data_catalog.add("transformed_csv", BLD / "file.pkl") diff --git a/pyproject.toml b/pyproject.toml index f864f715..8a11a25c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ docs = [ "ipython", "matplotlib", "myst-parser", - "nbsphinx", + "myst-nb", "sphinx", "sphinx-click", "sphinx-copybutton", @@ -92,6 +92,10 @@ build-backend = "hatchling.build" managed = true dev-dependencies = ["tox-uv>=1.7.0"] +[tool.rye.scripts] +clean-docs = { cmd = "rm -rf docs/build" } +build-docs = { cmd = "sphinx-build -b html docs/source docs/build" } + [tool.hatch.build.hooks.vcs] version-file = "src/_pytask/_version.py" diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index 615fc021..e97f0e55 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -91,10 +91,8 @@ def __getitem__(self, name: str) -> PNode | PProvisionalNode: self.add(name) return self.entries[name] - def add(self, name: str, node: PNode | PProvisionalNode | None = None) -> None: + def add(self, name: str, node: PNode | PProvisionalNode | Any = None) -> None: """Add an entry to the data catalog.""" 
- assert isinstance(self.path, Path) - if not isinstance(name, str): msg = "The name of a catalog entry must be a string." raise TypeError(msg) @@ -107,7 +105,7 @@ def add(self, name: str, node: PNode | PProvisionalNode | None = None) -> None: ) else: self.entries[name] = self.default_node(name=name) # type: ignore[call-arg] - self.path.joinpath(f"{filename}-node.pkl").write_bytes( + self.path.joinpath(f"{filename}-node.pkl").write_bytes( # type: ignore[union-attr] pickle.dumps(self.entries[name]) ) elif isinstance(node, (PNode, PProvisionalNode)): From 883a6e8da4bbf72da6db4ff0639fff91a9a8371f Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Wed, 24 Apr 2024 23:20:02 +0200 Subject: [PATCH 2/9] fix. --- docs/source/changes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/changes.md b/docs/source/changes.md index 216901a7..c2a255fa 100644 --- a/docs/source/changes.md +++ b/docs/source/changes.md @@ -47,6 +47,7 @@ releases are available on [PyPI](https://pypi.org/project/pytask) and - {pull}`603` fixes an example in the documentation about capturing warnings. - {pull}`604` fixes some examples with `PythonNode`s in the documentation. - {pull}`605` improves checks and CI. +- {pull}`606` improves the documentation for data catalogs. ## 0.4.7 - 2024-03-19 From 7f7dd064dc1b13aa64628fef55987cd9c1e0407a Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 4 May 2024 00:33:56 +0200 Subject: [PATCH 3/9] Fix. --- docs/source/reference_guides/api.md | 2 ++ tox.ini | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/reference_guides/api.md b/docs/source/reference_guides/api.md index 4ec74a8e..d06d90b5 100644 --- a/docs/source/reference_guides/api.md +++ b/docs/source/reference_guides/api.md @@ -228,7 +228,9 @@ Task are currently represented by the following classes: ```{eval-rst} .. autoclass:: pytask.Task + :members: .. 
autoclass:: pytask.TaskWithoutPath + :members: ``` Currently, there are no different types of tasks since changing the `.function` diff --git a/tox.ini b/tox.ini index 4e82bf7f..f6f1319f 100644 --- a/tox.ini +++ b/tox.ini @@ -9,7 +9,7 @@ package = editable [testenv:docs] extras = docs, test commands = - - sphinx-build -n -T -b html -d {envtmpdir}/doctrees docs/source docs/build/html + sphinx-build -n -T -b html -d {envtmpdir}/doctrees docs/source docs/build/html - sphinx-build -n -T -b doctest -d {envtmpdir}/doctrees docs/source docs/build/html [testenv:typing] From 2177d0ea626575eec30cba44c10fb1ea38dd958a Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 4 May 2024 00:39:12 +0200 Subject: [PATCH 4/9] Make entries private. --- src/_pytask/data_catalog.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index e97f0e55..c06913c9 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -47,9 +47,6 @@ class DataCatalog: A default node for loading and saving values. By default, :class:`~pytask.PickleNode` is used to serialize any Python object with the :mod:`pickle` module. - entries - A collection of entries in the catalog. Entries can be :class:`~pytask.PNode` or - a :class:`DataCatalog` itself for nesting catalogs. name The name of the data catalog. Use it when you are working with multiple data catalogs that store data under the same keys. 
@@ -60,13 +57,13 @@ class DataCatalog: """ default_node: type[PNode] = PickleNode - entries: dict[str, PNode | PProvisionalNode] = field(factory=dict) name: str = "default" path: Path | None = None + _entries: dict[str, PNode | PProvisionalNode] = field(factory=dict) + _instance_path: Path = field(factory=_get_parent_path_of_data_catalog_module) _session_config: dict[str, Any] = field( factory=lambda *x: {"check_casing_of_paths": True} # noqa: ARG005 ) - _instance_path: Path = field(factory=_get_parent_path_of_data_catalog_module) def __attrs_post_init__(self) -> None: root_path, _ = find_project_root_and_config((self._instance_path,)) @@ -77,19 +74,16 @@ def __attrs_post_init__(self) -> None: self.path.mkdir(parents=True, exist_ok=True) - self._initialize() - - def _initialize(self) -> None: - """Initialize the data catalog with persisted nodes from previous runs.""" + # Initialize the data catalog with persisted nodes from previous runs. for path in self.path.glob("*-node.pkl"): # type: ignore[union-attr] node = pickle.loads(path.read_bytes()) # noqa: S301 - self.entries[node.name] = node + self._entries[node.name] = node def __getitem__(self, name: str) -> PNode | PProvisionalNode: """Allow to access entries with the squared brackets syntax.""" - if name not in self.entries: + if name not in self._entries: self.add(name) - return self.entries[name] + return self._entries[name] def add(self, name: str, node: PNode | PProvisionalNode | Any = None) -> None: """Add an entry to the data catalog.""" @@ -100,16 +94,16 @@ def add(self, name: str, node: PNode | PProvisionalNode | Any = None) -> None: if node is None: filename = hashlib.sha256(name.encode()).hexdigest() if isinstance(self.default_node, PPathNode): - self.entries[name] = self.default_node( + self._entries[name] = self.default_node( name=name, path=self.path / f"{filename}.pkl" ) else: - self.entries[name] = self.default_node(name=name) # type: ignore[call-arg] + self._entries[name] = 
self.default_node(name=name) # type: ignore[call-arg] self.path.joinpath(f"{filename}-node.pkl").write_bytes( # type: ignore[union-attr] - pickle.dumps(self.entries[name]) + pickle.dumps(self._entries[name]) ) elif isinstance(node, (PNode, PProvisionalNode)): - self.entries[name] = node + self._entries[name] = node else: # Acquire the latest pluginmanager. session = Session(config=self._session_config, hook=storage.get().hook) @@ -123,4 +117,4 @@ def add(self, name: str, node: PNode | PProvisionalNode | Any = None) -> None: if collected_node is None: # pragma: no cover msg = f"{node!r} cannot be parsed." raise NodeNotCollectedError(msg) - self.entries[name] = collected_node + self._entries[name] = collected_node From 5004c35718081f5151651007f447e05a469116e4 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 4 May 2024 01:11:11 +0200 Subject: [PATCH 5/9] Fix errors in doctest. --- docs/source/reference_guides/api.md | 3 +++ src/_pytask/nodes.py | 1 + src/_pytask/outcomes.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/reference_guides/api.md b/docs/source/reference_guides/api.md index d06d90b5..56204c28 100644 --- a/docs/source/reference_guides/api.md +++ b/docs/source/reference_guides/api.md @@ -327,6 +327,9 @@ resolution and execution. An indicator to mark arguments of tasks as products. + >>> from pathlib import Path + >>> from pytask import Product + >>> from typing_extensions import Annotated >>> def task_example(path: Annotated[Path, Product]) -> None: ... path.write_text("Hello, World!") diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index 24b4d207..4c678d9d 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -226,6 +226,7 @@ class PythonNode(PNode): own hashing function. For example, from the :mod:`deepdiff` library. >>> from deepdiff import DeepHash + >>> from pytask import PythonNode >>> node = PythonNode(name="node", value={"a": 1}, hash=lambda x: DeepHash(x)[x]) .. 
warning:: Hashing big objects can require some time. diff --git a/src/_pytask/outcomes.py b/src/_pytask/outcomes.py index 524f0975..f39151b6 100644 --- a/src/_pytask/outcomes.py +++ b/src/_pytask/outcomes.py @@ -174,7 +174,7 @@ def count_outcomes( Examples -------- - >>> from _pytask.outcomes import CollectionOutcome, TaskOutcome + >>> from _pytask.outcomes import CollectionOutcome, TaskOutcome, count_outcomes >>> count_outcomes([], CollectionOutcome) {: 0, : 0} From d8894b8d64cfdf16361d0399616e1ce72b7649fd Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 4 May 2024 01:21:57 +0200 Subject: [PATCH 6/9] fix. --- src/_pytask/data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index c06913c9..c906d701 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -75,7 +75,7 @@ def __attrs_post_init__(self) -> None: self.path.mkdir(parents=True, exist_ok=True) # Initialize the data catalog with persisted nodes from previous runs. - for path in self.path.glob("*-node.pkl"): # type: ignore[union-attr] + for path in self.path.glob("*-node.pkl"): node = pickle.loads(path.read_bytes()) # noqa: S301 self._entries[node.name] = node From 4480d781a012d51330490190992b0b35c26cfd7e Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sun, 12 May 2024 15:21:00 +0200 Subject: [PATCH 7/9] Document restriction to data catalog names. 
--- docs/source/how_to_guides/the_data_catalog.md | 4 ++++ src/_pytask/data_catalog.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/source/how_to_guides/the_data_catalog.md b/docs/source/how_to_guides/the_data_catalog.md index f7f592ca..fa93f8e4 100644 --- a/docs/source/how_to_guides/the_data_catalog.md +++ b/docs/source/how_to_guides/the_data_catalog.md @@ -41,6 +41,10 @@ The default name for a catalog is `"default"` and so you will find its data in data_catalog = DataCatalog(name="data_management") ``` +```{note} +The name of a data catalog is restricted to letters, numbers, hyphens and underscores. +``` + You can also change the path where the data catalogs will be stored by changing the `path` attribute. Here, we store the data catalog's data next to the module where the data catalog is defined in `.data`. diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index c906d701..ebe2f4b7 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -48,8 +48,9 @@ class DataCatalog: :class:`~pytask.PickleNode` is used to serialize any Python object with the :mod:`pickle` module. name - The name of the data catalog. Use it when you are working with multiple data - catalogs that store data under the same keys. + The name of the data catalog which can only contain letters, numbers, hyphens + and underscores. Use it when you are working with multiple data catalogs to + store data in different locations. path A path where automatically created files are stored. By default, it will be ``.pytask/data_catalogs/default``. From 1b4df364919d28213666adeaddde150b7ea248eb Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sun, 12 May 2024 15:42:57 +0200 Subject: [PATCH 8/9] Add example for nested data catalogs. 
--- docs/source/how_to_guides/the_data_catalog.md | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/docs/source/how_to_guides/the_data_catalog.md b/docs/source/how_to_guides/the_data_catalog.md index fa93f8e4..74e3db7c 100644 --- a/docs/source/how_to_guides/the_data_catalog.md +++ b/docs/source/how_to_guides/the_data_catalog.md @@ -14,6 +14,7 @@ For example, use the {class}`~pytask.PythonNode` as the default. ```python from pytask import PythonNode +from pytask import DataCatalog data_catalog = DataCatalog(default_node=PythonNode) @@ -38,6 +39,9 @@ The default name for a catalog is `"default"` and so you will find its data in `"data_management"`, you will find the data in `.pytask/data_catalogs/data_management`. ```python +from pytask import DataCatalog + + data_catalog = DataCatalog(name="data_management") ``` @@ -51,6 +55,7 @@ data catalog is defined in `.data`. ```python from pathlib import Path +from pytask import DataCatalog data_catalog = DataCatalog(path=Path(__file__).parent / ".data") @@ -58,14 +63,15 @@ data_catalog = DataCatalog(path=Path(__file__).parent / ".data") ## Multiple data catalogs -You can use multiple data catalogs when you want to separate your datasets across -multiple catalogs or when you want to use the same names multiple times (although it is -not recommended!). +You can use multiple data catalogs when you want to separate your datasets or to avoid +name collisions of data catalog entries. Make sure you assign different names to the data catalogs so that their data is stored in different directories. ```python +from pytask import DataCatalog + # Stored in .pytask/data_catalog/a data_catalog_a = DataCatalog(name="a") @@ -74,3 +80,53 @@ data_catalog_b = DataCatalog(name="b") ``` Or, use different paths as explained above. 
+ +## Nested data catalogs + +Name collisions can also occur when you are using multiple levels of repetitions, for +example, when you are fitting multiple models to multiple data sets. + +You can structure your data catalogs like this. + +```python +from pytask import DataCatalog + + +MODEL_NAMES = ("ols", "logistic_regression") +DATA_NAMES = ("data_1", "data_2") + + +nested_data_catalogs = { + model_name: { + data_name: DataCatalog(name=f"{model_name}-{data_name}") + for data_name in DATA_NAMES + } + for model_name in MODEL_NAMES +} +``` + +The task could look like this. + +```python +from pathlib import Path +from pytask import task +from typing_extensions import Annotated + +from my_project.config import DATA_NAMES +from my_project.config import MODEL_NAMES +from my_project.config import nested_data_catalogs + + +for model_name in MODEL_NAMES: + for data_name in DATA_NAMES: + + @task + def fit_model( + path: Path = Path("...", data_name) + ) -> Annotated[ + Any, nested_data_catalogs[model_name][data_name]["fitted_model"] + ]: + data = ... + fitted_model = ... + return fitted_model +``` From 13173f49c41e66fd0d75b72b4690f08702a67b4c Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sun, 12 May 2024 15:59:39 +0200 Subject: [PATCH 9/9] Fix. --- src/_pytask/data_catalog.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index 730558bb..8a9a08cd 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -59,7 +59,6 @@ class DataCatalog: """ default_node: type[PNode] = PickleNode - entries: dict[str, PNode | PProvisionalNode] = field(factory=dict) name: str = field(default="default") path: Path | None = None _entries: dict[str, PNode | PProvisionalNode] = field(factory=dict)