From 82859e7a683a153257c0fcdd673a847829270a52 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Mon, 18 Sep 2023 17:42:44 +0200 Subject: [PATCH 01/28] Draft datastore. --- src/_pytask/datastore.py | 66 ++++++++++++++++++++++++++++++++++++++++ src/pytask/__init__.py | 2 ++ 2 files changed, 68 insertions(+) create mode 100644 src/_pytask/datastore.py diff --git a/src/_pytask/datastore.py b/src/_pytask/datastore.py new file mode 100644 index 00000000..8866298a --- /dev/null +++ b/src/_pytask/datastore.py @@ -0,0 +1,66 @@ +"""Contains the implementation of a datastore. + +The datastore is an abstraction layer between users and nodes. + +""" +from __future__ import annotations + +import hashlib +import pickle +from pathlib import Path +from typing import Any +from typing import TYPE_CHECKING + +from attrs import define +from attrs import Factory + +if TYPE_CHECKING: + from _pytask.node_protocols import PNode + + +__all__ = ["DataStore", "PickleNode"] + + +@define +class DataStore: + directory: Path = Path.cwd().joinpath(".pytask").resolve() + nodes: dict[str, PNode] = Factory(dict) + + def __getitem__(self, name: str) -> PNode: + if name not in self.nodes: + self.add(name) + return self.nodes[name] + + def add(self, name: str, node: PNode | None = None) -> None: + if not isinstance(name, str): + msg = "The name of a datastore entry must be a string." + raise TypeError(msg) + + if name in self.nodes: + msg = f"There is already an entry with the name {name!r} in the datastore." + raise ValueError(msg) + + if node is None: + filename = str(hashlib.sha256(name.encode()).hexdigest()) + self.nodes[name] = PickleNode( + name=name, path=self.directory / f"{filename}.pkl" + ) + else: + self.nodes[name] = node + + +@define +class PickleNode: + name: str + path: Path + + def state(self) -> str | None: + if self.path.exists(): + return str(self.path.stat().st_mtime) + return None + + def load(self) -> Any: + return pickle.loads(self.path.read_bytes()) # noqa: S301 + + def save(self, value: Any) -> None: + self.path.write_bytes(pickle.dumps(value)) diff --git a/src/pytask/__init__.py b/src/pytask/__init__.py index 2a88d423..887a86b8 100644 --- a/src/pytask/__init__.py +++ b/src/pytask/__init__.py @@ -17,6 +17,7 @@ from _pytask.database_utils import create_database from _pytask.database_utils import DatabaseSession from _pytask.database_utils import State +from _pytask.datastore import DataStore from _pytask.exceptions import CollectionError from _pytask.exceptions import ConfigurationError from _pytask.exceptions import ExecutionError @@ -84,6 +85,7 @@ "ConfigurationError", "DagReport", "DatabaseSession", + "DataStore", "EnumChoice", "ExecutionError", "ExecutionReport", From 4001f981ddd8a030b77eef23599658bf6e85c86f Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Mon, 18 Sep 2023 18:59:06 +0200 Subject: [PATCH 02/28] Finish draft of data catalog. 
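
The draft targets the following usage, mirroring the how-to guide added in this
patch (a minimal sketch; file names are illustrative):

```python
from pathlib import Path
from typing import Annotated

from _pytask.nodes import PathNode
from pytask import DataCatalog

_SRC = Path(__file__).parent.resolve()

data_catalog = DataCatalog()
data_catalog.add("file", PathNode.from_path(_SRC / "file.txt"))


# Accessing an unregistered entry such as "new_content" creates a PickleNode
# on the fly, so the task's return value is pickled automatically.
def task_add_content(
    path: Annotated[Path, data_catalog["file"]]
) -> Annotated[str, data_catalog["new_content"]]:
    return path.read_text() + "World!"
```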
--- docs/source/how_to_guides/index.md | 1 + .../how_to_guides/using_a_data_catalog.md | 42 +++++++++++++ docs/source/reference_guides/api.md | 6 +- src/_pytask/datastore.py | 62 +++++++++---------- src/_pytask/nodes.py | 37 ++++++++++- src/pytask/__init__.py | 6 +- 6 files changed, 116 insertions(+), 38 deletions(-) create mode 100644 docs/source/how_to_guides/using_a_data_catalog.md diff --git a/docs/source/how_to_guides/index.md b/docs/source/how_to_guides/index.md index 7f3c9aa5..eacee6e3 100644 --- a/docs/source/how_to_guides/index.md +++ b/docs/source/how_to_guides/index.md @@ -16,6 +16,7 @@ invoking_pytask_extended capture_warnings how_to_influence_build_order how_to_write_a_plugin +using_a_data_catalog ``` ## Best Practice Guides diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md new file mode 100644 index 00000000..f049eb07 --- /dev/null +++ b/docs/source/how_to_guides/using_a_data_catalog.md @@ -0,0 +1,42 @@ +# Using a data catalog + +A data catalog is an abstraction layer to ease the data access for the user. + +The following example shows: + +- How data access is facilitated by a flat namespace in the data catalog. +- How some files like `_DataCatalog["new_content"]` are automatically saved whenever the + user does not really care about where they are stored. + +```python +"""Contains an example of how to use the data catalog.""" +from pathlib import Path +from typing import Annotated +from _pytask.nodes import PathNode +from pytask import DataCatalog + + +_SRC = Path(__file__).parent.resolve() + +# Generate input data +# _SRC.joinpath("file.txt").write_text("Hello, ") + + +_DataCatalog = DataCatalog() +_DataCatalog.add("file", PathNode.from_path(_SRC / "file.txt")) +_DataCatalog.add("output", PathNode.from_path(_SRC / "output.txt")) + + +def task_add_content( + path: Annotated[Path, _DataCatalog["file"]] +) -> Annotated[str, _DataCatalog["new_content"]]: + text = path.read_text() + text += "World!" + return text + + +def task_save_text( + text: Annotated[str, _DataCatalog["new_content"]] +) -> Annotated[str, _DataCatalog["output"]]: + return text +``` diff --git a/docs/source/reference_guides/api.md b/docs/source/reference_guides/api.md index a39fe280..07edaf77 100644 --- a/docs/source/reference_guides/api.md +++ b/docs/source/reference_guides/api.md @@ -232,6 +232,7 @@ The remaining exceptions convey specific errors. ```{eval-rst} .. autoclass:: pytask.Session +.. autoclass:: pytask.DataCatalog ``` ## Nodes @@ -248,11 +249,12 @@ Then, different kinds of nodes can be implemented. ```{eval-rst} .. autoclass:: pytask.PathNode :members: -``` -```{eval-rst} .. autoclass:: pytask.PythonNode :members: + +.. autoclass:: pytask.PickleNode + :members: ``` To parse dependencies and products from nodes, use the following functions. diff --git a/src/_pytask/datastore.py b/src/_pytask/datastore.py index 8866298a..181a2597 100644 --- a/src/_pytask/datastore.py +++ b/src/_pytask/datastore.py @@ -1,16 +1,15 @@ -"""Contains the implementation of a datastore. +"""Contains the implementation of a data catalog. -The datastore is an abstraction layer between users and nodes. +The data catalog is an abstraction layer between users and nodes. 
""" from __future__ import annotations import hashlib -import pickle from pathlib import Path -from typing import Any from typing import TYPE_CHECKING +from _pytask.nodes import PickleNode from attrs import define from attrs import Factory @@ -18,49 +17,46 @@ from _pytask.node_protocols import PNode -__all__ = ["DataStore", "PickleNode"] +__all__ = ["DataCatalog"] @define -class DataStore: +class DataCatalog: + """A data catalog. + + Parameters + ---------- + directory + A directory where automatically created files are stored. + entries + A collection of entries in the catalog. Entries can be :class:`pytask.PNode` or + a :class:`DataCatalog` itself for nesting catalogs. + + """ + directory: Path = Path.cwd().joinpath(".pytask").resolve() - nodes: dict[str, PNode] = Factory(dict) + entries: dict[str, DataCatalog | PNode] = Factory(dict) - def __getitem__(self, name: str) -> PNode: - if name not in self.nodes: + def __getitem__(self, name: str) -> DataCatalog | PNode: + if name not in self.entries: self.add(name) - return self.nodes[name] + return self.entries[name] - def add(self, name: str, node: PNode | None = None) -> None: + def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: if not isinstance(name, str): - msg = "The name of a datastore entry must be a string." + msg = "The name of a catalog entry must be a string." raise TypeError(msg) - if name in self.nodes: - msg = f"There is already an entry with the name {name!r} in the datastore." + if name in self.entries: + msg = ( + f"There is already an entry with the name {name!r} in the data catalog." + ) raise ValueError(msg) if node is None: filename = str(hashlib.sha256(name.encode()).hexdigest()) - self.nodes[name] = PickleNode( + self.entries[name] = PickleNode( name=name, path=self.directory / f"{filename}.pkl" ) else: - self.nodes[name] = node - - -@define -class PickleNode: - name: str - path: Path - - def state(self) -> str | None: - if self.path.exists(): - return str(self.path.stat().st_mtime) - return None - - def load(self) -> Any: - return pickle.loads(self.path.read_bytes()) # noqa: S301 - - def save(self, value: Any) -> None: - self.path.write_bytes(pickle.dumps(value)) + self.entries[name] = node diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index a82f42a1..9601d3aa 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -4,7 +4,7 @@ import functools import hashlib import inspect -from pathlib import Path # noqa: TCH003 +import pickle from typing import Any from typing import Callable from typing import TYPE_CHECKING @@ -18,6 +18,7 @@ if TYPE_CHECKING: + from pathlib import Path from _pytask.tree_util import PyTree from _pytask.mark import Mark @@ -199,3 +200,37 @@ def state(self) -> str | None: return str(hashlib.sha256(self.value.encode()).hexdigest()) return str(hash(self.value)) return "0" + + +@define +class PickleNode: + """A node for pickle files.""" + + name: str + """Name of the node which makes it identifiable in the DAG.""" + path: Path + """The path to the file.""" + + @classmethod + @functools.lru_cache + def from_path(cls, path: Path) -> PickleNode: + """Instantiate class from path to file. + + The `lru_cache` decorator ensures that the same object is not collected twice. + + """ + if not path.is_absolute(): + msg = "Node must be instantiated from absolute path." 
+ raise ValueError(msg) + return cls(name=path.as_posix(), path=path) + + def state(self) -> str | None: + if self.path.exists(): + return str(self.path.stat().st_mtime) + return None + + def load(self) -> Any: + return pickle.loads(self.path.read_bytes()) # noqa: S301 + + def save(self, value: Any) -> None: + self.path.write_bytes(pickle.dumps(value)) diff --git a/src/pytask/__init__.py b/src/pytask/__init__.py index 887a86b8..0e23b9cf 100644 --- a/src/pytask/__init__.py +++ b/src/pytask/__init__.py @@ -17,7 +17,7 @@ from _pytask.database_utils import create_database from _pytask.database_utils import DatabaseSession from _pytask.database_utils import State -from _pytask.datastore import DataStore +from _pytask.datastore import DataCatalog from _pytask.exceptions import CollectionError from _pytask.exceptions import ConfigurationError from _pytask.exceptions import ExecutionError @@ -43,6 +43,7 @@ from _pytask.node_protocols import PTask from _pytask.node_protocols import PTaskWithPath from _pytask.nodes import PathNode +from _pytask.nodes import PickleNode from _pytask.nodes import PythonNode from _pytask.nodes import Task from _pytask.outcomes import CollectionOutcome @@ -85,7 +86,7 @@ "ConfigurationError", "DagReport", "DatabaseSession", - "DataStore", + "DataCatalog", "EnumChoice", "ExecutionError", "ExecutionReport", @@ -101,6 +102,7 @@ "NodeNotFoundError", "PathNode", "Persisted", + "PickleNode", "PNode", "PPathNode", "Product", From 93e6da0f3ef780d0b8a49d912dff3fc0a41a76b2 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Mon, 18 Sep 2023 21:04:33 +0200 Subject: [PATCH 03/28] Fix tests. --- src/_pytask/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index 9601d3aa..38057ef9 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -5,6 +5,7 @@ import hashlib import inspect import pickle +from pathlib import Path # noqa: TCH003 from typing import Any from typing import Callable from typing import TYPE_CHECKING @@ -18,7 +19,6 @@ if TYPE_CHECKING: - from pathlib import Path from _pytask.tree_util import PyTree from _pytask.mark import Mark From 9524c3036aee86e48ebd828f7b5eb8ef5b99335e Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Tue, 19 Sep 2023 07:56:28 +0200 Subject: [PATCH 04/28] more data catalog. 
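
Among other changes, ``PickleNode`` now accepts custom ``load_func`` and
``dump_func`` serializers. A sketch, with cloudpickle standing in for any
pickle-compatible library (cloudpickle itself is just an illustrative choice):

```python
from pathlib import Path

import cloudpickle  # assumption: any library with loads/dumps for bytes works

from pytask import PickleNode

node = PickleNode(
    name="model",
    path=Path(__file__).parent.resolve() / "model.pkl",
    load_func=cloudpickle.loads,  # bytes -> object
    dump_func=cloudpickle.dumps,  # object -> bytes
)
```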
---
 docs/source/conf.py                           |  1 +
 .../how_to_guides/using_a_data_catalog.md     | 65 +++++++++++++------
 docs/source/reference_guides/api.md           |  1 +
 src/_pytask/{datastore.py => data_catalog.py} |  4 +-
 src/_pytask/nodes.py                          | 65 +++++++++++++++----
 src/pytask/__init__.py                        |  2 +-
 task_data_catalog.py                          | 31 +++++++++
 7 files changed, 133 insertions(+), 36 deletions(-)
 rename src/_pytask/{datastore.py => data_catalog.py} (89%)
 create mode 100644 task_data_catalog.py

diff --git a/docs/source/conf.py b/docs/source/conf.py
index ed04c41a..e0411ada 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -74,6 +74,7 @@ intersphinx_mapping = {
     "python": ("https://docs.python.org/3.9", None),
     "click": ("https://click.palletsprojects.com/en/8.0.x/", None),
+    "deepdiff": ("https://zepworks.com/deepdiff/current/", None),
     "pluggy": ("https://pluggy.readthedocs.io/en/latest", None),
     "networkx": ("https://networkx.org/documentation/stable", None),
     "pygraphviz": ("https://pygraphviz.github.io/documentation/stable/", None),
diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md
index f049eb07..46225c84 100644
--- a/docs/source/how_to_guides/using_a_data_catalog.md
+++ b/docs/source/how_to_guides/using_a_data_catalog.md
@@ -1,42 +1,65 @@
 # Using a data catalog

-A data catalog is an abstraction layer to ease the data access for the user.
+A data catalog is an inventory for data in your project. It has two main advantages:

-The following example shows:
+- A data catalog provides an interface to easily access the data.
+- A data catalog can take care of saving a task product.

-- How data access is facilitated by a flat namespace in the data catalog.
-- How some files like `_DataCatalog["new_content"]` are automatically saved whenever the
-  user does not really care about where they are stored.
+## Using the data catalog
+
+As an example, we build a workflow comprising two tasks that perform the following
+actions.
+
+1. Read data from a text file, `input.txt`, and store it as a pickle file.
+1. Read the data from the pickle file, add more text, and store it as a text file under
+   `output.txt`.
+
+First, we build the data catalog by registering the data that we provide or that we
+later want to access.

 ```python
 from pathlib import Path
+
 from pytask import DataCatalog
+from pytask import PathNode
+
+
+# Get the path of the parent directory of the file.
+ROOT = Path(__file__).parent.resolve()

-_SRC = Path(__file__).parent.resolve()

+# We store the data in .pytask/
+OurDataCatalog = DataCatalog(directory=ROOT / ".pytask")

-# Generate input data
-# _SRC.joinpath("file.txt").write_text("Hello, ")

+# Register the input and the output data.
+OurDataCatalog.add("input", ROOT / "input.txt")
+OurDataCatalog.add("output", ROOT / "output.txt")
+```

-_DataCatalog = DataCatalog()
-_DataCatalog.add("file", PathNode.from_path(_SRC / "file.txt"))
-_DataCatalog.add("output", PathNode.from_path(_SRC / "output.txt"))
+We also have to create `input.txt` and add some content like `Hello, `.
+
+We do not register the intermediate pickle file yet.
+
+Next, let us define the two tasks.
-def task_add_content( - path: Annotated[Path, _DataCatalog["file"]] -) -> Annotated[str, _DataCatalog["new_content"]]: +```python +def task_save_text_with_pickle( + path: Annotated[Path, OurDataCatalog["input"]] +) -> Annotated[str, OurDataCatalog["pickle_file"]]: text = path.read_text() - text += "World!" return text -def task_save_text( - text: Annotated[str, _DataCatalog["new_content"]] -) -> Annotated[str, _DataCatalog["output"]]: +def task_add_content_and_save_text( + text: Annotated[str, OurDataCatalog["pickle_file"]] +) -> Annotated[str, OurDataCatalog["output"]]: + text += "World!" return text ``` + +The important bit here is that we reference the intermediate pickle file with +`OurDataCatalog["pickle_file"]`. Since the entry does not exist, the catalog creates a +{class}`~pytask.PickleNode` for this entry and saves the pickle file in the directory +given to the {class}`~pytask.DataCatalog`. + +## Changing the default node diff --git a/docs/source/reference_guides/api.md b/docs/source/reference_guides/api.md index 07edaf77..e6c596b3 100644 --- a/docs/source/reference_guides/api.md +++ b/docs/source/reference_guides/api.md @@ -233,6 +233,7 @@ The remaining exceptions convey specific errors. ```{eval-rst} .. autoclass:: pytask.Session .. autoclass:: pytask.DataCatalog + :members: ``` ## Nodes diff --git a/src/_pytask/datastore.py b/src/_pytask/data_catalog.py similarity index 89% rename from src/_pytask/datastore.py rename to src/_pytask/data_catalog.py index 181a2597..d8ec96ce 100644 --- a/src/_pytask/datastore.py +++ b/src/_pytask/data_catalog.py @@ -34,15 +34,17 @@ class DataCatalog: """ - directory: Path = Path.cwd().joinpath(".pytask").resolve() + directory: Path = Factory(lambda *x: Path.cwd().joinpath(".pytask").resolve()) entries: dict[str, DataCatalog | PNode] = Factory(dict) def __getitem__(self, name: str) -> DataCatalog | PNode: + """Allow to access entries with the squared brackets syntax.""" if name not in self.entries: self.add(name) return self.entries[name] def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: + """Add an entry to the data catalog.""" if not isinstance(name, str): msg = "The name of a catalog entry must be a string." raise TypeError(msg) diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index 38057ef9..098724fc 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -114,12 +114,17 @@ def execute(self, **kwargs: Any) -> None: @define(kw_only=True) class PathNode(PPathNode): - """The class for a node which is a path.""" + """The class for a node which is a path. + + name + Name of the node which makes it identifiable in the DAG. + path + The path to the file. + + """ name: str - """Name of the node which makes it identifiable in the DAG.""" path: Path - """The path to the file.""" @classmethod @functools.lru_cache @@ -161,14 +166,35 @@ def save(self, value: bytes | str) -> None: @define(kw_only=True) class PythonNode(PNode): - """The class for a node which is a Python object.""" + """The class for a node which is a Python object. + + Parameters + ---------- + name + The name of the node. + value + The value of the node. + hash + Whether the value should be hashed to determine the state. Use ``True`` for + objects that are hashable like strings and tuples. For dictionaries and other + non-hashable objects, you need to provide a function that can hash these + objects. + + Examples + -------- + To allow a :class:`~pytask.PythonNode` to hash a dictionary, you need to pass your + own hashing function. 
For example, from the :mod:`deepdiff` library. + + >>> from deepdiff import DeepHash + >>> node = PythonNode(name="node", value={"a": 1}, hash=lambda x: DeepHash(x)[x]) + + .. warning:: Hashing big objects can require some time. + + """ name: str = "" - """Name of the node.""" value: Any = None - """Value of the node.""" - hash: bool | Callable[[Any], bool] = False # noqa: A003 - """Whether the value should be hashed to determine the state.""" + hash: bool | Callable[[Any], str] = False # noqa: A003 def load(self) -> Any: """Load the value.""" @@ -204,12 +230,25 @@ def state(self) -> str | None: @define class PickleNode: - """A node for pickle files.""" + """A node for pickle files. + + Parameters + ---------- + name + Name of the node which makes it identifiable in the DAG. + path + The path to the file. + load_func + A function to convert :obj:`bytes` from a pickle file to a Python object. + dump_func + A function to convert a Python object to :obj:`bytes`. + + """ name: str - """Name of the node which makes it identifiable in the DAG.""" path: Path - """The path to the file.""" + load_func: Callable[[bytes], Any] = pickle.loads + dump_func: Callable[[Any], bytes] = pickle.dumps @classmethod @functools.lru_cache @@ -230,7 +269,7 @@ def state(self) -> str | None: return None def load(self) -> Any: - return pickle.loads(self.path.read_bytes()) # noqa: S301 + return self.load_func(self.path.read_bytes()) def save(self, value: Any) -> None: - self.path.write_bytes(pickle.dumps(value)) + self.path.write_bytes(self.dump_func(value)) diff --git a/src/pytask/__init__.py b/src/pytask/__init__.py index 0e23b9cf..41fbb9f2 100644 --- a/src/pytask/__init__.py +++ b/src/pytask/__init__.py @@ -13,11 +13,11 @@ from _pytask.compat import import_optional_dependency from _pytask.config import hookimpl from _pytask.console import console +from _pytask.data_catalog import DataCatalog from _pytask.database_utils import BaseTable from _pytask.database_utils import create_database from _pytask.database_utils import DatabaseSession from _pytask.database_utils import State -from _pytask.datastore import DataCatalog from _pytask.exceptions import CollectionError from _pytask.exceptions import ConfigurationError from _pytask.exceptions import ExecutionError diff --git a/task_data_catalog.py b/task_data_catalog.py new file mode 100644 index 00000000..14b6071f --- /dev/null +++ b/task_data_catalog.py @@ -0,0 +1,31 @@ +"""Contains an example of how to use the data catalog.""" +from __future__ import annotations + +from pathlib import Path +from typing import Annotated + +from pytask import DataCatalog + + +_SRC = Path(__file__).parent.resolve() + +# Generate input data + + +_DataCatalog = DataCatalog() +_DataCatalog.add("file", _SRC / "file.txt") +_DataCatalog.add("output", _SRC / "output.txt") + + +def task_add_content( + path: Annotated[Path, _DataCatalog["file"]] +) -> Annotated[str, _DataCatalog["new_content"]]: + text = path.read_text() + text += "World!" + return text + + +def task_save_text( + text: Annotated[str, _DataCatalog["new_content"]] +) -> Annotated[str, _DataCatalog["output"]]: + return text From 891f4a6d0b056abd087d3a877aa979eabf7d4d09 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 14 Oct 2023 16:38:06 +0200 Subject: [PATCH 05/28] Allow to use the datastore independent of pytask. 
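
Because the catalog now discovers the project root itself and persists its node
definitions to disk, it can be used without a running pytask session. Roughly,
and with an illustrative import path:

```python
# In a REPL or notebook after `pytask` has run:
from myproject.config import data_catalog  # wherever the catalog is defined

data_catalog["new_content"].load()  # restores the pickled value
```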
--- .../how_to_guides/hashing_inputs_of_tasks.md | 4 +- environment.yml | 1 + src/_pytask/build.py | 4 +- src/_pytask/config_utils.py | 7 ++- src/_pytask/data_catalog.py | 47 ++++++++++++++++--- src/_pytask/graph.py | 4 +- tests/test_config_utils.py | 8 ++-- 7 files changed, 57 insertions(+), 18 deletions(-) diff --git a/docs/source/how_to_guides/hashing_inputs_of_tasks.md b/docs/source/how_to_guides/hashing_inputs_of_tasks.md index 7f7d0798..e415a166 100644 --- a/docs/source/how_to_guides/hashing_inputs_of_tasks.md +++ b/docs/source/how_to_guides/hashing_inputs_of_tasks.md @@ -62,10 +62,10 @@ from interpreter session to interpreter session for security reasons (see ``` {class}`list` and {class}`dict` are not hashable by default. Luckily, there are -libraries who provide this functionality like `deepdiff`. We can use them to pass a +libraries who provide this functionality like {mod}`deepdiff`. We can use them to pass a function to the {class}`~pytask.PythonNode` that generates a stable hash. -First, install `deepdiff`. +First, install {mod}`deepdiff`. ```console $ pip install deepdiff diff --git a/environment.yml b/environment.yml index c65e57e8..7cb2c236 100644 --- a/environment.yml +++ b/environment.yml @@ -24,6 +24,7 @@ dependencies: # Misc - black + - deepdiff - ipywidgets - jupyterlab - matplotlib diff --git a/src/_pytask/build.py b/src/_pytask/build.py index 11c903f3..1ab47c87 100644 --- a/src/_pytask/build.py +++ b/src/_pytask/build.py @@ -13,7 +13,7 @@ from _pytask.capture import CaptureMethod from _pytask.click import ColoredCommand from _pytask.config import hookimpl -from _pytask.config_utils import _find_project_root_and_config +from _pytask.config_utils import find_project_root_and_config from _pytask.config_utils import read_config from _pytask.console import console from _pytask.exceptions import CollectionError @@ -213,7 +213,7 @@ def build( # noqa: C901, PLR0912, PLR0913, PLR0915 ( raw_config["root"], raw_config["config"], - ) = _find_project_root_and_config(raw_config["paths"]) + ) = find_project_root_and_config(raw_config["paths"]) if raw_config["config"] is not None: config_from_file = read_config(raw_config["config"]) diff --git a/src/_pytask/config_utils.py b/src/_pytask/config_utils.py index 8f3b040c..09da7755 100644 --- a/src/_pytask/config_utils.py +++ b/src/_pytask/config_utils.py @@ -10,6 +10,9 @@ from _pytask.shared import parse_paths +__all__ = ["find_project_root_and_config", "read_config", "set_defaults_from_config"] + + def set_defaults_from_config( context: click.Context, param: click.Parameter, value: Any # noqa: ARG001 ) -> Path | None: @@ -39,7 +42,7 @@ def set_defaults_from_config( ( context.params["root"], context.params["config"], - ) = _find_project_root_and_config(context.params["paths"]) + ) = find_project_root_and_config(context.params["paths"]) if context.params["config"] is None: return None @@ -54,7 +57,7 @@ def set_defaults_from_config( return context.params["config"] -def _find_project_root_and_config(paths: list[Path] | None) -> tuple[Path, Path | None]: +def find_project_root_and_config(paths: list[Path] | None) -> tuple[Path, Path | None]: """Find the project root and configuration file from a list of paths. 
The process is as follows: diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index d8ec96ce..00cb6dec 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -6,36 +6,65 @@ from __future__ import annotations import hashlib -from pathlib import Path +import pickle from typing import TYPE_CHECKING +from _pytask.config_utils import find_project_root_and_config +from _pytask.node_protocols import PNode +from _pytask.node_protocols import PPathNode from _pytask.nodes import PickleNode from attrs import define from attrs import Factory if TYPE_CHECKING: - from _pytask.node_protocols import PNode + from pathlib import Path __all__ = ["DataCatalog"] -@define +def _find_directory() -> Path: + root_path, _ = find_project_root_and_config(None) + return root_path.joinpath(".pytask", "data_catalogs") + + +@define(kw_only=True) class DataCatalog: """A data catalog. Parameters ---------- + default_node + A default node for loading and saving values. directory A directory where automatically created files are stored. entries A collection of entries in the catalog. Entries can be :class:`pytask.PNode` or a :class:`DataCatalog` itself for nesting catalogs. + name + The name of the data catalog. Use it when you are working with multiple data + catalogs that store data under the same keys. """ - directory: Path = Factory(lambda *x: Path.cwd().joinpath(".pytask").resolve()) + default_node: type[PNode] = PickleNode + directory: Path | None = None entries: dict[str, DataCatalog | PNode] = Factory(dict) + name: str = "default" + + def __attrs_post_init__(self) -> None: + if not self.directory: + root = _find_directory() + self.directory = root / self.name + self.directory.mkdir(parents=True, exist_ok=True) + + self._initialize() + + def _initialize(self) -> None: + """Initialize the data catalog with persisted nodes from previous runs.""" + for path in self.directory.glob("*-node.pkl"): # type: ignore[union-attr] + node = pickle.loads(path.read_bytes()) # noqa: S301 + self.entries[node.name] = node def __getitem__(self, name: str) -> DataCatalog | PNode: """Allow to access entries with the squared brackets syntax.""" @@ -57,8 +86,14 @@ def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: if node is None: filename = str(hashlib.sha256(name.encode()).hexdigest()) - self.entries[name] = PickleNode( - name=name, path=self.directory / f"{filename}.pkl" + if isinstance(self.default_node, PPathNode): + self.entries[name] = self.default_node( + name=name, path=self.directory / f"{filename}.pkl" + ) + else: + self.entries[name] = self.default_node(name=name) # type: ignore[call-arg] + self.directory.joinpath(f"{filename}-node.pkl").write_bytes( # type: ignore[union-attr] + pickle.dumps(self.entries[name]) ) else: self.entries[name] = node diff --git a/src/_pytask/graph.py b/src/_pytask/graph.py index caeeab2d..3853ec27 100644 --- a/src/_pytask/graph.py +++ b/src/_pytask/graph.py @@ -14,7 +14,7 @@ from _pytask.compat import check_for_optional_program from _pytask.compat import import_optional_dependency from _pytask.config import hookimpl -from _pytask.config_utils import _find_project_root_and_config +from _pytask.config_utils import find_project_root_and_config from _pytask.config_utils import read_config from _pytask.console import console from _pytask.exceptions import CollectionError @@ -179,7 +179,7 @@ def build_dag(raw_config: dict[str, Any]) -> nx.DiGraph: ( raw_config["root"], raw_config["config"], - ) = 
_find_project_root_and_config(raw_config["paths"]) + ) = find_project_root_and_config(raw_config["paths"]) if raw_config["config"] is not None: config_from_file = read_config(raw_config["config"]) diff --git a/tests/test_config_utils.py b/tests/test_config_utils.py index becfd13a..08de22b9 100644 --- a/tests/test_config_utils.py +++ b/tests/test_config_utils.py @@ -3,7 +3,7 @@ from pathlib import Path import pytest -from _pytask.config_utils import _find_project_root_and_config +from _pytask.config_utils import find_project_root_and_config @pytest.mark.unit() @@ -34,7 +34,7 @@ def test_find_project_root_and_config( path.parent.mkdir(exist_ok=True, parents=True) path.touch() - root, config = _find_project_root_and_config(paths) + root, config = find_project_root_and_config(paths) assert root == tmp_path.joinpath(expected_root) if expected_config is None: @@ -51,7 +51,7 @@ def test_find_project_root_and_config( def test_find_project_root_and_config_w_no_intersecting_paths( paths, expected_root, expected_config ): - root, config = _find_project_root_and_config(paths) + root, config = find_project_root_and_config(paths) assert root == expected_root assert config == expected_config @@ -68,7 +68,7 @@ def test_root_stops_at_version_control_folder(tmp_path, vc_folder, path, expecte if vc_folder: tmp_path.joinpath(vc_folder).mkdir(parents=True) - root, ini = _find_project_root_and_config([tmp_path.joinpath(path)]) + root, ini = find_project_root_and_config([tmp_path.joinpath(path)]) assert ini is None assert root == tmp_path.joinpath(expected) From 49e684f4506da027e464df9fef743286ba8937c6 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 14 Oct 2023 16:40:12 +0200 Subject: [PATCH 06/28] Add deepdiff for tests. --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 302b570a..9376372a 100644 --- a/tox.ini +++ b/tox.ini @@ -28,6 +28,7 @@ deps = tomli >=1.0.0 # Optional and test dependencies + deepdiff pexpect linux, macos: pygraphviz From f5948d96c5e54017fe0985c82f4ab6fb387ea7c3 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sun, 15 Oct 2023 23:44:33 +0200 Subject: [PATCH 07/28] Add collection to data store. --- src/_pytask/data_catalog.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index 00cb6dec..6d4cff6b 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -10,9 +10,13 @@ from typing import TYPE_CHECKING from _pytask.config_utils import find_project_root_and_config +from _pytask.exceptions import NodeNotCollectedError +from _pytask.models import NodeInfo from _pytask.node_protocols import PNode from _pytask.node_protocols import PPathNode from _pytask.nodes import PickleNode +from _pytask.pluginmanager import get_plugin_manager +from _pytask.session import Session from attrs import define from attrs import Factory @@ -28,6 +32,12 @@ def _find_directory() -> Path: return root_path.joinpath(".pytask", "data_catalogs") +def _create_default_session() -> Session: + return Session( + config={"check_casing_of_paths": True}, hook=get_plugin_manager().hook + ) + + @define(kw_only=True) class DataCatalog: """A data catalog. 
@@ -51,6 +61,7 @@ class DataCatalog: directory: Path | None = None entries: dict[str, DataCatalog | PNode] = Factory(dict) name: str = "default" + _session: Session = Factory(_create_default_session) def __attrs_post_init__(self) -> None: if not self.directory: @@ -78,12 +89,6 @@ def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: msg = "The name of a catalog entry must be a string." raise TypeError(msg) - if name in self.entries: - msg = ( - f"There is already an entry with the name {name!r} in the data catalog." - ) - raise ValueError(msg) - if node is None: filename = str(hashlib.sha256(name.encode()).hexdigest()) if isinstance(self.default_node, PPathNode): @@ -95,5 +100,17 @@ def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: self.directory.joinpath(f"{filename}-node.pkl").write_bytes( # type: ignore[union-attr] pickle.dumps(self.entries[name]) ) - else: + elif isinstance(node, PNode): self.entries[name] = node + else: + collected_node = self._session.hook.pytask_collect_node( + session=self._session, + path=None, + node_info=NodeInfo( + arg_name=name, path=(), value=node, task_path=None, task_name="" + ), + ) + if collected_node is None: + msg = f"{node!r} cannot be parsed." + raise NodeNotCollectedError(msg) + self.entries[name] = collected_node From 8457c868636137e0eccf085db5f9aeee0421f881 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Tue, 17 Oct 2023 19:42:21 +0200 Subject: [PATCH 08/28] Add some tests. --- src/_pytask/data_catalog.py | 32 +++++++++++------ tests/test_data_catalog.py | 69 +++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 10 deletions(-) create mode 100644 tests/test_data_catalog.py diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index 6d4cff6b..62fcc9e2 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -6,8 +6,9 @@ from __future__ import annotations import hashlib +import inspect import pickle -from typing import TYPE_CHECKING +from pathlib import Path from _pytask.config_utils import find_project_root_and_config from _pytask.exceptions import NodeNotCollectedError @@ -20,19 +21,27 @@ from attrs import define from attrs import Factory -if TYPE_CHECKING: - from pathlib import Path - __all__ = ["DataCatalog"] -def _find_directory() -> Path: - root_path, _ = find_project_root_and_config(None) +def _find_directory(path: Path) -> Path: + """Find directory where data catalog can store its data.""" + root_path, _ = find_project_root_and_config([path]) return root_path.joinpath(".pytask", "data_catalogs") +def _get_parent_path_of_data_catalog_module(stacklevel: int = 2) -> Path: + """Get the parent path of the module where the data catalog is defined.""" + stack = inspect.stack() + potential_path = stack[stacklevel].frame.f_globals.get("__file__") + if potential_path: + return Path(potential_path).parent + return Path.cwd() + + def _create_default_session() -> Session: + """Create a default session to use the hooks and collect nodes.""" return Session( config={"check_casing_of_paths": True}, hook=get_plugin_manager().hook ) @@ -49,7 +58,7 @@ class DataCatalog: directory A directory where automatically created files are stored. entries - A collection of entries in the catalog. Entries can be :class:`pytask.PNode` or + A collection of entries in the catalog. Entries can be :class:`~pytask.PNode` or a :class:`DataCatalog` itself for nesting catalogs. name The name of the data catalog. 
Use it when you are working with multiple data @@ -62,10 +71,11 @@ class DataCatalog: entries: dict[str, DataCatalog | PNode] = Factory(dict) name: str = "default" _session: Session = Factory(_create_default_session) + _instance_path: Path = Factory(_get_parent_path_of_data_catalog_module) def __attrs_post_init__(self) -> None: if not self.directory: - root = _find_directory() + root = _find_directory(self._instance_path) self.directory = root / self.name self.directory.mkdir(parents=True, exist_ok=True) @@ -85,6 +95,8 @@ def __getitem__(self, name: str) -> DataCatalog | PNode: def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: """Add an entry to the data catalog.""" + assert isinstance(self.directory, Path) + if not isinstance(name, str): msg = "The name of a catalog entry must be a string." raise TypeError(msg) @@ -97,7 +109,7 @@ def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: ) else: self.entries[name] = self.default_node(name=name) # type: ignore[call-arg] - self.directory.joinpath(f"{filename}-node.pkl").write_bytes( # type: ignore[union-attr] + self.directory.joinpath(f"{filename}-node.pkl").write_bytes( pickle.dumps(self.entries[name]) ) elif isinstance(node, PNode): @@ -105,7 +117,7 @@ def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: else: collected_node = self._session.hook.pytask_collect_node( session=self._session, - path=None, + path=self._instance_path, node_info=NodeInfo( arg_name=name, path=(), value=node, task_path=None, task_name="" ), diff --git a/tests/test_data_catalog.py b/tests/test_data_catalog.py new file mode 100644 index 00000000..41ef6408 --- /dev/null +++ b/tests/test_data_catalog.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import textwrap +from pathlib import Path + +from pytask import cli +from pytask import DataCatalog +from pytask import ExitCode +from pytask import PathNode +from pytask import PickleNode +from pytask import PythonNode + + +def test_data_catalog_knows_path_where_it_is_defined(): + data_catalog = DataCatalog() + assert Path(__file__).parent == data_catalog._instance_path + + +def test_data_catalog_collects_nodes(): + data_catalog = DataCatalog() + + default_node = data_catalog["default_node"] + assert isinstance(default_node, PickleNode) + + data_catalog.add("node", Path("file.txt")) + assert isinstance(data_catalog["node"], PathNode) + + +def test_change_default_node(): + data_catalog = DataCatalog(default_node=PythonNode) + default_node = data_catalog["new_default_node"] + assert isinstance(default_node, PythonNode) + + +def test_use_data_catalog_in_workflow(runner, tmp_path): + source = """ + from pathlib import Path + from typing_extensions import Annotated + + from pytask import DataCatalog + + + _SRC = Path(__file__).parent.resolve() + + # Generate input data + _DataCatalog = DataCatalog() + _DataCatalog.add("file", _SRC / "file.txt") + _DataCatalog.add("output", _SRC / "output.txt") + + + def task_add_content( + path: Annotated[Path, _DataCatalog["file"]] + ) -> Annotated[str, _DataCatalog["new_content"]]: + text = path.read_text() + text += "World!" 
+ return text + + + def task_save_text( + text: Annotated[str, _DataCatalog["new_content"]] + ) -> Annotated[str, _DataCatalog["output"]]: + return text + """ + tmp_path.joinpath("task_example.py").write_text(textwrap.dedent(source)) + tmp_path.joinpath("file.txt").write_text("Hello, ") + + result = runner.invoke(cli, [tmp_path.as_posix()]) + assert result.exit_code == ExitCode.OK + assert tmp_path.joinpath("output.txt").read_text() == "Hello, World!" From 9a14830f64faef6b5805ab28b3358e846aef9623 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Tue, 17 Oct 2023 22:06:04 +0200 Subject: [PATCH 09/28] better docs. --- .../source/_static/md/using-a-data-catalog.md | 26 ++++++ .../how_to_guides/using_a_data_catalog.md | 79 ++++++++++++++----- docs/source/reference_guides/api.md | 1 + tests/test_data_catalog.py | 6 +- 4 files changed, 90 insertions(+), 22 deletions(-) create mode 100644 docs/source/_static/md/using-a-data-catalog.md diff --git a/docs/source/_static/md/using-a-data-catalog.md b/docs/source/_static/md/using-a-data-catalog.md new file mode 100644 index 00000000..e9b1df83 --- /dev/null +++ b/docs/source/_static/md/using-a-data-catalog.md @@ -0,0 +1,26 @@ +

```console

$ pytask
──────────────────────────── Start pytask session ────────────────────────────
Platform: win32 -- Python 3.10.0, pytask 0.3.0, pluggy 1.0.0
Root: C:\Users\pytask-dev\git\my_project
Collected 2 tasks.

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
┃ Task                                   ┃ Outcome ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩
│ task_data_catalog.py::task_add_content │    .    │
│ task_data_catalog.py::task_save_text   │    .    │
└────────────────────────────────────────┴─────────┘

──────────────────────────────────────────────────────────────────────────────
╭─────────── Summary ────────────╮
 2  Collected tasks
 2  Succeeded (100.0%)
╰────────────────────────────────╯
───────────────────────── Succeeded in 0.06 seconds ──────────────────────────
```

diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md
index 46225c84..d19679b2 100644
--- a/docs/source/how_to_guides/using_a_data_catalog.md
+++ b/docs/source/how_to_guides/using_a_data_catalog.md
@@ -1,11 +1,15 @@
 # Using a data catalog

-A data catalog is an inventory for data in your project. It has two main advantages:
+A data catalog is an inventory for your project's data. You can add your data to the
+catalog and then easily reference it in your task functions.

-- A data catalog provides an interface to easily access the data.
-- A data catalog can take care of saving a task product.
+The catalog can also handle data produced by your tasks automatically so that you do not
+have to define any paths.

-## Using the data catalog
+Lastly, after data has been added to the catalog, you can import the catalog in a script
+or Jupyter notebook and load the data for exploration.
+
+## The `DataCatalog`

 As an example, we build a workflow comprising two tasks that perform the following
 actions.
@@ -24,16 +28,13 @@
 from pytask import DataCatalog
 from pytask import PathNode

-# Get the path of the parent directory of the file.
-ROOT = Path(__file__).parent.resolve()
-
-
-# We store the data in .pytask/
-OurDataCatalog = DataCatalog(directory=ROOT / ".pytask")
+# Create the data catalog.
+data_catalog = DataCatalog()

-# Register the input and the output data.
-OurDataCatalog.add("input", ROOT / "input.txt")
-OurDataCatalog.add("output", ROOT / "output.txt")
+# Register the input and the output data. Paths are assumed to be
+# relative to the module where the data catalog is instantiated.
+data_catalog.add("input", Path("input.txt"))
+data_catalog.add("output", Path("output.txt"))
 ```

 We also have to create `input.txt` and add some content like `Hello, `.
@@ -44,22 +45,64 @@ Next, let us define the two tasks.

 ```python
 def task_save_text_with_pickle(
-    path: Annotated[Path, OurDataCatalog["input"]]
-) -> Annotated[str, OurDataCatalog["pickle_file"]]:
+    path: Annotated[Path, data_catalog["input"]]
+) -> Annotated[str, data_catalog["pickle_file"]]:
     text = path.read_text()
     return text


 def task_add_content_and_save_text(
-    text: Annotated[str, OurDataCatalog["pickle_file"]]
-) -> Annotated[str, OurDataCatalog["output"]]:
+    text: Annotated[str, data_catalog["pickle_file"]]
+) -> Annotated[str, data_catalog["output"]]:
     text += "World!"
     return text
 ```

 The important bit here is that we reference the intermediate pickle file with
-`OurDataCatalog["pickle_file"]`. Since the entry does not exist, the catalog creates a
+`data_catalog["pickle_file"]`. Since the entry does not exist, the catalog creates a
 {class}`~pytask.PickleNode` for this entry and saves the pickle file in the directory
 given to the {class}`~pytask.DataCatalog`.
+
+Now, we can execute the tasks.
+
+```{include} ../_static/md/using-a-data-catalog.md
+```
+
+## Developing with the `DataCatalog`

+After you have executed the workflow, you can import the data catalog in a Jupyter
+notebook or in a Python interpreter in the terminal. Call the {meth}`~pytask.PNode.load`
+method of a node to access its value.
+
```pycon
>>> from task_data_catalog import data_catalog
>>> data_catalog.entries
['new_content', 'file', 'output']
>>> data_catalog["new_content"].load()
'This is the text.World!'
+>>> data_catalog["output"].load() +WindowsPath('C:\Users\pytask-dev\git\my_project\output.txt') +``` + +`data_catalog["new_content"]` was stored with a {class}`~pytask.PickleNode` and returns +text whereas {class}`pathlib.Path`s become {class}`~pytask.PathNode`s and return their +path. + ## Changing the default node + +The data catalog uses the {class}`~pytask.PickleNode` by default to serialize any kind +of Python object. You can use any other node that follows the {protocol}`~pytask.PNode` +protocol and register it when creating the data catalog. + +For example, use the {class}`~pytask.PythonNode` as the default. + +```python +from pytask import PythonNode + + +data_catalog = DataCatalog(default_node=PythonNode) +``` + +```{seealso} +If you want to learn more about custom nodes, read {doc}`writing_custom_nodes`. +``` diff --git a/docs/source/reference_guides/api.md b/docs/source/reference_guides/api.md index f412c391..aa6bb789 100644 --- a/docs/source/reference_guides/api.md +++ b/docs/source/reference_guides/api.md @@ -263,6 +263,7 @@ Nodes are the interface for different kinds of dependencies or products. ```{eval-rst} .. autoclass:: pytask.PathNode +.. autoclass:: pytask.PickleNode .. autoclass:: pytask.PythonNode ``` diff --git a/tests/test_data_catalog.py b/tests/test_data_catalog.py index 41ef6408..f26efa63 100644 --- a/tests/test_data_catalog.py +++ b/tests/test_data_catalog.py @@ -40,12 +40,10 @@ def test_use_data_catalog_in_workflow(runner, tmp_path): from pytask import DataCatalog - _SRC = Path(__file__).parent.resolve() - # Generate input data _DataCatalog = DataCatalog() - _DataCatalog.add("file", _SRC / "file.txt") - _DataCatalog.add("output", _SRC / "output.txt") + _DataCatalog.add("file", Path("file.txt")) + _DataCatalog.add("output", Path("output.txt")) def task_add_content( From 99dd59a8c4bf5697a02d20096067643ffd29ee9c Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Tue, 17 Oct 2023 23:24:21 +0200 Subject: [PATCH 10/28] more test. 
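
The new end-to-end test drives a Python interpreter with pexpect to cover the
interactive use case. In essence, it automates this session:

```python
import pexpect

child = pexpect.spawn("python")
child.sendline("from task_example import data_catalog")
child.sendline("data_catalog['new_content'].load()")  # prints 'Hello, World!'
```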
--- tests/test_data_catalog.py | 92 +++++++++++++++++++++++++++++++++++--- tests/test_debugging.py | 1 + 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/tests/test_data_catalog.py b/tests/test_data_catalog.py index f26efa63..5fbd1a47 100644 --- a/tests/test_data_catalog.py +++ b/tests/test_data_catalog.py @@ -1,8 +1,10 @@ from __future__ import annotations +import sys import textwrap from pathlib import Path +import pytest from pytask import cli from pytask import DataCatalog from pytask import ExitCode @@ -11,11 +13,21 @@ from pytask import PythonNode +try: + import pexpect +except ModuleNotFoundError: # pragma: no cover + IS_PEXPECT_INSTALLED = False +else: + IS_PEXPECT_INSTALLED = True + + +@pytest.mark.unit() def test_data_catalog_knows_path_where_it_is_defined(): data_catalog = DataCatalog() assert Path(__file__).parent == data_catalog._instance_path +@pytest.mark.unit() def test_data_catalog_collects_nodes(): data_catalog = DataCatalog() @@ -26,12 +38,14 @@ def test_data_catalog_collects_nodes(): assert isinstance(data_catalog["node"], PathNode) +@pytest.mark.unit() def test_change_default_node(): data_catalog = DataCatalog(default_node=PythonNode) default_node = data_catalog["new_default_node"] assert isinstance(default_node, PythonNode) +@pytest.mark.end_to_end() def test_use_data_catalog_in_workflow(runner, tmp_path): source = """ from pathlib import Path @@ -41,22 +55,22 @@ def test_use_data_catalog_in_workflow(runner, tmp_path): # Generate input data - _DataCatalog = DataCatalog() - _DataCatalog.add("file", Path("file.txt")) - _DataCatalog.add("output", Path("output.txt")) + data_catalog = DataCatalog() + data_catalog.add("file", Path("file.txt")) + data_catalog.add("output", Path("output.txt")) def task_add_content( - path: Annotated[Path, _DataCatalog["file"]] - ) -> Annotated[str, _DataCatalog["new_content"]]: + path: Annotated[Path, data_catalog["file"]] + ) -> Annotated[str, data_catalog["new_content"]]: text = path.read_text() text += "World!" return text def task_save_text( - text: Annotated[str, _DataCatalog["new_content"]] - ) -> Annotated[str, _DataCatalog["output"]]: + text: Annotated[str, data_catalog["new_content"]] + ) -> Annotated[str, data_catalog["output"]]: return text """ tmp_path.joinpath("task_example.py").write_text(textwrap.dedent(source)) @@ -65,3 +79,67 @@ def task_save_text( result = runner.invoke(cli, [tmp_path.as_posix()]) assert result.exit_code == ExitCode.OK assert tmp_path.joinpath("output.txt").read_text() == "Hello, World!" + + +@pytest.mark.end_to_end() +def test_use_data_catalog_w_config(runner, tmp_path): + source = """ + from pathlib import Path + from typing_extensions import Annotated + from pytask import DataCatalog + + data_catalog = DataCatalog() + + def task_add_content() -> Annotated[str, data_catalog["new_content"]]: + return "Hello, World!" 
+ """ + tmp_path.joinpath("src", "tasks").mkdir(parents=True) + tmp_path.joinpath("src", "tasks", "task_example.py").write_text( + textwrap.dedent(source) + ) + tmp_path.joinpath("pyproject.toml").write_text("[tool.pytask.ini_options]") + + result = runner.invoke(cli, [tmp_path.as_posix()]) + assert result.exit_code == ExitCode.OK + assert ( + len(list(tmp_path.joinpath(".pytask", "data_catalogs", "default").iterdir())) + == 2 + ) + + +def _flush(child): + if child.isalive(): + child.read() + child.wait() + assert not child.isalive() + + +@pytest.mark.end_to_end() +@pytest.mark.skipif(not IS_PEXPECT_INSTALLED, reason="pexpect is not installed.") +@pytest.mark.skipif(sys.platform == "win32", reason="pexpect cannot spawn on Windows.") +def test_use_data_catalog_in_terminal(runner, tmp_path): + source = """ + from pathlib import Path + from typing_extensions import Annotated + from pytask import DataCatalog + + data_catalog = DataCatalog() + + def task_add_content() -> Annotated[str, data_catalog["new_content"]]: + return "Hello, World!" + """ + tmp_path.joinpath("task_example.py").write_text(textwrap.dedent(source)) + + result = runner.invoke(cli, [tmp_path.as_posix()]) + assert result.exit_code == ExitCode.OK + + child = pexpect.spawn("python") + child.sendline(f"import sys; sys.path.insert(0, {tmp_path.as_posix()!r});") + child.sendline("from task_example import data_catalog;") + child.sendline("data_catalog.entries;") + child.sendline("data_catalog['new_content'].load()") + child.sendline("exit()") + rest = child.read().decode("utf-8") + assert "new_content" in rest + assert "Hello, World!" in rest + _flush(child) diff --git a/tests/test_debugging.py b/tests/test_debugging.py index aece1cd7..f81bad79 100644 --- a/tests/test_debugging.py +++ b/tests/test_debugging.py @@ -12,6 +12,7 @@ from pytask import cli from pytask import ExitCode + try: import pexpect except ModuleNotFoundError: # pragma: no cover From 3e14cf895ea928013bfe0de888ba694a0edf73b9 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Fri, 20 Oct 2023 21:01:47 +0200 Subject: [PATCH 11/28] fix. --- docs/source/how_to_guides/using_a_data_catalog.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md index d19679b2..1b9da812 100644 --- a/docs/source/how_to_guides/using_a_data_catalog.md +++ b/docs/source/how_to_guides/using_a_data_catalog.md @@ -103,6 +103,4 @@ from pytask import PythonNode data_catalog = DataCatalog(default_node=PythonNode) ``` -```{seealso} -If you want to learn more about custom nodes, read {doc}`writing_custom_nodes`. -``` +Or, learn to write your own node by reading {doc}`writing_custom_nodes`. From d3ccb4a894884392bde29b18b4e184d609c35daa Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Wed, 25 Oct 2023 22:19:59 +0200 Subject: [PATCH 12/28] extend guide. 
--- .pre-commit-config.yaml | 1 + .../how_to_guides/using_a_data_catalog.md | 51 +++++++++++++++++++ src/_pytask/data_catalog.py | 40 ++++++++------- tests/test_data_catalog.py | 44 ++++++++++++++++ 4 files changed, 118 insertions(+), 18 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 91362ac1..184dbba4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -106,6 +106,7 @@ repos: docs/source/how_to_guides/using_task_returns.md| docs/source/how_to_guides/writing_custom_nodes.md| docs/source/how_to_guides/hashing_inputs_of_tasks.md| + docs/source/how_to_guides/using_a_data_catalog.md| docs/source/reference_guides/hookspecs.md| docs/source/tutorials/configuration.md| docs/source/tutorials/debugging.md| diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md index 1b9da812..1fcb4eb2 100644 --- a/docs/source/how_to_guides/using_a_data_catalog.md +++ b/docs/source/how_to_guides/using_a_data_catalog.md @@ -88,6 +88,13 @@ WindowsPath('C:\Users\pytask-dev\git\my_project\output.txt') text whereas {class}`pathlib.Path`s become {class}`~pytask.PathNode`s and return their path. +:::{info} +Whether the module `task_data_catalog.py` is importable depends on whether it is on your +`PYTHONPATH`, a variable that defines where modules can be found. It is easy, if you +develop your workflow as a Python package as explained in the tutorials. Then, you can +import the data catalog with, for example, `from myproject.config import data_catalog`. +::: + ## Changing the default node The data catalog uses the {class}`~pytask.PickleNode` by default to serialize any kind @@ -104,3 +111,47 @@ data_catalog = DataCatalog(default_node=PythonNode) ``` Or, learn to write your own node by reading {doc}`writing_custom_nodes`. + +## Changing the name and the default path + +By default, the data catalogs store their data in a directory `.pytask/data_catalogs`. +If you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the +`.pytask` folder is in the same folder as the configuration file. + +The default name for a catalog is `"default"` and so you will find its data in +`.pytask/data_catalogs/default`. If you assign a different name like +`"data_management"`, you will find the data in `.pytask/data_catalogs/data_management`. + +```python +data_catalog = DataCatalog(name="data_management") +``` + +You can also change the path where the data catalogs will be stored by changing the +`path` attribute. Here, we store the data catalog's data next to the module where the +data catalog is defined in `.data`. + +```python +from pathlib import Path + + +data_catalog = DataCatalog(path=Path(__file__).parent / ".data") +``` + +## Multiple data catalogs + +You can use multiple data catalogs when you want to separate your datasets across +multiple catalogs or when you want to use the same names multiple times (although it is +not recommended!). + +Make sure you assign different names to the data catalogs so that their data is stored +in different directories. + +```python +# Stored in .pytask/data_catalog/a +data_catalog = DataCatalog(name="a") + +# Stored in .pytask/data_catalog/b +data_catalog = DataCatalog(name="b") +``` + +Or, use different paths as explained above. 
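+For example, a minimal sketch where each catalog stores its data in its own
+directory (paths are illustrative):
+
+```python
+from pathlib import Path
+
+data_catalog_a = DataCatalog(path=Path(__file__).parent / ".data" / "a")
+data_catalog_b = DataCatalog(path=Path(__file__).parent / ".data" / "b")
+```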
diff --git a/src/_pytask/data_catalog.py b/src/_pytask/data_catalog.py index 62fcc9e2..e2ece949 100644 --- a/src/_pytask/data_catalog.py +++ b/src/_pytask/data_catalog.py @@ -19,14 +19,14 @@ from _pytask.pluginmanager import get_plugin_manager from _pytask.session import Session from attrs import define -from attrs import Factory +from attrs import field __all__ = ["DataCatalog"] -def _find_directory(path: Path) -> Path: - """Find directory where data catalog can store its data.""" +def _find_root_path(path: Path) -> Path: + """Find path where data catalog can store its data.""" root_path, _ = find_project_root_and_config([path]) return root_path.joinpath(".pytask", "data_catalogs") @@ -54,36 +54,40 @@ class DataCatalog: Parameters ---------- default_node - A default node for loading and saving values. - directory - A directory where automatically created files are stored. + A default node for loading and saving values. By default, + :class:`~pytask.PickleNode` is used to serialize any Python object with the + :mod:`pickle` module. entries A collection of entries in the catalog. Entries can be :class:`~pytask.PNode` or a :class:`DataCatalog` itself for nesting catalogs. name The name of the data catalog. Use it when you are working with multiple data catalogs that store data under the same keys. + path + A path where automatically created files are stored. By default, it will be + ``.pytask/data_catalogs/default``. """ default_node: type[PNode] = PickleNode - directory: Path | None = None - entries: dict[str, DataCatalog | PNode] = Factory(dict) + entries: dict[str, PNode] = field(factory=dict) name: str = "default" - _session: Session = Factory(_create_default_session) - _instance_path: Path = Factory(_get_parent_path_of_data_catalog_module) + path: Path | None = None + _session: Session = field(factory=_create_default_session) + _instance_path: Path = field(factory=_get_parent_path_of_data_catalog_module) def __attrs_post_init__(self) -> None: - if not self.directory: - root = _find_directory(self._instance_path) - self.directory = root / self.name - self.directory.mkdir(parents=True, exist_ok=True) + if not self.path: + root = _find_root_path(self._instance_path) + self.path = root / self.name + + self.path.mkdir(parents=True, exist_ok=True) self._initialize() def _initialize(self) -> None: """Initialize the data catalog with persisted nodes from previous runs.""" - for path in self.directory.glob("*-node.pkl"): # type: ignore[union-attr] + for path in self.path.glob("*-node.pkl"): # type: ignore[union-attr] node = pickle.loads(path.read_bytes()) # noqa: S301 self.entries[node.name] = node @@ -95,7 +99,7 @@ def __getitem__(self, name: str) -> DataCatalog | PNode: def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: """Add an entry to the data catalog.""" - assert isinstance(self.directory, Path) + assert isinstance(self.path, Path) if not isinstance(name, str): msg = "The name of a catalog entry must be a string." 
@@ -105,11 +109,11 @@ def add(self, name: str, node: DataCatalog | PNode | None = None) -> None: filename = str(hashlib.sha256(name.encode()).hexdigest()) if isinstance(self.default_node, PPathNode): self.entries[name] = self.default_node( - name=name, path=self.directory / f"{filename}.pkl" + name=name, path=self.path / f"{filename}.pkl" ) else: self.entries[name] = self.default_node(name=name) # type: ignore[call-arg] - self.directory.joinpath(f"{filename}-node.pkl").write_bytes( + self.path.joinpath(f"{filename}-node.pkl").write_bytes( pickle.dumps(self.entries[name]) ) elif isinstance(node, PNode): diff --git a/tests/test_data_catalog.py b/tests/test_data_catalog.py index 5fbd1a47..3e5dbfd6 100644 --- a/tests/test_data_catalog.py +++ b/tests/test_data_catalog.py @@ -79,6 +79,10 @@ def task_save_text( result = runner.invoke(cli, [tmp_path.as_posix()]) assert result.exit_code == ExitCode.OK assert tmp_path.joinpath("output.txt").read_text() == "Hello, World!" + assert ( + len(list(tmp_path.joinpath(".pytask", "data_catalogs", "default").iterdir())) + == 2 + ) @pytest.mark.end_to_end() @@ -143,3 +147,43 @@ def task_add_content() -> Annotated[str, data_catalog["new_content"]]: assert "new_content" in rest assert "Hello, World!" in rest _flush(child) + + +@pytest.mark.end_to_end() +def test_use_data_catalog_with_different_name(runner, tmp_path): + source = """ + from pathlib import Path + from typing_extensions import Annotated + from pytask import DataCatalog + + data_catalog = DataCatalog(name="blob") + + def task_add_content() -> Annotated[str, data_catalog["new_content"]]: + return "Hello, World!" + """ + tmp_path.joinpath("task_example.py").write_text(textwrap.dedent(source)) + + result = runner.invoke(cli, [tmp_path.as_posix()]) + assert result.exit_code == ExitCode.OK + assert ( + len(list(tmp_path.joinpath(".pytask", "data_catalogs", "blob").iterdir())) == 2 + ) + + +@pytest.mark.end_to_end() +def test_use_data_catalog_with_different_path(runner, tmp_path): + source = """ + from pathlib import Path + from typing_extensions import Annotated + from pytask import DataCatalog + + data_catalog = DataCatalog(name="blob", path=Path(__file__).parent / ".data") + + def task_add_content() -> Annotated[str, data_catalog["new_content"]]: + return "Hello, World!" + """ + tmp_path.joinpath("task_example.py").write_text(textwrap.dedent(source)) + + result = runner.invoke(cli, [tmp_path.as_posix()]) + assert result.exit_code == ExitCode.OK + assert len(list(tmp_path.joinpath(".data").iterdir())) == 2 From e018a2c7bbe8fa12edc7d7beb0a8cc4bb951b84d Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Wed, 25 Oct 2023 22:35:46 +0200 Subject: [PATCH 13/28] fix. --- docs/source/how_to_guides/using_a_data_catalog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md index 1fcb4eb2..65a68a31 100644 --- a/docs/source/how_to_guides/using_a_data_catalog.md +++ b/docs/source/how_to_guides/using_a_data_catalog.md @@ -88,7 +88,7 @@ WindowsPath('C:\Users\pytask-dev\git\my_project\output.txt') text whereas {class}`pathlib.Path`s become {class}`~pytask.PathNode`s and return their path. -:::{info} +:::{note} Whether the module `task_data_catalog.py` is importable depends on whether it is on your `PYTHONPATH`, a variable that defines where modules can be found. It is easy, if you develop your workflow as a Python package as explained in the tutorials. 
Then, you can From 552c8e82954b92e7d601e4918e8e47d643738b12 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Thu, 26 Oct 2023 00:18:52 +0200 Subject: [PATCH 14/28] Fix version. --- docs/source/_static/md/using-a-data-catalog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/_static/md/using-a-data-catalog.md b/docs/source/_static/md/using-a-data-catalog.md index e9b1df83..837ee432 100644 --- a/docs/source/_static/md/using-a-data-catalog.md +++ b/docs/source/_static/md/using-a-data-catalog.md @@ -4,7 +4,7 @@ $ pytask ──────────────────────────── Start pytask session ──────────────────────────── -Platform: win32 -- Python 3.10.0, pytask 0.3.0, pluggy 1.0.0 +Platform: win32 -- Python 3.10.0, pytask 0.4.2, pluggy 1.0.0 Root: C:\Users\pytask-dev\git\my_project Collected 1 task. From 9208e3fc31d6ce797a9a168759d33904903fc102 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Fri, 27 Oct 2023 19:48:01 +0200 Subject: [PATCH 15/28] add more tests. --- pyproject.toml | 1 + src/_pytask/_hashlib.py | 209 ++++++++++++++++++++++++++++++++++++++++ src/_pytask/nodes.py | 11 +++ src/_pytask/path.py | 9 ++ tests/test_nodes.py | 55 ++++++++++- 5 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 src/_pytask/_hashlib.py diff --git a/pyproject.toml b/pyproject.toml index ae7c0887..d0fc7df0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -125,6 +125,7 @@ ignore = [ [tool.ruff.per-file-ignores] +"src/_pytask/_hashlib.py" = ["ALL"] "src/_pytask/capture.py" = ["PGH003"] "src/_pytask/hookspecs.py" = ["ARG001"] "src/_pytask/outcomes.py" = ["N818"] diff --git a/src/_pytask/_hashlib.py b/src/_pytask/_hashlib.py new file mode 100644 index 00000000..6e2f43f2 --- /dev/null +++ b/src/_pytask/_hashlib.py @@ -0,0 +1,209 @@ +from __future__ import annotations + +import sys + + +if sys.version_info >= (3, 11): + from hashlib import file_digest +else: + # This tuple and __get_builtin_constructor() must be modified if a new + # always available algorithm is added. + __always_supported = ( + "md5", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "blake2b", + "blake2s", + "sha3_224", + "sha3_256", + "sha3_384", + "sha3_512", + "shake_128", + "shake_256", + ) + + algorithms_guaranteed = set(__always_supported) + algorithms_available = set(__always_supported) + + __all__ = __always_supported + ( + "new", + "algorithms_guaranteed", + "algorithms_available", + "file_digest", + ) + + __builtin_constructor_cache = {} + + # Prefer our blake2 implementation + # OpenSSL 1.1.0 comes with a limited implementation of blake2b/s. The OpenSSL + # implementations neither support keyed blake2 (blake2 MAC) nor advanced + # features like salt, personalization, or tree hashing. OpenSSL hash-only + # variants are available as 'blake2b512' and 'blake2s256', though. 
+ __block_openssl_constructor = { + "blake2b", + "blake2s", + } + + def __get_builtin_constructor(name): + cache = __builtin_constructor_cache + constructor = cache.get(name) + if constructor is not None: + return constructor + + with suppress(ImportError): + if name in {"SHA1", "sha1"}: + import _sha1 + + cache["SHA1"] = cache["sha1"] = _sha1.sha1 + elif name in {"MD5", "md5"}: + import _md5 + + cache["MD5"] = cache["md5"] = _md5.md5 + elif name in {"SHA256", "sha256", "SHA224", "sha224"}: + try: + import _sha2 + except ImportError: + import _sha256 as _sha2 + cache["SHA224"] = cache["sha224"] = _sha2.sha224 + cache["SHA256"] = cache["sha256"] = _sha2.sha256 + elif name in {"SHA512", "sha512", "SHA384", "sha384"}: + try: + import _sha2 + except ImportError: + import _sha256 as _sha2 + cache["SHA384"] = cache["sha384"] = _sha2.sha384 + cache["SHA512"] = cache["sha512"] = _sha2.sha512 + elif name in {"blake2b", "blake2s"}: + import _blake2 + + cache["blake2b"] = _blake2.blake2b + cache["blake2s"] = _blake2.blake2s + elif name in {"sha3_224", "sha3_256", "sha3_384", "sha3_512"}: + import _sha3 + + cache["sha3_224"] = _sha3.sha3_224 + cache["sha3_256"] = _sha3.sha3_256 + cache["sha3_384"] = _sha3.sha3_384 + cache["sha3_512"] = _sha3.sha3_512 + elif name in {"shake_128", "shake_256"}: + import _sha3 + + cache["shake_128"] = _sha3.shake_128 + cache["shake_256"] = _sha3.shake_256 + + constructor = cache.get(name) + if constructor is not None: + return constructor + + raise ValueError("unsupported hash type " + name) + + def __get_openssl_constructor(name): + if name in __block_openssl_constructor: + # Prefer our builtin blake2 implementation. + return __get_builtin_constructor(name) + try: + # MD5, SHA1, and SHA2 are in all supported OpenSSL versions + # SHA3/shake are available in OpenSSL 1.1.1+ + f = getattr(_hashlib, "openssl_" + name) + # Allow the C module to raise ValueError. The function will be + # defined but the hash not actually available. Don't fall back to + # builtin if the current security policy blocks a digest, bpo#40695. + f(usedforsecurity=False) + # Use the C function directly (very fast) + return f + except (AttributeError, ValueError): + return __get_builtin_constructor(name) + + def __py_new(name, data=b"", **kwargs): + """new(name, data=b'', **kwargs) - Return a new hashing object using the + named algorithm; optionally initialized with data (which must be + a bytes-like object). + """ + return __get_builtin_constructor(name)(data, **kwargs) + + def __hash_new(name, data=b"", **kwargs): + """new(name, data=b'') - Return a new hashing object using the named algorithm; + optionally initialized with data (which must be a bytes-like object). + """ + if name in __block_openssl_constructor: + # Prefer our builtin blake2 implementation. + return __get_builtin_constructor(name)(data, **kwargs) + try: + return _hashlib.new(name, data, **kwargs) + except ValueError: + # If the _hashlib module (OpenSSL) doesn't support the named + # hash, try using our builtin implementations. + # This allows for SHA224/256 and SHA384/512 support even though + # the OpenSSL library prior to 0.9.8 doesn't provide them. 
+ return __get_builtin_constructor(name)(data) + + try: + import _hashlib + + new = __hash_new + __get_hash = __get_openssl_constructor + algorithms_available = algorithms_available.union( + _hashlib.openssl_md_meth_names + ) + except ImportError: + _hashlib = None + new = __py_new + __get_hash = __get_builtin_constructor + + with suppress(ImportError): + # OpenSSL's PKCS5_PBKDF2_HMAC requires OpenSSL 1.0+ with HMAC and SHA + from _hashlib import pbkdf2_hmac + + __all__ += ("pbkdf2_hmac",) + + with suppress(ImportError): + # OpenSSL's scrypt requires OpenSSL 1.1+ + from _hashlib import scrypt + + def file_digest(fileobj, digest, /, *, _bufsize=2**18): + """Hash the contents of a file-like object. Returns a digest object. + + *fileobj* must be a file-like object opened for reading in binary mode. + It accepts file objects from open(), io.BytesIO(), and SocketIO objects. + The function may bypass Python's I/O and use the file descriptor *fileno* + directly. + + *digest* must either be a hash algorithm name as a *str*, a hash + constructor, or a callable that returns a hash object. + """ + # On Linux we could use AF_ALG sockets and sendfile() to archive zero-copy + # hashing with hardware acceleration. + if isinstance(digest, str): + digestobj = new(digest) + else: + digestobj = digest() + + if hasattr(fileobj, "getbuffer"): + # io.BytesIO object, use zero-copy buffer + digestobj.update(fileobj.getbuffer()) + return digestobj + + # Only binary files implement readinto(). + if not ( + hasattr(fileobj, "readinto") + and hasattr(fileobj, "readable") + and fileobj.readable() + ): + raise ValueError( + f"'{fileobj!r}' is not a file-like object in binary reading mode." + ) + + # binary file, socket.SocketIO object + # Note: socket I/O uses different syscalls than file I/O. + buf = bytearray(_bufsize) # Reusable buffer to reduce allocations. + view = memoryview(buf) + while True: + size = fileobj.readinto(buf) + if size == 0: + break # EOF + digestobj.update(view[:size]) + + return digestobj diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index d93615cc..2c3a805a 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -14,6 +14,7 @@ from _pytask.node_protocols import PPathNode from _pytask.node_protocols import PTask from _pytask.node_protocols import PTaskWithPath +from _pytask.path import hash_path from _pytask.typing import no_default from _pytask.typing import NoDefault from attrs import define @@ -145,11 +146,14 @@ class PathNode(PPathNode): Name of the node which makes it identifiable in the DAG. path The path to the file. + hash + Whether the file should be hashed to determine the state. """ name: str path: Path + hash: bool = False # noqa: A003 @classmethod @functools.lru_cache @@ -170,6 +174,8 @@ def state(self) -> str | None: The state is given by the modification timestamp. """ + if self.hash and self.path.exists(): + return str(hash_path(self.path, "sha256")) if self.path.exists(): return str(self.path.stat().st_mtime) return None @@ -275,6 +281,8 @@ class PickleNode: A function to convert :obj:`bytes` from a pickle file to a Python object. dump_func A function to convert a Python object to :obj:`bytes`. + hash + Whether the file should be hashed to determine the state. 
""" @@ -282,6 +290,7 @@ class PickleNode: path: Path load_func: Callable[[bytes], Any] = pickle.loads dump_func: Callable[[Any], bytes] = pickle.dumps + hash: bool = False # noqa: A003 @classmethod @functools.lru_cache @@ -297,6 +306,8 @@ def from_path(cls, path: Path) -> PickleNode: return cls(name=path.as_posix(), path=path) def state(self) -> str | None: + if self.hash and self.path.exists(): + return str(hash_path(self.path, "sha256")) if self.path.exists(): return str(self.path.stat().st_mtime) return None diff --git a/src/_pytask/path.py b/src/_pytask/path.py index 9399f830..34b8524a 100644 --- a/src/_pytask/path.py +++ b/src/_pytask/path.py @@ -10,6 +10,8 @@ from types import ModuleType from typing import Sequence +from _pytask._hashlib import file_digest + __all__ = [ "find_case_sensitive_path", @@ -206,3 +208,10 @@ def shorten_path(path: Path, paths: Sequence[Path]) -> str: ancestor = path.parents[-1] return relative_to(path, ancestor).as_posix() + + +def hash_path(path: Path, digest: str) -> str: + """Compute the hash of a file.""" + with path.open("rb") as f: + hash_ = file_digest(f, digest) + return hash_.hexdigest() diff --git a/tests/test_nodes.py b/tests/test_nodes.py index 8642a23d..0b4b3e09 100644 --- a/tests/test_nodes.py +++ b/tests/test_nodes.py @@ -1,7 +1,11 @@ from __future__ import annotations +import pickle + import pytest -from _pytask.nodes import PythonNode +from pytask import PathNode +from pytask import PickleNode +from pytask import PythonNode @pytest.mark.unit() @@ -19,3 +23,52 @@ def test_hash_of_python_node(value, hash_, expected): node = PythonNode(name="test", value=value, hash=hash_) state = node.state() assert state == expected + + +@pytest.mark.unit() +@pytest.mark.parametrize( + ("value", "hash_", "expected"), + [ + ("0", False, "0"), + ( + "0", + lambda x: 1, # noqa: ARG005 + "5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9", + ), + ("0", True, "5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9"), + ], +) +def test_hash_of_path_node(tmp_path, value, hash_, expected): + path = tmp_path.joinpath("text.txt") + path.write_text(value) + node = PathNode(name="test", path=path, hash=hash_) + state = node.state() + if hash_: + assert state == expected + else: + assert isinstance(state, str) + + +@pytest.mark.unit() +@pytest.mark.parametrize( + ("value", "hash_", "expected"), + [ + ("0", False, "0"), + ("0", True, "ac57d54dcc34f2ebf6f410f6d7fab436eb84f8f6b640782134b3d8062ebf71d0"), + ( + "0", + lambda x: 1, # noqa: ARG005 + "ac57d54dcc34f2ebf6f410f6d7fab436eb84f8f6b640782134b3d8062ebf71d0", + ), + ("0", True, "2e81f502b7a28f824c4f1451c946b952eebe65a8521925ef8f6135ef6f422e8e"), + ], +) +def test_hash_of_pickle_node(tmp_path, value, hash_, expected): + path = tmp_path.joinpath("text.pkl") + path.write_bytes(pickle.dumps(value)) + node = PickleNode(name="test", path=path, hash=hash_) + state = node.state() + if hash_: + assert state == expected + else: + assert isinstance(state, str) From 391fc23f3a41cdd4f1a8c3664b675868df5cb417 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Fri, 27 Oct 2023 19:57:24 +0200 Subject: [PATCH 16/28] fix. 
--- src/_pytask/_hashlib.py | 1 + tests/test_nodes.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/_pytask/_hashlib.py b/src/_pytask/_hashlib.py index 6e2f43f2..072e40bc 100644 --- a/src/_pytask/_hashlib.py +++ b/src/_pytask/_hashlib.py @@ -1,6 +1,7 @@ from __future__ import annotations import sys +from contextlib import suppress if sys.version_info >= (3, 11): diff --git a/tests/test_nodes.py b/tests/test_nodes.py index 0b4b3e09..f0933dd3 100644 --- a/tests/test_nodes.py +++ b/tests/test_nodes.py @@ -54,11 +54,11 @@ def test_hash_of_path_node(tmp_path, value, hash_, expected): ("value", "hash_", "expected"), [ ("0", False, "0"), - ("0", True, "ac57d54dcc34f2ebf6f410f6d7fab436eb84f8f6b640782134b3d8062ebf71d0"), + (0, True, "ac57d54dcc34f2ebf6f410f6d7fab436eb84f8f6b640782134b3d8062ebf71d0"), ( "0", lambda x: 1, # noqa: ARG005 - "ac57d54dcc34f2ebf6f410f6d7fab436eb84f8f6b640782134b3d8062ebf71d0", + "2e81f502b7a28f824c4f1451c946b952eebe65a8521925ef8f6135ef6f422e8e", ), ("0", True, "2e81f502b7a28f824c4f1451c946b952eebe65a8521925ef8f6135ef6f422e8e"), ], From adad36ae20b3770445a92e8026d9218e948343ce Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 28 Oct 2023 00:42:34 +0200 Subject: [PATCH 17/28] Revert some changes. --- src/_pytask/nodes.py | 10 --------- tests/test_nodes.py | 53 -------------------------------------------- 2 files changed, 63 deletions(-) diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index 2c3a805a..d1619c48 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -14,7 +14,6 @@ from _pytask.node_protocols import PPathNode from _pytask.node_protocols import PTask from _pytask.node_protocols import PTaskWithPath -from _pytask.path import hash_path from _pytask.typing import no_default from _pytask.typing import NoDefault from attrs import define @@ -146,14 +145,11 @@ class PathNode(PPathNode): Name of the node which makes it identifiable in the DAG. path The path to the file. - hash - Whether the file should be hashed to determine the state. """ name: str path: Path - hash: bool = False # noqa: A003 @classmethod @functools.lru_cache @@ -174,8 +170,6 @@ def state(self) -> str | None: The state is given by the modification timestamp. """ - if self.hash and self.path.exists(): - return str(hash_path(self.path, "sha256")) if self.path.exists(): return str(self.path.stat().st_mtime) return None @@ -281,8 +275,6 @@ class PickleNode: A function to convert :obj:`bytes` from a pickle file to a Python object. dump_func A function to convert a Python object to :obj:`bytes`. - hash - Whether the file should be hashed to determine the state. 
""" @@ -306,8 +298,6 @@ def from_path(cls, path: Path) -> PickleNode: return cls(name=path.as_posix(), path=path) def state(self) -> str | None: - if self.hash and self.path.exists(): - return str(hash_path(self.path, "sha256")) if self.path.exists(): return str(self.path.stat().st_mtime) return None diff --git a/tests/test_nodes.py b/tests/test_nodes.py index f0933dd3..5a70e7f4 100644 --- a/tests/test_nodes.py +++ b/tests/test_nodes.py @@ -1,10 +1,6 @@ from __future__ import annotations -import pickle - import pytest -from pytask import PathNode -from pytask import PickleNode from pytask import PythonNode @@ -23,52 +19,3 @@ def test_hash_of_python_node(value, hash_, expected): node = PythonNode(name="test", value=value, hash=hash_) state = node.state() assert state == expected - - -@pytest.mark.unit() -@pytest.mark.parametrize( - ("value", "hash_", "expected"), - [ - ("0", False, "0"), - ( - "0", - lambda x: 1, # noqa: ARG005 - "5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9", - ), - ("0", True, "5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9"), - ], -) -def test_hash_of_path_node(tmp_path, value, hash_, expected): - path = tmp_path.joinpath("text.txt") - path.write_text(value) - node = PathNode(name="test", path=path, hash=hash_) - state = node.state() - if hash_: - assert state == expected - else: - assert isinstance(state, str) - - -@pytest.mark.unit() -@pytest.mark.parametrize( - ("value", "hash_", "expected"), - [ - ("0", False, "0"), - (0, True, "ac57d54dcc34f2ebf6f410f6d7fab436eb84f8f6b640782134b3d8062ebf71d0"), - ( - "0", - lambda x: 1, # noqa: ARG005 - "2e81f502b7a28f824c4f1451c946b952eebe65a8521925ef8f6135ef6f422e8e", - ), - ("0", True, "2e81f502b7a28f824c4f1451c946b952eebe65a8521925ef8f6135ef6f422e8e"), - ], -) -def test_hash_of_pickle_node(tmp_path, value, hash_, expected): - path = tmp_path.joinpath("text.pkl") - path.write_bytes(pickle.dumps(value)) - node = PickleNode(name="test", path=path, hash=hash_) - state = node.state() - if hash_: - assert state == expected - else: - assert isinstance(state, str) From a7f8c78e1d0ff091e8781ca1400f390d58e35e75 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 28 Oct 2023 01:56:34 +0200 Subject: [PATCH 18/28] Fix. --- src/_pytask/nodes.py | 1 - src/_pytask/path.py | 9 --------- 2 files changed, 10 deletions(-) diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index d1619c48..d93615cc 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -282,7 +282,6 @@ class PickleNode: path: Path load_func: Callable[[bytes], Any] = pickle.loads dump_func: Callable[[Any], bytes] = pickle.dumps - hash: bool = False # noqa: A003 @classmethod @functools.lru_cache diff --git a/src/_pytask/path.py b/src/_pytask/path.py index 34b8524a..9399f830 100644 --- a/src/_pytask/path.py +++ b/src/_pytask/path.py @@ -10,8 +10,6 @@ from types import ModuleType from typing import Sequence -from _pytask._hashlib import file_digest - __all__ = [ "find_case_sensitive_path", @@ -208,10 +206,3 @@ def shorten_path(path: Path, paths: Sequence[Path]) -> str: ancestor = path.parents[-1] return relative_to(path, ancestor).as_posix() - - -def hash_path(path: Path, digest: str) -> str: - """Compute the hash of a file.""" - with path.open("rb") as f: - hash_ = file_digest(f, digest) - return hash_.hexdigest() From f5028893fc0e1323f5a99b95ee53a2af4526ce3c Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 28 Oct 2023 01:57:36 +0200 Subject: [PATCH 19/28] Remove unnecessary files. 
--- src/_pytask/_hashlib.py | 210 ---------------------------------------- 1 file changed, 210 deletions(-) delete mode 100644 src/_pytask/_hashlib.py diff --git a/src/_pytask/_hashlib.py b/src/_pytask/_hashlib.py deleted file mode 100644 index 072e40bc..00000000 --- a/src/_pytask/_hashlib.py +++ /dev/null @@ -1,210 +0,0 @@ -from __future__ import annotations - -import sys -from contextlib import suppress - - -if sys.version_info >= (3, 11): - from hashlib import file_digest -else: - # This tuple and __get_builtin_constructor() must be modified if a new - # always available algorithm is added. - __always_supported = ( - "md5", - "sha1", - "sha224", - "sha256", - "sha384", - "sha512", - "blake2b", - "blake2s", - "sha3_224", - "sha3_256", - "sha3_384", - "sha3_512", - "shake_128", - "shake_256", - ) - - algorithms_guaranteed = set(__always_supported) - algorithms_available = set(__always_supported) - - __all__ = __always_supported + ( - "new", - "algorithms_guaranteed", - "algorithms_available", - "file_digest", - ) - - __builtin_constructor_cache = {} - - # Prefer our blake2 implementation - # OpenSSL 1.1.0 comes with a limited implementation of blake2b/s. The OpenSSL - # implementations neither support keyed blake2 (blake2 MAC) nor advanced - # features like salt, personalization, or tree hashing. OpenSSL hash-only - # variants are available as 'blake2b512' and 'blake2s256', though. - __block_openssl_constructor = { - "blake2b", - "blake2s", - } - - def __get_builtin_constructor(name): - cache = __builtin_constructor_cache - constructor = cache.get(name) - if constructor is not None: - return constructor - - with suppress(ImportError): - if name in {"SHA1", "sha1"}: - import _sha1 - - cache["SHA1"] = cache["sha1"] = _sha1.sha1 - elif name in {"MD5", "md5"}: - import _md5 - - cache["MD5"] = cache["md5"] = _md5.md5 - elif name in {"SHA256", "sha256", "SHA224", "sha224"}: - try: - import _sha2 - except ImportError: - import _sha256 as _sha2 - cache["SHA224"] = cache["sha224"] = _sha2.sha224 - cache["SHA256"] = cache["sha256"] = _sha2.sha256 - elif name in {"SHA512", "sha512", "SHA384", "sha384"}: - try: - import _sha2 - except ImportError: - import _sha256 as _sha2 - cache["SHA384"] = cache["sha384"] = _sha2.sha384 - cache["SHA512"] = cache["sha512"] = _sha2.sha512 - elif name in {"blake2b", "blake2s"}: - import _blake2 - - cache["blake2b"] = _blake2.blake2b - cache["blake2s"] = _blake2.blake2s - elif name in {"sha3_224", "sha3_256", "sha3_384", "sha3_512"}: - import _sha3 - - cache["sha3_224"] = _sha3.sha3_224 - cache["sha3_256"] = _sha3.sha3_256 - cache["sha3_384"] = _sha3.sha3_384 - cache["sha3_512"] = _sha3.sha3_512 - elif name in {"shake_128", "shake_256"}: - import _sha3 - - cache["shake_128"] = _sha3.shake_128 - cache["shake_256"] = _sha3.shake_256 - - constructor = cache.get(name) - if constructor is not None: - return constructor - - raise ValueError("unsupported hash type " + name) - - def __get_openssl_constructor(name): - if name in __block_openssl_constructor: - # Prefer our builtin blake2 implementation. - return __get_builtin_constructor(name) - try: - # MD5, SHA1, and SHA2 are in all supported OpenSSL versions - # SHA3/shake are available in OpenSSL 1.1.1+ - f = getattr(_hashlib, "openssl_" + name) - # Allow the C module to raise ValueError. The function will be - # defined but the hash not actually available. Don't fall back to - # builtin if the current security policy blocks a digest, bpo#40695. 
- f(usedforsecurity=False) - # Use the C function directly (very fast) - return f - except (AttributeError, ValueError): - return __get_builtin_constructor(name) - - def __py_new(name, data=b"", **kwargs): - """new(name, data=b'', **kwargs) - Return a new hashing object using the - named algorithm; optionally initialized with data (which must be - a bytes-like object). - """ - return __get_builtin_constructor(name)(data, **kwargs) - - def __hash_new(name, data=b"", **kwargs): - """new(name, data=b'') - Return a new hashing object using the named algorithm; - optionally initialized with data (which must be a bytes-like object). - """ - if name in __block_openssl_constructor: - # Prefer our builtin blake2 implementation. - return __get_builtin_constructor(name)(data, **kwargs) - try: - return _hashlib.new(name, data, **kwargs) - except ValueError: - # If the _hashlib module (OpenSSL) doesn't support the named - # hash, try using our builtin implementations. - # This allows for SHA224/256 and SHA384/512 support even though - # the OpenSSL library prior to 0.9.8 doesn't provide them. - return __get_builtin_constructor(name)(data) - - try: - import _hashlib - - new = __hash_new - __get_hash = __get_openssl_constructor - algorithms_available = algorithms_available.union( - _hashlib.openssl_md_meth_names - ) - except ImportError: - _hashlib = None - new = __py_new - __get_hash = __get_builtin_constructor - - with suppress(ImportError): - # OpenSSL's PKCS5_PBKDF2_HMAC requires OpenSSL 1.0+ with HMAC and SHA - from _hashlib import pbkdf2_hmac - - __all__ += ("pbkdf2_hmac",) - - with suppress(ImportError): - # OpenSSL's scrypt requires OpenSSL 1.1+ - from _hashlib import scrypt - - def file_digest(fileobj, digest, /, *, _bufsize=2**18): - """Hash the contents of a file-like object. Returns a digest object. - - *fileobj* must be a file-like object opened for reading in binary mode. - It accepts file objects from open(), io.BytesIO(), and SocketIO objects. - The function may bypass Python's I/O and use the file descriptor *fileno* - directly. - - *digest* must either be a hash algorithm name as a *str*, a hash - constructor, or a callable that returns a hash object. - """ - # On Linux we could use AF_ALG sockets and sendfile() to archive zero-copy - # hashing with hardware acceleration. - if isinstance(digest, str): - digestobj = new(digest) - else: - digestobj = digest() - - if hasattr(fileobj, "getbuffer"): - # io.BytesIO object, use zero-copy buffer - digestobj.update(fileobj.getbuffer()) - return digestobj - - # Only binary files implement readinto(). - if not ( - hasattr(fileobj, "readinto") - and hasattr(fileobj, "readable") - and fileobj.readable() - ): - raise ValueError( - f"'{fileobj!r}' is not a file-like object in binary reading mode." - ) - - # binary file, socket.SocketIO object - # Note: socket I/O uses different syscalls than file I/O. - buf = bytearray(_bufsize) # Reusable buffer to reduce allocations. - view = memoryview(buf) - while True: - size = fileobj.readinto(buf) - if size == 0: - break # EOF - digestobj.update(view[:size]) - - return digestobj From 3a5d01ff4f10ccd1d3e9292ebb2553b519227f8d Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 28 Oct 2023 09:55:19 +0200 Subject: [PATCH 20/28] Better description. 
--- .../how_to_guides/using_a_data_catalog.md | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md index 65a68a31..39c095da 100644 --- a/docs/source/how_to_guides/using_a_data_catalog.md +++ b/docs/source/how_to_guides/using_a_data_catalog.md @@ -18,14 +18,21 @@ actions. 1. Read the data from pickle, adding additional text and storing it as a text file under `output.txt`. -At first, we build the data catalog by registering the data that we provide or that we -later want to access. +At first, we create the file `input.txt` and add some content. + +```ipython +>>> from pathlib import Path +>>> Path("input.txt").write_text("Hello, ") +``` + +Secondly, we build the data catalog by registering the data that we provide or that we +later want to access. You can add nodes, objects that are converted to nodes like +`~pathlib.Path` or any other object. ```python from pathlib import Path from pytask import DataCatalog -from pytask import PathNode # Create the data catalog. @@ -37,10 +44,6 @@ data_catalog.add("input", Path("input.txt")) data_catalog.add("output", Path("output.txt")) ``` -We also have to create `input.txt` and add some content like `Hello, `. - -We do not register the intermediate pickle file, yet. - Next, let us define the two tasks. ```python @@ -60,8 +63,8 @@ def task_add_content_and_save_text( The important bit here is that we reference the intermediate pickle file with `data_catalog["pickle_file"]`. Since the entry does not exist, the catalog creates a -{class}`~pytask.PickleNode` for this entry and saves the pickle file in the directory -given to the {class}`~pytask.DataCatalog`. +{class}`~pytask.PickleNode` for this entry and saves the pickle file in a `.pytask` +directory. Now, we can execute the tasks. @@ -77,22 +80,22 @@ of a node to access its value. ```pycon >>> from task_data_catalog import data_catalog >>> data_catalog.entries -['new_content', 'file', 'output'] ->>> data_catalog["new_content"].load() -'This is the text.World!' +['pickle_file', 'input', 'output'] +>>> data_catalog["pickle_file"].load() +'Hello, World!' >>> data_catalog["output"].load() WindowsPath('C:\Users\pytask-dev\git\my_project\output.txt') ``` -`data_catalog["new_content"]` was stored with a {class}`~pytask.PickleNode` and returns +`data_catalog["pickle_file"]` was stored with a {class}`~pytask.PickleNode` and returns text whereas {class}`pathlib.Path`s become {class}`~pytask.PathNode`s and return their path. :::{note} Whether the module `task_data_catalog.py` is importable depends on whether it is on your -`PYTHONPATH`, a variable that defines where modules can be found. It is easy, if you -develop your workflow as a Python package as explained in the tutorials. Then, you can -import the data catalog with, for example, `from myproject.config import data_catalog`. +`PYTHONPATH`, a variable that defines where modules can be found. If you develop your +workflow as a Python package as explained in the tutorials, then, you can import the +data catalog with `from myproject.config import data_catalog`. ::: ## Changing the default node From ab65f9d17e45ecc2aeeebb4007ebcb8f1a258c76 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sat, 28 Oct 2023 09:58:20 +0200 Subject: [PATCH 21/28] Fix. 
---
 docs/source/how_to_guides/using_a_data_catalog.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md
index 39c095da..273ddc72 100644
--- a/docs/source/how_to_guides/using_a_data_catalog.md
+++ b/docs/source/how_to_guides/using_a_data_catalog.md
@@ -20,7 +20,7 @@
 At first, we create the file `input.txt` and add some content.
 
-```ipython
+```pycon
 >>> from pathlib import Path
 >>> Path("input.txt").write_text("Hello, ")
 ```

From 38aac8213bfd6c7e6368d273fa6b48fcd03a9c3e Mon Sep 17 00:00:00 2001
From: Tobias Raabe
Date: Tue, 31 Oct 2023 19:19:18 +0100
Subject: [PATCH 22/28] fix.

---
 .../how_to_guides/using_a_data_catalog.md     | 35 +++++++++++--------
 docs/source/tutorials/index.md                |  1 +
 docs/source/tutorials/using_a_data_catalog.md |  1 +
 3 files changed, 22 insertions(+), 15 deletions(-)
 create mode 100644 docs/source/tutorials/using_a_data_catalog.md

diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md
index 273ddc72..009297f2 100644
--- a/docs/source/how_to_guides/using_a_data_catalog.md
+++ b/docs/source/how_to_guides/using_a_data_catalog.md
@@ -14,7 +14,7 @@ Jupyter notebook and load the data for exploration.
 As an example, we build a workflow comprising of two tasks that do the following
 actions.
 
-1. Read in data from a text file, `input.txt` and storing it as a pickle file.
+1. Read in data from a text file, `input.txt`, storing the text as a pickle file.
 1. Read the data from pickle, adding additional text and storing it as a text file
    under `output.txt`.
 
@@ -25,11 +25,13 @@ At first, we create the file `input.txt` and add some content.
 >>> Path("input.txt").write_text("Hello, ")
 ```
 
-Secondly, we build the data catalog by registering the data that we provide or that we
-later want to access. You can add nodes, objects that are converted to nodes like
-`~pathlib.Path` or any other object.
+Secondly, we build the data catalog in a file separate from our task modules like
+`config.py`, so we can import the data catalog everywhere. Register your data here. You
+can add nodes, objects that are converted to nodes like {class}`~pathlib.Path`, or any
+other object.
 
 ```python
+# Content of config.py.
 from pathlib import Path
 
 from pytask import DataCatalog
@@ -44,27 +46,30 @@ data_catalog.add("input", Path("input.txt"))
 data_catalog.add("output", Path("output.txt"))
 ```
 
-Next, let us define the two tasks.
+Next, let us define the two tasks. We are using return annotations for the second task,
+which are explained in this [guide](using_task_returns.md), but you can use any
+approach from [the tutorial](../tutorials/defining_dependencies_products.md).
 
 ```python
 def task_save_text_with_pickle(
     path: Annotated[Path, data_catalog["input"]]
-) -> Annotated[str, data_catalog["pickle_file"]]:
+) -> Annotated[str, data_catalog["intermediate"]]:
     text = path.read_text()
     return text
 
 
 def task_add_content_and_save_text(
-    text: Annotated[str, data_catalog["pickle_file"]]
+    text: Annotated[str, data_catalog["intermediate"]]
 ) -> Annotated[str, data_catalog["output"]]:
     text += "World!"
     return text
 ```
 
-The important bit here is that we reference the intermediate pickle file with
-`data_catalog["pickle_file"]`. Since the entry does not exist, the catalog creates a
-{class}`~pytask.PickleNode` for this entry and saves the pickle file in a `.pytask`
-directory.
+The important bit here is that we reference the intermediate pickle file in the
+annotation `Annotated[str, data_catalog["intermediate"]]`. Since the entry
+`"intermediate"` does not exist, the catalog creates a {class}`~pytask.PickleNode` for
+this entry and saves the pickle file in a `.pytask` directory. The annotated type of the
+variable is {obj}`str`, since we store and load a string from the pickle file.
 
 Now, we can execute the tasks.
 
@@ -77,22 +80,22 @@ of a node to access its value.
 
 ```pycon
->>> from task_data_catalog import data_catalog
+>>> from task_create_text import data_catalog
 >>> data_catalog.entries
-['pickle_file', 'input', 'output']
->>> data_catalog["pickle_file"].load()
+['intermediate', 'input', 'output']
+>>> data_catalog["intermediate"].load()
 'Hello, World!'
 >>> data_catalog["output"].load()
 WindowsPath('C:\Users\pytask-dev\git\my_project\output.txt')
 ```
 
-`data_catalog["pickle_file"]` was stored with a {class}`~pytask.PickleNode` and returns
+`data_catalog["intermediate"]` was stored with a {class}`~pytask.PickleNode` and returns
 text whereas {class}`pathlib.Path`s become {class}`~pytask.PathNode`s and return their
 path.
 
 :::{note}
-Whether the module `task_data_catalog.py` is importable depends on whether it is on your
+Whether the module `task_create_text.py` is importable depends on whether it is on your
 `PYTHONPATH`, a variable that defines where modules can be found. If you develop your
 workflow as a Python package as explained in the tutorials, then, you can import the
 data catalog with `from myproject.config import data_catalog`.
 :::
diff --git a/docs/source/tutorials/index.md b/docs/source/tutorials/index.md
index 4a1c862a..d10a85d1 100644
--- a/docs/source/tutorials/index.md
+++ b/docs/source/tutorials/index.md
@@ -11,6 +11,7 @@ installation
 set_up_a_project
 write_a_task
 defining_dependencies_products
+using_a_data_catalog
 invoking_pytask
 configuration
 plugins
diff --git a/docs/source/tutorials/using_a_data_catalog.md b/docs/source/tutorials/using_a_data_catalog.md
new file mode 100644
index 00000000..b9caa1f0
--- /dev/null
+++ b/docs/source/tutorials/using_a_data_catalog.md
@@ -0,0 +1 @@
+# Using a data catalog

From d0e626f7bec282663a59c110c5a062597815499b Mon Sep 17 00:00:00 2001
From: Tobias Raabe
Date: Wed, 1 Nov 2023 17:51:07 +0100
Subject: [PATCH 23/28] Add tutorial.
--- .pre-commit-config.yaml | 1 + .../md/defining-dependencies-products.md | 26 +++ docs/source/reference_guides/api.md | 5 +- .../defining_dependencies_products.md | 52 ++++- docs/source/tutorials/using_a_data_catalog.md | 219 ++++++++++++++++++ docs/source/tutorials/write_a_task.md | 2 +- ...encies_products_dependencies_decorators.py | 10 +- ...ndencies_products_dependencies_produces.py | 10 +- ...ependencies_products_dependencies_py310.py | 10 +- ...dependencies_products_dependencies_py38.py | 10 +- docs_src/tutorials/using_a_data_catalog_1.py | 10 + .../using_a_data_catalog_2_produces.py | 17 ++ .../tutorials/using_a_data_catalog_2_py310.py | 22 ++ .../using_a_data_catalog_2_py310_return.py | 17 ++ .../tutorials/using_a_data_catalog_2_py38.py | 21 ++ .../tutorials/using_a_data_catalog_3_py310.py | 19 ++ .../tutorials/using_a_data_catalog_3_py38.py | 19 ++ docs_src/tutorials/using_a_data_catalog_4.py | 14 ++ .../tutorials/using_a_data_catalog_5_py310.py | 16 ++ .../using_a_data_catalog_5_py310_return.py | 13 ++ .../tutorials/using_a_data_catalog_5_py38.py | 16 ++ pyproject.toml | 2 +- src/_pytask/nodes.py | 22 +- tests/test_execute.py | 20 ++ 24 files changed, 541 insertions(+), 32 deletions(-) create mode 100644 docs/source/_static/md/defining-dependencies-products.md create mode 100644 docs_src/tutorials/using_a_data_catalog_1.py create mode 100644 docs_src/tutorials/using_a_data_catalog_2_produces.py create mode 100644 docs_src/tutorials/using_a_data_catalog_2_py310.py create mode 100644 docs_src/tutorials/using_a_data_catalog_2_py310_return.py create mode 100644 docs_src/tutorials/using_a_data_catalog_2_py38.py create mode 100644 docs_src/tutorials/using_a_data_catalog_3_py310.py create mode 100644 docs_src/tutorials/using_a_data_catalog_3_py38.py create mode 100644 docs_src/tutorials/using_a_data_catalog_4.py create mode 100644 docs_src/tutorials/using_a_data_catalog_5_py310.py create mode 100644 docs_src/tutorials/using_a_data_catalog_5_py310_return.py create mode 100644 docs_src/tutorials/using_a_data_catalog_5_py38.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 308f042d..859c5193 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -116,6 +116,7 @@ repos: docs/source/tutorials/repeating_tasks_with_different_inputs.md| docs/source/tutorials/selecting_tasks.md| docs/source/tutorials/set_up_a_project.md| + docs/source/tutorials/using_a_data_catalog.md| docs/source/tutorials/write_a_task.md )$ - repo: https://github.com/nbQA-dev/nbQA diff --git a/docs/source/_static/md/defining-dependencies-products.md b/docs/source/_static/md/defining-dependencies-products.md new file mode 100644 index 00000000..6e734b0e --- /dev/null +++ b/docs/source/_static/md/defining-dependencies-products.md @@ -0,0 +1,26 @@ +
+
+```console
+
+$ pytask
+──────────────────────────── Start pytask session ────────────────────────────
+Platform: win32 -- Python 3.10.0, pytask 0.4.0, pluggy 1.0.0
+Root: C:\Users\pytask-dev\git\my_project
+Collected 2 tasks.
+
+┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
+┃ Task                                              ┃ Outcome ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩
+│ task_data_preparation.py::task_create_random_data │ .       │
+│ task_plot_data.py::task_plot_data                 │ .       │
+└───────────────────────────────────────────────────┴─────────┘
+
+──────────────────────────────────────────────────────────────────────────────
+╭─────────── Summary ────────────╮
+ 2  Collected tasks
+ 2  Succeeded (100.0%)
+╰────────────────────────────────╯
+───────────────────────── Succeeded in 0.06 seconds ──────────────────────────
+```
+
+
diff --git a/docs/source/reference_guides/api.md b/docs/source/reference_guides/api.md
index aa6bb789..da0790ff 100644
--- a/docs/source/reference_guides/api.md
+++ b/docs/source/reference_guides/api.md
@@ -33,7 +33,7 @@ To write to the terminal, use pytask's console.
 pytask uses marks to attach additional information to task functions which is processed
 by the host or by plugins. The following marks are available by default.
 
-### Marks
+### Built-in marks
 
 ```{eval-rst}
 .. function:: pytask.mark.depends_on(objects: Any | Iterable[Any] | dict[Any, Any])
@@ -263,8 +263,11 @@ Nodes are the interface for different kinds of dependencies or products.
 
 ```{eval-rst}
 .. autoclass:: pytask.PathNode
+   :members: load, save
 .. autoclass:: pytask.PickleNode
+   :members: load, save
 .. autoclass:: pytask.PythonNode
+   :members: load, save
 ```
 
 To parse dependencies and products from nodes, use the following functions.
diff --git a/docs/source/tutorials/defining_dependencies_products.md b/docs/source/tutorials/defining_dependencies_products.md
index 97927b64..63651d4d 100644
--- a/docs/source/tutorials/defining_dependencies_products.md
+++ b/docs/source/tutorials/defining_dependencies_products.md
@@ -3,22 +3,47 @@
 To ensure pytask executes all tasks in the correct order, you need to define
 dependencies and products for each task.
 
-This tutorial offers you different interfaces. One important difference between them is
-that if you are comfortable with type annotations or not afraid to try them, take a look
-at the tabs named `Python 3.10+` or `Python 3.8+`.
+This tutorial offers you different interfaces. If you are comfortable with type
+annotations or not afraid to try them, take a look at the tabs named `Python 3.10+` or
+`Python 3.8+`.
 
 If you want to avoid type annotations for now, look at the tab named `produces`.
 
+The deprecated approaches can be found in the tabs named `Decorators`.
+
 ```{seealso}
 An overview on the different interfaces and their strength and weaknesses is given in
 {doc}`../explanations/interfaces_for_dependencies_products`.
 ```
 
-Let's first focus on how to define products which should already be familiar to you.
+First, we focus on how to define products, which should already be familiar to you.
+Then, we focus on how task dependencies can be declared.
+
+We use the same project layout as before and add a `task_plot_data.py` module.
+
+```text
+my_project
+├───pyproject.toml
+│
+├───src
+│   └───my_project
+│       ├────config.py
+│       ├────task_data_preparation.py
+│       └────task_plot_data.py
+│
+├───setup.py
+│
+├───.pytask.sqlite3
+│
+└───bld
+    ├────data.pkl
+    └────plot.png
+```
 
 ## Products
 
-Let's revisit the task from the {doc}`previous tutorial <write_a_task>`.
+Let's revisit the task from the {doc}`previous tutorial <write_a_task>` that we defined
+in `task_data_preparation.py`.
 
 ::::{tab-set}
@@ -90,7 +115,9 @@ beneficial for handling paths conveniently and across platforms.
 
 Most tasks have dependencies and it is important to specify. Then, pytask ensures that
 the dependencies are available before executing the task.
 
-In the example you see a task that creates a plot while relying on some data set.
+As an example, we want to extend our project with another task that plots the data that
+we generated with `task_create_random_data`. The task is called `task_plot_data` and we
+will define it in `task_plot_data.py`.
 
 ::::{tab-set}
@@ -104,7 +131,7 @@ pytask assumes that all function arguments that do not have the {class}`~pytask.
 annotation are dependencies of the task.
 
 ```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_dependencies_py310.py
-:emphasize-lines: 9
```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_dependencies_py310.py -:emphasize-lines: 9 +:emphasize-lines: 11 ``` ::: @@ -119,7 +146,7 @@ pytask assumes that all function arguments that do not have the {class}`~pytask. annotation are dependencies of the task. ```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_dependencies_py38.py -:emphasize-lines: 9 +:emphasize-lines: 11 ``` ::: @@ -134,7 +161,7 @@ pytask assumes that all function arguments that are not passed to the argument `produces` are dependencies of the task. ```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_dependencies_produces.py -:emphasize-lines: 7 +:emphasize-lines: 9 ``` ::: @@ -152,12 +179,17 @@ Equivalent to products, you can use the access the dependency path inside the function and load the data. ```{literalinclude} ../../../docs_src/tutorials/defining_dependencies_products_dependencies_decorators.py -:emphasize-lines: 7, 9 +:emphasize-lines: 9, 11 ``` ::: :::: +Now, let us execute the two paths. + +```{include} ../_static/md/defining-dependencies-products.md +``` + ## Relative paths Dependencies and products do not have to be absolute paths. If paths are relative, they diff --git a/docs/source/tutorials/using_a_data_catalog.md b/docs/source/tutorials/using_a_data_catalog.md index b9caa1f0..6d2aa4e7 100644 --- a/docs/source/tutorials/using_a_data_catalog.md +++ b/docs/source/tutorials/using_a_data_catalog.md @@ -1 +1,220 @@ # Using a data catalog + +The [previous tutorial](defining_dependencies_products.md) explained how to use paths to +define dependencies and products. + +Two things will quickly become a nuisance in bigger projects. + +1. We have to define the same paths again and again. +1. We have to define paths to files that we are not particularly interested in since + they are just intermediate representations. + +As a solution, pytask offers a {class}`~pytask.DataCatalog` which is a purely optional +feature. The tutorial focuses on the main features. To learn about all features, read +the [how-to guide](../how_to_guides/using_a_data_catalog.md). + +Let us focus on the previous example and see how the {class}`~pytask.DataCatalog` helps +us. + +The project structure is the same as in the previous example with the exception of the +`.pytask` folder and the missing `data.pkl` in `bld`. + +```text +my_project +├───.pytask +│ +├───pyproject.toml +│ +├───src +│ └───my_project +│ ├────config.py +│ ├────task_data_preparation.py +│ └────task_plot_data.py +│ +├───setup.py +│ +├───.pytask.sqlite3 +│ +└───bld + └────plot.png +``` + +## The `DataCatalog` + +At first, we define the data catalog in `config.py`. + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_1.py +``` + +## `task_data_preparation` + +Next, we will use the data catalog to save the product of the task in +`task_data_preparation.py`. + +Instead of using a path, we set the location of the product in the data catalog with +`data_catalog["data"]`. If the key does not exist, the data catalog will automatically +create a {class}`~pytask.PickleNode` that allows you to save any Python object to a +`pickle` file. The `pickle` file is stored within the `.pytask` folder. + +The following tabs show you how to use the data catalog given the interface you prefer. + +::::{tab-set} + +:::{tab-item} Python 3.10+ +:sync: python310plus + +Use `data_catalog["key"]` as an default argument to access the +{class}`~pytask.PickleNode` within the task. 
When you are done transforming your +{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_2_py310.py +:emphasize-lines: 11, 22 +``` + +::: + +:::{tab-item} Python 3.8+ +:sync: python38plus + +Use `data_catalog["key"]` as an default argument to access the +{class}`~pytask.PickleNode` within the task. When you are done transforming your +{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_2_py38.py +:emphasize-lines: 10, 21 +``` + +::: + +:::{tab-item} ​`produces` +:sync: produces + +Use `data_catalog["key"]` as an default argument to access the +{class}`~pytask.PickleNode` within the task. When you are done transforming your +{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_2_produces.py +:emphasize-lines: 7, 17 +``` + +::: + +:::{tab-item} ​Python 3.10+ & Return +:sync: return + +An elegant way to use the data catalog is via return type annotations. Add +`data_catalog["data"]` to the annotated return and simply return the +{class}`~pandas.DataFrame` to store it. + +You can read more about return type annotations in +[Using task returns](../how_to_guides/using_task_returns.md). + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_2_py310_return.py +:emphasize-lines: 8, 17 +``` + +::: +:::: + +## `task_plot_data` + +Next, we will define the second task that consumes the data set from the previous task. +Following one of the interfaces gives you immediate access to the +{class}`~pandas.DataFrame` in the task without any additional line to load it. + +::::{tab-set} + +:::{tab-item} Python 3.10+ +:sync: python310plus + +Use `data_catalog["key"]` as an default argument to access the +{class}`~pytask.PickleNode` within the task. When you are done transforming your +{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_3_py310.py +:emphasize-lines: 12 +``` + +::: + +:::{tab-item} Python 3.8+ +:sync: python38plus + +Use `data_catalog["key"]` as an default argument to access the +{class}`~pytask.PickleNode` within the task. When you are done transforming your +{class}`~pandas.DataFrame`, save it with {meth}`~pytask.PickleNode.save`. + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_3_py38.py +:emphasize-lines: 12 +``` + +::: +:::: + +## Adding data to the catalog + +In most projects, you have other data sets that you would like to access via the data +catalog. To add them, call the {meth}`~pytask.DataCatalog.add` method and supply a name +and a path. + +Let's add `file.csv` to the data catalog. + +```text +my_project +├───.pytask +│ +├───pyproject.toml +│ +├───src +│ └───my_project +│ ├────config.py +│ ├────file.csv +│ ├────task_data_preparation.py +│ └────task_plot_data.py +│ +├───setup.py +│ +├───.pytask.sqlite3 +│ +└───bld + ├────file.pkl + └────plot.png +``` + +The path can be absolute or relative to the module of the data catalog. + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_4.py +``` + +You can now use the data catalog as in previous example and use the +{class}`~~pathlib.Path` in the task. 
+ +::::{tab-set} + +:::{tab-item} Python 3.10+ +:sync: python310plus + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_5_py310.py +:emphasize-lines: 11, 12 +``` + +::: + +:::{tab-item} Python 3.8+ +:sync: python38plus + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_5_py38.py +:emphasize-lines: 11, 12 +``` + +::: + +:::{tab-item} ​Python 3.10+ & Return +:sync: return + +```{literalinclude} ../../../docs_src/tutorials/using_a_data_catalog_5_py310_return.py +:emphasize-lines: 9, 10 +``` + +::: +:::: diff --git a/docs/source/tutorials/write_a_task.md b/docs/source/tutorials/write_a_task.md index 1fe5d607..bef620f0 100644 --- a/docs/source/tutorials/write_a_task.md +++ b/docs/source/tutorials/write_a_task.md @@ -10,7 +10,7 @@ in `bld/data.pkl`. The `task_` prefix for modules and task functions is important so that pytask automatically discovers them. -``` +```text my_project ├───pyproject.toml │ diff --git a/docs_src/tutorials/defining_dependencies_products_dependencies_decorators.py b/docs_src/tutorials/defining_dependencies_products_dependencies_decorators.py index 6dc83838..3f86a1a0 100644 --- a/docs_src/tutorials/defining_dependencies_products_dependencies_decorators.py +++ b/docs_src/tutorials/defining_dependencies_products_dependencies_decorators.py @@ -1,5 +1,7 @@ from pathlib import Path +import matplotlib.pyplot as plt +import pandas as pd import pytask from my_project.config import BLD @@ -7,4 +9,10 @@ @pytask.mark.depends_on(BLD / "data.pkl") @pytask.mark.produces(BLD / "plot.png") def task_plot_data(depends_on: Path, produces: Path) -> None: - ... + df = pd.read_pickle(depends_on) + + _, ax = plt.subplots() + df.plot(x="x", y="y", ax=ax, kind="scatter") + + plt.savefig(produces) + plt.close() diff --git a/docs_src/tutorials/defining_dependencies_products_dependencies_produces.py b/docs_src/tutorials/defining_dependencies_products_dependencies_produces.py index 75c570d3..9fd6aeaf 100644 --- a/docs_src/tutorials/defining_dependencies_products_dependencies_produces.py +++ b/docs_src/tutorials/defining_dependencies_products_dependencies_produces.py @@ -1,9 +1,17 @@ from pathlib import Path +import matplotlib.pyplot as plt +import pandas as pd from my_project.config import BLD def task_plot_data( path_to_data: Path = BLD / "data.pkl", produces: Path = BLD / "plot.png" ) -> None: - ... + df = pd.read_pickle(path_to_data) + + _, ax = plt.subplots() + df.plot(x="x", y="y", ax=ax, kind="scatter") + + plt.savefig(produces) + plt.close() diff --git a/docs_src/tutorials/defining_dependencies_products_dependencies_py310.py b/docs_src/tutorials/defining_dependencies_products_dependencies_py310.py index f412852a..fa728df4 100644 --- a/docs_src/tutorials/defining_dependencies_products_dependencies_py310.py +++ b/docs_src/tutorials/defining_dependencies_products_dependencies_py310.py @@ -1,6 +1,8 @@ from pathlib import Path from typing import Annotated +import matplotlib.pyplot as plt +import pandas as pd from my_project.config import BLD from pytask import Product @@ -9,4 +11,10 @@ def task_plot_data( path_to_data: Path = BLD / "data.pkl", path_to_plot: Annotated[Path, Product] = BLD / "plot.png", ) -> None: - ... 
+    df = pd.read_pickle(path_to_data)
+
+    _, ax = plt.subplots()
+    df.plot(x="x", y="y", ax=ax, kind="scatter")
+
+    plt.savefig(path_to_plot)
+    plt.close()
diff --git a/docs_src/tutorials/defining_dependencies_products_dependencies_py38.py b/docs_src/tutorials/defining_dependencies_products_dependencies_py38.py
index 2073ac88..e96cea36 100644
--- a/docs_src/tutorials/defining_dependencies_products_dependencies_py38.py
+++ b/docs_src/tutorials/defining_dependencies_products_dependencies_py38.py
@@ -1,5 +1,7 @@
 from pathlib import Path
 
+import matplotlib.pyplot as plt
+import pandas as pd
 from my_project.config import BLD
 from pytask import Product
 from typing_extensions import Annotated
@@ -9,4 +11,10 @@ def task_plot_data(
     path_to_data: Path = BLD / "data.pkl",
     path_to_plot: Annotated[Path, Product] = BLD / "plot.png",
 ) -> None:
-    ...
+    df = pd.read_pickle(path_to_data)
+
+    _, ax = plt.subplots()
+    df.plot(x="x", y="y", ax=ax, kind="scatter")
+
+    plt.savefig(path_to_plot)
+    plt.close()
diff --git a/docs_src/tutorials/using_a_data_catalog_1.py b/docs_src/tutorials/using_a_data_catalog_1.py
new file mode 100644
index 00000000..bbe5838b
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_1.py
@@ -0,0 +1,10 @@
+from pathlib import Path
+
+from pytask import DataCatalog
+
+
+SRC = Path(__file__).parent.resolve()
+BLD = SRC.joinpath("..", "..", "bld").resolve()
+
+
+data_catalog = DataCatalog()
diff --git a/docs_src/tutorials/using_a_data_catalog_2_produces.py b/docs_src/tutorials/using_a_data_catalog_2_produces.py
new file mode 100644
index 00000000..77a082fb
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_2_produces.py
@@ -0,0 +1,17 @@
+import numpy as np
+import pandas as pd
+from my_project.config import data_catalog
+from pytask import PickleNode
+
+
+def task_create_random_data(produces: PickleNode = data_catalog["data"]) -> None:
+    rng = np.random.default_rng(0)
+    beta = 2
+
+    x = rng.normal(loc=5, scale=10, size=1_000)
+    epsilon = rng.standard_normal(1_000)
+
+    y = beta * x + epsilon
+
+    df = pd.DataFrame({"x": x, "y": y})
+    produces.save(df)
diff --git a/docs_src/tutorials/using_a_data_catalog_2_py310.py b/docs_src/tutorials/using_a_data_catalog_2_py310.py
new file mode 100644
index 00000000..41074cdc
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_2_py310.py
@@ -0,0 +1,22 @@
+from typing import Annotated
+
+import numpy as np
+import pandas as pd
+from my_project.config import data_catalog
+from pytask import PickleNode
+from pytask import Product
+
+
+def task_create_random_data(
+    node: Annotated[PickleNode, Product] = data_catalog["data"]
+) -> None:
+    rng = np.random.default_rng(0)
+    beta = 2
+
+    x = rng.normal(loc=5, scale=10, size=1_000)
+    epsilon = rng.standard_normal(1_000)
+
+    y = beta * x + epsilon
+
+    df = pd.DataFrame({"x": x, "y": y})
+    node.save(df)
diff --git a/docs_src/tutorials/using_a_data_catalog_2_py310_return.py b/docs_src/tutorials/using_a_data_catalog_2_py310_return.py
new file mode 100644
index 00000000..1c901d5e
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_2_py310_return.py
@@ -0,0 +1,17 @@
+from typing import Annotated
+
+import numpy as np
+import pandas as pd
+from my_project.config import data_catalog
+
+
+def task_create_random_data() -> Annotated[pd.DataFrame, data_catalog["data"]]:
+    rng = np.random.default_rng(0)
+    beta = 2
+
+    x = rng.normal(loc=5, scale=10, size=1_000)
+    epsilon = rng.standard_normal(1_000)
+
+    y = beta * x + epsilon
+
+    return pd.DataFrame({"x": x, "y": y})
diff --git a/docs_src/tutorials/using_a_data_catalog_2_py38.py b/docs_src/tutorials/using_a_data_catalog_2_py38.py
new file mode 100644
index 00000000..0fc41b52
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_2_py38.py
@@ -0,0 +1,21 @@
+import numpy as np
+import pandas as pd
+from my_project.config import data_catalog
+from pytask import PickleNode
+from pytask import Product
+from typing_extensions import Annotated
+
+
+def task_create_random_data(
+    node: Annotated[PickleNode, Product] = data_catalog["data"]
+) -> None:
+    rng = np.random.default_rng(0)
+    beta = 2
+
+    x = rng.normal(loc=5, scale=10, size=1_000)
+    epsilon = rng.standard_normal(1_000)
+
+    y = beta * x + epsilon
+
+    df = pd.DataFrame({"x": x, "y": y})
+    node.save(df)
diff --git a/docs_src/tutorials/using_a_data_catalog_3_py310.py b/docs_src/tutorials/using_a_data_catalog_3_py310.py
new file mode 100644
index 00000000..972954af
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_3_py310.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+from typing import Annotated
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from my_project.config import BLD
+from my_project.config import data_catalog
+from pytask import Product
+
+
+def task_plot_data(
+    df: Annotated[pd.DataFrame, data_catalog["data"]],
+    path_to_plot: Annotated[Path, Product] = BLD / "plot.png",
+) -> None:
+    _, ax = plt.subplots()
+    df.plot(x="x", y="y", ax=ax, kind="scatter")
+
+    plt.savefig(path_to_plot)
+    plt.close()
diff --git a/docs_src/tutorials/using_a_data_catalog_3_py38.py b/docs_src/tutorials/using_a_data_catalog_3_py38.py
new file mode 100644
index 00000000..411854a9
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_3_py38.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from my_project.config import BLD
+from my_project.config import data_catalog
+from pytask import Product
+from typing_extensions import Annotated
+
+
+def task_plot_data(
+    df: Annotated[pd.DataFrame, data_catalog["data"]],
+    path_to_plot: Annotated[Path, Product] = BLD / "plot.png",
+) -> None:
+    _, ax = plt.subplots()
+    df.plot(x="x", y="y", ax=ax, kind="scatter")
+
+    plt.savefig(path_to_plot)
+    plt.close()
diff --git a/docs_src/tutorials/using_a_data_catalog_4.py b/docs_src/tutorials/using_a_data_catalog_4.py
new file mode 100644
index 00000000..55c9a961
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_4.py
@@ -0,0 +1,14 @@
+from pathlib import Path
+
+from pytask import DataCatalog
+
+
+SRC = Path(__file__).parent.resolve()
+BLD = SRC.joinpath("..", "..", "bld").resolve()
+
+
+data_catalog = DataCatalog()
+
+# Use either a relative or an absolute path.
+data_catalog.add("csv", Path("file.csv"))
+data_catalog.add("transformed_csv", BLD / "file.pkl")
diff --git a/docs_src/tutorials/using_a_data_catalog_5_py310.py b/docs_src/tutorials/using_a_data_catalog_5_py310.py
new file mode 100644
index 00000000..08a034b6
--- /dev/null
+++ b/docs_src/tutorials/using_a_data_catalog_5_py310.py
@@ -0,0 +1,16 @@
+from pathlib import Path
+from typing import Annotated
+
+import pandas as pd
+from my_project.config import data_catalog
+from pytask import PickleNode
+from pytask import Product
+
+
+def task_transform_csv(
+    path: Annotated[Path, data_catalog["csv"]],
+    node: Annotated[PickleNode, Product] = data_catalog["transformed_csv"],
+) -> None:
+    df = pd.read_csv(path)
+    ...
+ node.save(df) diff --git a/docs_src/tutorials/using_a_data_catalog_5_py310_return.py b/docs_src/tutorials/using_a_data_catalog_5_py310_return.py new file mode 100644 index 00000000..41d4913d --- /dev/null +++ b/docs_src/tutorials/using_a_data_catalog_5_py310_return.py @@ -0,0 +1,13 @@ +from pathlib import Path +from typing import Annotated + +import pandas as pd +from my_project.config import data_catalog + + +def task_transform_csv( + path: Annotated[Path, data_catalog["csv"]], +) -> Annotated[pd.DataFrame, data_catalog["transformed_csv"]]: + df = pd.read_csv(path) + ... + return df diff --git a/docs_src/tutorials/using_a_data_catalog_5_py38.py b/docs_src/tutorials/using_a_data_catalog_5_py38.py new file mode 100644 index 00000000..5c3040bc --- /dev/null +++ b/docs_src/tutorials/using_a_data_catalog_5_py38.py @@ -0,0 +1,16 @@ +from pathlib import Path + +import pandas as pd +from my_project.config import data_catalog +from pytask import PickleNode +from pytask import Product +from typing_extensions import Annotated + + +def task_transform_csv( + path: Annotated[Path, data_catalog["csv"]], + node: Annotated[PickleNode, Product] = data_catalog["transformed_csv"], +) -> None: + df = pd.read_csv(path) + ... + node.save(df) diff --git a/pyproject.toml b/pyproject.toml index d0fc7df0..50227d07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,7 +135,7 @@ ignore = [ "tests/test_jupyter/*" = ["INP001"] "scripts/*" = ["D", "INP001"] "docs/source/conf.py" = ["D401", "INP001"] -"docs_src/*" = ["ARG001", "D", "INP001"] +"docs_src/*" = ["ARG001", "D", "INP001", "S301"] "docs_src/*/*.py" = ["FA100", "FA102", "PLR2004", "TCH"] "docs/source/how_to_guides/functional_interface*" = ["B018", "D", "INP", "ARG005"] "docs_src/how_to_guides/using_task_returns_*_task.py" = ["ARG005", "E731"] diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index 7870c3a1..c301a1f8 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -52,6 +52,7 @@ class TaskWithoutPath(PTask): Reports with entries for when, what, and content. attributes: dict[Any, Any] A dictionary to store additional information of the task. + """ name: str @@ -267,32 +268,21 @@ def state(self) -> str | None: class PickleNode: """A node for pickle files. - Parameters + Attributes ---------- name Name of the node which makes it identifiable in the DAG. path The path to the file. - load_func - A function to convert :obj:`bytes` from a pickle file to a Python object. - dump_func - A function to convert a Python object to :obj:`bytes`. """ name: str path: Path - load_func: Callable[[bytes], Any] = pickle.loads - dump_func: Callable[[Any], bytes] = pickle.dumps @classmethod - @functools.lru_cache def from_path(cls, path: Path) -> PickleNode: - """Instantiate class from path to file. - - The `lru_cache` decorator ensures that the same object is not collected twice. - - """ + """Instantiate class from path to file.""" if not path.is_absolute(): msg = "Node must be instantiated from absolute path." 
raise ValueError(msg) @@ -306,7 +296,9 @@ def state(self) -> str | None: def load(self, is_product: bool) -> Any: if is_product: return self - return self.load_func(self.path.read_bytes()) + with self.path.open("rb") as f: + return pickle.load(f) # noqa: S301 def save(self, value: Any) -> None: - self.path.write_bytes(self.dump_func(value)) + with self.path.open("wb") as f: + pickle.dump(value, f) diff --git a/tests/test_execute.py b/tests/test_execute.py index 710fe70a..63054a2f 100644 --- a/tests/test_execute.py +++ b/tests/test_execute.py @@ -900,3 +900,23 @@ def task_write_file(text: Annotated[str, node]) -> Annotated[str, Path("file.txt result = runner.invoke(cli, [tmp_path.as_posix()]) assert result.exit_code == ExitCode.OK assert tmp_path.joinpath("file.txt").read_text() == "Hello, World!" + + +def test_pickle_node_as_product_with_product_annotation(runner, tmp_path): + source = """ + from typing_extensions import Annotated + from pytask import Product, PickleNode + from pathlib import Path + + node = PickleNode(name="node", path=Path(__file__).parent / "file.txt") + + def task_create_string(node: Annotated[PickleNode, node, Product]) -> None: + node.save("Hello, World!") + + def task_write_file(text: Annotated[str, node]) -> Annotated[str, Path("file.txt")]: + return text + """ + tmp_path.joinpath("task_example.py").write_text(textwrap.dedent(source)) + result = runner.invoke(cli, [tmp_path.as_posix()]) + assert result.exit_code == ExitCode.OK + assert tmp_path.joinpath("file.txt").read_text() == "Hello, World!" From 5cabf22499639705206ef81a740e98b84c15cb58 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Wed, 1 Nov 2023 17:52:24 +0100 Subject: [PATCH 24/28] fix. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index b08979df..7e6d463c 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ _generated .eggs .pytask.sqlite3 +.pytask build dist From 52da000b6ad6da8a7776075d876767f9f7e6030f Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Wed, 1 Nov 2023 18:13:35 +0100 Subject: [PATCH 25/28] Fix. --- src/_pytask/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/_pytask/nodes.py b/src/_pytask/nodes.py index c301a1f8..75c37d70 100644 --- a/src/_pytask/nodes.py +++ b/src/_pytask/nodes.py @@ -293,7 +293,7 @@ def state(self) -> str | None: return str(self.path.stat().st_mtime) return None - def load(self, is_product: bool) -> Any: + def load(self, is_product: bool = False) -> Any: if is_product: return self with self.path.open("rb") as f: From dbab505a620c58220fdae2071a2a33702deb6ebc Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Wed, 1 Nov 2023 19:07:14 +0100 Subject: [PATCH 26/28] Align how to guide. 
---
 docs/source/how_to_guides/index.md            |   2 +-
 docs/source/how_to_guides/the_data_catalog.md |  67 +++++++
 .../how_to_guides/using_a_data_catalog.md     | 165 ------------------
 docs/source/tutorials/using_a_data_catalog.md |  22 ++-
 4 files changed, 89 insertions(+), 167 deletions(-)
 create mode 100644 docs/source/how_to_guides/the_data_catalog.md
 delete mode 100644 docs/source/how_to_guides/using_a_data_catalog.md

diff --git a/docs/source/how_to_guides/index.md b/docs/source/how_to_guides/index.md
index bd9f9ecb..5b5fb660 100644
--- a/docs/source/how_to_guides/index.md
+++ b/docs/source/how_to_guides/index.md
@@ -19,7 +19,7 @@ hashing_inputs_of_tasks
 using_task_returns
 writing_custom_nodes
 how_to_write_a_plugin
-using_a_data_catalog
+the_data_catalog
 ```
diff --git a/docs/source/how_to_guides/the_data_catalog.md b/docs/source/how_to_guides/the_data_catalog.md
new file mode 100644
index 00000000..a66e9ee5
--- /dev/null
+++ b/docs/source/how_to_guides/the_data_catalog.md
@@ -0,0 +1,67 @@
+# The `DataCatalog`
+
+An introduction to the data catalog can be found in the
+[tutorial](../tutorials/using_a_data_catalog.md).
+
+This guide explains some details that were left out of the tutorial.
+
+## Changing the default node
+
+The data catalog uses the {class}`~pytask.PickleNode` by default to serialize any kind
+of Python object. You can use any other node that follows the {protocol}`~pytask.PNode`
+protocol and register it when creating the data catalog.
+
+For example, use the {class}`~pytask.PythonNode` as the default.
+
+```python
+from pytask import PythonNode
+
+
+data_catalog = DataCatalog(default_node=PythonNode)
+```
+
+Or, learn to write your own node by reading {doc}`writing_custom_nodes`.
+
+## Changing the name and the default path
+
+By default, the data catalogs store their data in a directory `.pytask/data_catalogs`.
+If you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the
+`.pytask` folder is in the same folder as the configuration file.
+
+The default name for a catalog is `"default"` and so you will find its data in
+`.pytask/data_catalogs/default`. If you assign a different name like
+`"data_management"`, you will find the data in `.pytask/data_catalogs/data_management`.
+
+```python
+data_catalog = DataCatalog(name="data_management")
+```
+
+You can also change the path where the data catalogs will be stored by changing the
+`path` attribute. Here, we store the data catalog's data next to the module where the
+data catalog is defined in `.data`.
+
+```python
+from pathlib import Path
+
+
+data_catalog = DataCatalog(path=Path(__file__).parent / ".data")
+```
+
+## Multiple data catalogs
+
+You can use multiple data catalogs when you want to separate your datasets across
+multiple catalogs or when you want to use the same names multiple times (although it is
+not recommended!).
+
+Make sure you assign different names to the data catalogs so that their data is stored
+in different directories.
+
+```python
+# Stored in .pytask/data_catalogs/a
+data_catalog_a = DataCatalog(name="a")
+
+# Stored in .pytask/data_catalogs/b
+data_catalog_b = DataCatalog(name="b")
+```
+
+Or, use different paths as explained above.
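+
+As a small, hedged sketch of how the two catalogs from above might then be used side by
+side in a task — the entry names `"raw"` and `"clean"` are only illustrative, and we
+assume another task has already stored a {class}`~pandas.DataFrame` under
+`data_catalog_a["raw"]` — dependencies and products can reference different catalogs:
+
+```python
+from typing import Annotated
+
+import pandas as pd
+
+
+def task_clean_data(
+    df: Annotated[pd.DataFrame, data_catalog_a["raw"]],
+) -> Annotated[pd.DataFrame, data_catalog_b["clean"]]:
+    """Drop incomplete rows and store the result in the second catalog."""
+    return df.dropna()
+```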
diff --git a/docs/source/how_to_guides/using_a_data_catalog.md b/docs/source/how_to_guides/using_a_data_catalog.md deleted file mode 100644 index 009297f2..00000000 --- a/docs/source/how_to_guides/using_a_data_catalog.md +++ /dev/null @@ -1,165 +0,0 @@ -# Using a data catalog - -A data catalog is an inventory for your project's data. You can add your data to the -catalog and then easily add them to your task functions. - -The catalog can also handle data produced by your tasks automatically so that you do not -have to define any paths. - -Lastly, after data has been added to catalog, you can import the catalog in a script or -Jupyter notebook and load the data for exploration. - -## The `DataCatalog` - -As an example, we build a workflow comprising of two tasks that do the following -actions. - -1. Read in data from a text file, `input.txt` and storing the text as a pickle file. -1. Read the data from pickle, adding additional text and storing it as a text file under - `output.txt`. - -At first, we create the file `input.txt` and add some content. - -```pycon ->>> from pathlib import Path ->>> Path("input.txt").write_text("Hello, ") -``` - -Secondly, we build the data catalog in a file separate from our task modules like -`config.py`, so we can import the data catalog everywhere. Register your data here. You -can add nodes, objects that are converted to nodes like `~pathlib.Path` or any other -object. - -```python -# Content of config.py. -from pathlib import Path - -from pytask import DataCatalog - - -# Create the data catalog. -data_catalog = DataCatalog() - -# Register the input and the output data. Paths are assumed to be -# relative to the module where the data catalog is instantiated. -data_catalog.add("input", Path("input.txt")) -data_catalog.add("output", Path("output.txt")) -``` - -Next, let us define the two tasks. We are using return annotations for the second task -that are explained in this [guide](using_task_returns.md), but you can use any approach -from [../tutorials/defining_dependencies_products.md]. - -```python -def task_save_text_with_pickle( - path: Annotated[Path, data_catalog["input"]] -) -> Annotated[str, data_catalog["intermediate"]]: - text = path.read_text() - return text - - -def task_add_content_and_save_text( - text: Annotated[str, data_catalog["intermediate"]] -) -> Annotated[str, data_catalog["output"]]: - text += "World!" - return text -``` - -The important bit here is that we reference the intermediate pickle file in the -annotation `Annotated[str, data_catalog["intermediate"]]`. Since the entry -`"intermediate"` does not exist, the catalog creates a {class}`~pytask.PickleNode` for -this entry and saves the pickle file in a `.pytask` directory. The annotated type of the -variable is {obj}`str`, since we store and load a string from the pickle file. - -Now, we can execute the tasks. - -```{include} ../_static/md/using-a-data-catalog.md -``` - -## Developing with the `DataCatalog` - -After you executed the workflow, you can import the data catalog in a Jupyter notebook -or in the terminal in the Python interpreter. Call the {meth}`~pytask.PNode.load` method -of a node to access its value. - -```pycon ->>> from task_create_text import data_catalog ->>> data_catalog.entries -['pickle_file', 'input', 'output'] ->>> data_catalog["intermediate"].load() -'Hello, World!' 
->>> data_catalog["output"].load() -WindowsPath('C:\Users\pytask-dev\git\my_project\output.txt') -``` - -`data_catalog["intermediate"]` was stored with a {class}`~pytask.PickleNode` and returns -text whereas {class}`pathlib.Path`s become {class}`~pytask.PathNode`s and return their -path. - -:::{note} -Whether the module `task_create_text.py` is importable depends on whether it is on your -`PYTHONPATH`, a variable that defines where modules can be found. If you develop your -workflow as a Python package as explained in the tutorials, then, you can import the -data catalog with `from myproject.config import data_catalog`. -::: - -## Changing the default node - -The data catalog uses the {class}`~pytask.PickleNode` by default to serialize any kind -of Python object. You can use any other node that follows the {protocol}`~pytask.PNode` -protocol and register it when creating the data catalog. - -For example, use the {class}`~pytask.PythonNode` as the default. - -```python -from pytask import PythonNode - - -data_catalog = DataCatalog(default_node=PythonNode) -``` - -Or, learn to write your own node by reading {doc}`writing_custom_nodes`. - -## Changing the name and the default path - -By default, the data catalogs store their data in a directory `.pytask/data_catalogs`. -If you use a `pyproject.toml` with a `[tool.pytask.ini_options]` section, then the -`.pytask` folder is in the same folder as the configuration file. - -The default name for a catalog is `"default"` and so you will find its data in -`.pytask/data_catalogs/default`. If you assign a different name like -`"data_management"`, you will find the data in `.pytask/data_catalogs/data_management`. - -```python -data_catalog = DataCatalog(name="data_management") -``` - -You can also change the path where the data catalogs will be stored by changing the -`path` attribute. Here, we store the data catalog's data next to the module where the -data catalog is defined in `.data`. - -```python -from pathlib import Path - - -data_catalog = DataCatalog(path=Path(__file__).parent / ".data") -``` - -## Multiple data catalogs - -You can use multiple data catalogs when you want to separate your datasets across -multiple catalogs or when you want to use the same names multiple times (although it is -not recommended!). - -Make sure you assign different names to the data catalogs so that their data is stored -in different directories. - -```python -# Stored in .pytask/data_catalog/a -data_catalog = DataCatalog(name="a") - -# Stored in .pytask/data_catalog/b -data_catalog = DataCatalog(name="b") -``` - -Or, use different paths as explained above. diff --git a/docs/source/tutorials/using_a_data_catalog.md b/docs/source/tutorials/using_a_data_catalog.md index 6d2aa4e7..7e73ebc7 100644 --- a/docs/source/tutorials/using_a_data_catalog.md +++ b/docs/source/tutorials/using_a_data_catalog.md @@ -11,7 +11,7 @@ Two things will quickly become a nuisance in bigger projects. As a solution, pytask offers a {class}`~pytask.DataCatalog` which is a purely optional feature. The tutorial focuses on the main features. To learn about all features, read -the [how-to guide](../how_to_guides/using_a_data_catalog.md). +the [how-to guide](../how_to_guides/the_data_catalog.md). Let us focus on the previous example and see how the {class}`~pytask.DataCatalog` helps us. 
@@ -218,3 +218,23 @@ You can now use the data catalog as in previous example and use the
 :::
 
 ::::
+
+## Developing with the `DataCatalog`
+
+You can also use the data catalog in a Jupyter notebook or in a Python interpreter in
+the terminal. Simply import the data catalog, select a node, and call its
+{meth}`~pytask.PNode.load` method to access its value.
+
+```pycon
+>>> from my_project.config import data_catalog
+>>> data_catalog.entries
+['csv', 'data', 'transformed_csv']
+>>> data_catalog["data"].load()
+DataFrame(...)
+>>> data_catalog["csv"].load()
+WindowsPath('C:\Users\pytask-dev\git\my_project\file.csv')
+```
+
+`data_catalog["data"]` was stored with a {class}`~pytask.PickleNode` and returns the
+{class}`~pandas.DataFrame` whereas `data_catalog["csv"]` becomes a
+{class}`~pytask.PathNode` and {meth}`~pytask.PNode.load` returns the path.
From adbba61f4bf4963d2fc490402a2519989695ebe1 Mon Sep 17 00:00:00 2001
From: Tobias Raabe
Date: Wed, 1 Nov 2023 19:23:38 +0100
Subject: [PATCH 27/28] Remove some things.

---
 .pre-commit-config.yaml                       |  1 -
 .../source/_static/md/using-a-data-catalog.md | 26 -------------------
 docs/source/tutorials/using_a_data_catalog.md |  5 ++++
 3 files changed, 5 insertions(+), 27 deletions(-)
 delete mode 100644 docs/source/_static/md/using-a-data-catalog.md

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 859c5193..a644cdad 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -107,7 +107,6 @@ repos:
                   docs/source/how_to_guides/using_task_returns.md|
                   docs/source/how_to_guides/writing_custom_nodes.md|
                   docs/source/how_to_guides/hashing_inputs_of_tasks.md|
-                  docs/source/how_to_guides/using_a_data_catalog.md|
                   docs/source/reference_guides/hookspecs.md|
                   docs/source/tutorials/configuration.md|
                   docs/source/tutorials/debugging.md|
diff --git a/docs/source/_static/md/using-a-data-catalog.md b/docs/source/_static/md/using-a-data-catalog.md
deleted file mode 100644
index 837ee432..00000000
--- a/docs/source/_static/md/using-a-data-catalog.md
+++ /dev/null
@@ -1,26 +0,0 @@
-<div class="termy">
-
-```console
-
-$ pytask
-──────────────────────────── Start pytask session ────────────────────────────
-Platform: win32 -- Python 3.10.0, pytask 0.4.2, pluggy 1.0.0
-Root: C:\Users\pytask-dev\git\my_project
-Collected 1 task.
-
-┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
-┃ Task                                                    ┃ Outcome ┃
-┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩
-│ task_data_catalog.pytask_add_content::task_add_content │ .       │
-│ task_data_catalog.pytask_add_content::task_save_text   │ .       │
-└────────────────────────────────────────────────────────┴─────────┘
-
-──────────────────────────────────────────────────────────────────────────────
-╭─────────── Summary ────────────╮
- 2 Collected tasks
- 2 Succeeded (100.0%)
-╰────────────────────────────────╯
-───────────────────────── Succeeded in 0.06 seconds ──────────────────────────
-```
-
-</div>
diff --git a/docs/source/tutorials/using_a_data_catalog.md b/docs/source/tutorials/using_a_data_catalog.md
index 7e73ebc7..b78a26b3 100644
--- a/docs/source/tutorials/using_a_data_catalog.md
+++ b/docs/source/tutorials/using_a_data_catalog.md
@@ -151,6 +151,11 @@ Use `data_catalog["key"]` as an default argument to access the
 :::
 ::::
 
+Finally, let's execute the two tasks.
+
+```{include} ../_static/md/defining-dependencies-products.md
+```
+
 ## Adding data to the catalog
 
 In most projects, you have other data sets that you would like to access via the data
From 6fe3ec4ae2ce18b36b8b5a39797cb3e4f2cae8de Mon Sep 17 00:00:00 2001
From: Tobias Raabe
Date: Thu, 2 Nov 2023 10:13:13 +0100
Subject: [PATCH 28/28] Add example of custom node.

---
 docs/source/how_to_guides/the_data_catalog.md |  8 +++-
 docs_src/how_to_guides/the_data_catalog.py    | 45 +++++++++++++++++++
 src/_pytask/node_protocols.py                 |  5 ++-
 3 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 docs_src/how_to_guides/the_data_catalog.py

diff --git a/docs/source/how_to_guides/the_data_catalog.md b/docs/source/how_to_guides/the_data_catalog.md
index a66e9ee5..6d8b825b 100644
--- a/docs/source/how_to_guides/the_data_catalog.md
+++ b/docs/source/how_to_guides/the_data_catalog.md
@@ -1,4 +1,4 @@
-# The `DataCatalog`
+# The `DataCatalog` - Revisited
 
 An introduction to the data catalog can be found in the
 [tutorial](../tutorials/using_a_data_catalog.md).
@@ -22,6 +22,12 @@ data_catalog = DataCatalog(default_node=PythonNode)
 
 Or, learn to write your own node by reading {doc}`writing_custom_nodes`.
 
+Here is an example of a `PickleNode` that uses cloudpickle instead of the normal
+`pickle` module.
+
+```{literalinclude} ../../../docs_src/how_to_guides/the_data_catalog.py
+```
+
 ## Changing the name and the default path
 
 By default, the data catalogs store their data in a directory `.pytask/data_catalogs`.
diff --git a/docs_src/how_to_guides/the_data_catalog.py b/docs_src/how_to_guides/the_data_catalog.py
new file mode 100644
index 00000000..19e80b82
--- /dev/null
+++ b/docs_src/how_to_guides/the_data_catalog.py
@@ -0,0 +1,45 @@
+from pathlib import Path
+from typing import Any
+
+import cloudpickle
+from attrs import define
+
+
+@define
+class PickleNode:
+    """A node for pickle files.
+
+    Attributes
+    ----------
+    name
+        Name of the node which makes it identifiable in the DAG.
+    path
+        The path to the file.
+
+    """
+
+    name: str
+    path: Path
+
+    @classmethod
+    def from_path(cls, path: Path) -> "PickleNode":
+        """Instantiate class from path to file."""
+        if not path.is_absolute():
+            msg = "Node must be instantiated from absolute path."
+            raise ValueError(msg)
+        return cls(name=path.as_posix(), path=path)
+
+    def state(self) -> str | None:
+        if self.path.exists():
+            return str(self.path.stat().st_mtime)
+        return None
+
+    def load(self, is_product: bool = False) -> Any:
+        if is_product:
+            return self
+        with self.path.open("rb") as f:
+            return cloudpickle.load(f)
+
+    def save(self, value: Any) -> None:
+        with self.path.open("wb") as f:
+            cloudpickle.dump(value, f)
diff --git a/src/_pytask/node_protocols.py b/src/_pytask/node_protocols.py
index 89a14b4b..dec5f59d 100644
--- a/src/_pytask/node_protocols.py
+++ b/src/_pytask/node_protocols.py
@@ -45,7 +45,10 @@ def load(self, is_product: bool) -> Any:
         Parameters
         ----------
         is_product
-            Indicates whether the node is loaded as a dependency or as a product.
+            Indicates whether the node is loaded as a dependency or as a product. It can
+            be used to return a different value when the node is loaded with a product
+            annotation. Then, we usually want to insert the node itself so that the
+            user can call :meth:`PNode.save`.
 
         """
         ...
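
To make the `is_product` contract described in this last docstring concrete, here is a
minimal, hedged sketch of a node honoring it. `InMemoryNode` is purely illustrative — it
is not part of pytask or of this patch series: loaded as a dependency, it returns the
stored value; loaded as a product, it returns itself so that the task can call `save`.

```python
from __future__ import annotations

from typing import Any


class InMemoryNode:
    """A hypothetical node illustrating the load/save contract of ``PNode``."""

    def __init__(self, name: str) -> None:
        self.name = name
        self._value: Any = None

    def state(self) -> str | None:
        # A real node would derive the state from file modification times or
        # content hashes; ``None`` signals that the value does not exist yet.
        return None if self._value is None else str(hash(repr(self._value)))

    def load(self, is_product: bool = False) -> Any:
        if is_product:
            # Insert the node itself so that the task can call .save(...).
            return self
        return self._value

    def save(self, value: Any) -> None:
        self._value = value


# As a product, the node itself is injected; as a dependency, its stored value.
node = InMemoryNode("demo")
node.load(is_product=True).save("Hello, World!")
assert node.load() == "Hello, World!"
```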