From 805dfc1a444ca2ef237ac0ed58e8fbf599b1af30 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 28 Oct 2025 05:09:55 +0100 Subject: [PATCH 01/10] WIP: first iteration dicom support --- setup.py | 3 + src/datasets/config.py | 1 + src/datasets/features/__init__.py | 2 + src/datasets/features/dicom.py | 237 ++++++++++++++++++ src/datasets/features/features.py | 6 + src/datasets/packaged_modules/__init__.py | 5 + .../packaged_modules/dicomfolder/__init__.py | 0 .../dicomfolder/dicomfolder.py | 23 ++ tests/features/test_dicom.py | 104 ++++++++ tests/utils.py | 12 + 10 files changed, 393 insertions(+) create mode 100644 src/datasets/features/dicom.py create mode 100644 src/datasets/packaged_modules/dicomfolder/__init__.py create mode 100644 src/datasets/packaged_modules/dicomfolder/dicomfolder.py create mode 100644 tests/features/test_dicom.py diff --git a/setup.py b/setup.py index 06eee6717c8..d1ae398a00d 100644 --- a/setup.py +++ b/setup.py @@ -210,6 +210,8 @@ NIBABEL_REQUIRE = ["nibabel>=5.3.2"] +PYDICOM_REQUIRE = ["pydicom>=2.3.0"] + EXTRAS_REQUIRE = { "audio": AUDIO_REQUIRE, "vision": VISION_REQUIRE, @@ -228,6 +230,7 @@ "docs": DOCS_REQUIRE, "pdfs": PDFS_REQUIRE, "nibabel": NIBABEL_REQUIRE, + "pydicom": PYDICOM_REQUIRE, } setup( diff --git a/src/datasets/config.py b/src/datasets/config.py index 3d3f12b008d..023ad0cd1c5 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -140,6 +140,7 @@ TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None +PYDICOM_AVAILABLE = importlib.util.find_spec("pydicom") is not None # Optional compression tools RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None diff --git a/src/datasets/features/__init__.py b/src/datasets/features/__init__.py index 40a3568039a..bf8fad3b88f 100644 --- a/src/datasets/features/__init__.py +++ 
b/src/datasets/features/__init__.py @@ -16,8 +16,10 @@ "Video", "Pdf", "Nifti", + "Dicom", ] from .audio import Audio +from .dicom import Dicom from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value from .image import Image from .nifti import Nifti diff --git a/src/datasets/features/dicom.py b/src/datasets/features/dicom.py new file mode 100644 index 00000000000..a4adc5e537d --- /dev/null +++ b/src/datasets/features/dicom.py @@ -0,0 +1,237 @@ +import os +from dataclasses import dataclass, field +from io import BytesIO +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union + +import pyarrow as pa + +from .. import config +from ..download.download_config import DownloadConfig +from ..table import array_cast +from ..utils.file_utils import is_local_path, xopen +from ..utils.py_utils import string_to_dict + + +if TYPE_CHECKING: + import pydicom + + from .features import FeatureType + + +@dataclass +class Dicom: + """ + **Experimental.** + Dicom [`Feature`] to read DICOM medical imaging files. + + Input: The Dicom feature accepts as input: + - A `str`: Absolute path to the DICOM file (i.e. random access is allowed). + - A `pathlib.Path`: path to the DICOM file (i.e. random access is allowed). + - A `dict` with the keys: + - `path`: String with relative path of the DICOM file in a dataset repository. + - `bytes`: Bytes of the DICOM file. + This is useful for archived files with sequential access. + + - A `pydicom.FileDataset`: pydicom dataset object. + + Args: + decode (`bool`, defaults to `True`): + Whether to decode the DICOM data. If `False`, + returns the underlying dictionary in the format `{"path": dicom_path, "bytes": dicom_bytes}`. 
+ + Examples: + + ```py + >>> from datasets import Dataset, Dicom + >>> ds = Dataset.from_dict({"dicom": ["path/to/file.dcm"]}).cast_column("dicom", Dicom()) + >>> ds.features["dicom"] + Dicom(decode=True, id=None) + >>> ds[0]["dicom"] + + >>> ds = ds.cast_column("dicom", Dicom(decode=False)) + >>> ds[0]["dicom"] + {'bytes': None, + 'path': 'path/to/file.dcm'} + ``` + """ + + decode: bool = True + id: Optional[str] = field(default=None, repr=False) + + # Automatically constructed + dtype: ClassVar[str] = "pydicom.dataset.FileDataset" + pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()}) + _type: str = field(default="Dicom", init=False, repr=False) + + def __call__(self): + return self.pa_type + + def encode_example(self, value: Union[str, bytes, bytearray, dict, "pydicom.FileDataset"]) -> dict: + """Encode example into a format for Arrow. + + Args: + value (`str`, `bytes`, `pydicom.FileDataset` or `dict`): + Data passed as input to Dicom feature. + + Returns: + `dict` with "path" and "bytes" fields + """ + if config.PYDICOM_AVAILABLE: + import pydicom + else: + pydicom = None + + if isinstance(value, str): + return {"path": value, "bytes": None} + elif isinstance(value, Path): + return {"path": str(value.absolute()), "bytes": None} + elif isinstance(value, (bytes, bytearray)): + return {"path": None, "bytes": value} + elif pydicom is not None and isinstance(value, pydicom.dataset.FileDataset): + # pydicom FileDataset object - try to get path or convert to bytes + return encode_pydicom_dataset(value) + elif isinstance(value, dict): + if value.get("path") is not None and os.path.isfile(value["path"]): + # we set "bytes": None to not duplicate the data if they're already available locally + return {"bytes": None, "path": value.get("path")} + elif value.get("bytes") is not None or value.get("path") is not None: + # store the dicom bytes, and path is used to infer the format using the file extension + return {"bytes": value.get("bytes"), 
"path": value.get("path")} + else: + raise ValueError( + f"A dicom sample should have one of 'path' or 'bytes' but they are missing or None in {value}." + ) + else: + raise ValueError( + f"A dicom sample should be a string, bytes, Path, pydicom FileDataset, or dict, but got {type(value)}." + ) + + def decode_example(self, value: dict, token_per_repo_id=None) -> "pydicom.FileDataset": + """Decode example DICOM file into pydicom FileDataset object. + + Args: + value (`str` or `dict`): + A string with the absolute DICOM file path, a dictionary with + keys: + + - `path`: String with absolute or relative DICOM file path. + - `bytes`: The bytes of the DICOM file. + + token_per_repo_id (`dict`, *optional*): + To access and decode DICOM files from private repositories on + the Hub, you can pass a dictionary + repo_id (`str`) -> token (`bool` or `str`). + + Returns: + `pydicom.FileDataset` objects + """ + if not self.decode: + raise NotImplementedError("Decoding is disabled for this feature. Please use Dicom(decode=True) instead.") + + if config.PYDICOM_AVAILABLE: + import pydicom + else: + raise ImportError("To support decoding DICOM files, please install 'pydicom'.") + + if token_per_repo_id is None: + token_per_repo_id = {} + + path, bytes_ = value["path"], value["bytes"] + if bytes_ is None: + if path is None: + raise ValueError(f"A dicom should have one of 'path' or 'bytes' but both are None in {value}.") + else: + if is_local_path(path): + dicom = pydicom.dcmread(path) + else: + source_url = path.split("::")[-1] + pattern = ( + config.HUB_DATASETS_URL + if source_url.startswith(config.HF_ENDPOINT) + else config.HUB_DATASETS_HFFS_URL + ) + try: + repo_id = string_to_dict(source_url, pattern)["repo_id"] + token = token_per_repo_id.get(repo_id) + except ValueError: + token = None + download_config = DownloadConfig(token=token) + with xopen(path, "rb", download_config=download_config) as f: + dicom = pydicom.dcmread(f) + else: + bio = BytesIO(bytes_) + dicom = 
pydicom.dcmread(bio) + + return dicom + + def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]: + """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.""" + from .features import Value + + return ( + self + if self.decode + else { + "bytes": Value("binary"), + "path": Value("string"), + } + ) + + def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.BinaryArray]) -> pa.StructArray: + """Cast an Arrow array to the Dicom arrow storage type. + The Arrow types that can be converted to the Dicom pyarrow storage type are: + + - `pa.string()` - it must contain the "path" data + - `pa.binary()` - it must contain the DICOM bytes + - `pa.struct({"bytes": pa.binary()})` + - `pa.struct({"path": pa.string()})` + - `pa.struct({"bytes": pa.binary(), "path": pa.string()})` - order doesn't matter + + Args: + storage (`Union[pa.StringArray, pa.StructArray, pa.BinaryArray]`): + PyArrow array to cast. + + Returns: + `pa.StructArray`: Array in the Dicom arrow storage type, that is + `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. 
+ """ + if pa.types.is_string(storage.type): + bytes_array = pa.array([None] * len(storage), type=pa.binary()) + storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null()) + elif pa.types.is_binary(storage.type): + path_array = pa.array([None] * len(storage), type=pa.string()) + storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null()) + elif pa.types.is_struct(storage.type): + if storage.type.get_field_index("bytes") >= 0: + bytes_array = storage.field("bytes") + else: + bytes_array = pa.array([None] * len(storage), type=pa.binary()) + if storage.type.get_field_index("path") >= 0: + path_array = storage.field("path") + else: + path_array = pa.array([None] * len(storage), type=pa.string()) + storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null()) + return array_cast(storage, self.pa_type) + + +def encode_pydicom_dataset(dicom_ds: "pydicom.FileDataset") -> dict[str, Optional[Union[str, bytes]]]: + """ + Encode a pydicom FileDataset object into a dictionary. + + If the dataset has an associated file path, returns the path. Otherwise, serializes + the dataset content into bytes. + + Args: + dicom_ds: A pydicom FileDataset object. + + Returns: + dict: A dictionary with "path" or "bytes" field. 
+ """ + if hasattr(dicom_ds, "filename") and dicom_ds.filename: + return {"path": dicom_ds.filename, "bytes": None} + + # Serialize to bytes + buffer = BytesIO() + dicom_ds.save_as(buffer, write_like_original=False) + return {"path": None, "bytes": buffer.getvalue()} diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 88259767ae0..e706130b18a 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -41,6 +41,7 @@ from ..utils import experimental, logging from ..utils.py_utils import asdict, first_non_null_value, zip_dict from .audio import Audio +from .dicom import Dicom from .image import Image, encode_pil_image from .nifti import Nifti from .pdf import Pdf, encode_pdfplumber_pdf @@ -1272,6 +1273,7 @@ def __repr__(self): Video, Pdf, Nifti, + Dicom, ] @@ -1431,6 +1433,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni Video.__name__: Video, Pdf.__name__: Pdf, Nifti.__name__: Nifti, + Dicom.__name__: Dicom, } @@ -1767,6 +1770,9 @@ class Features(dict): - [`Nifti`] feature to store the absolute path to a NIfTI neuroimaging file, a `nibabel.Nifti1Image` object or a dictionary with the relative path to a NIfTI file ("path" key) and its bytes content ("bytes" key). This feature loads the NIfTI file lazily with nibabel. + - [`Dicom`] feature to store the absolute path to a DICOM medical imaging file, a `pydicom.dataset.FileDataset` object + or a dictionary with the relative path to a DICOM file ("path" key) and its bytes content ("bytes" key). + This feature loads the DICOM file lazily with pydicom. - [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation. 
""" diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index 9d076df44b7..f83d406228c 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -8,6 +8,7 @@ from .audiofolder import audiofolder from .cache import cache from .csv import csv +from .dicomfolder import dicomfolder from .hdf5 import hdf5 from .imagefolder import imagefolder from .json import json @@ -48,6 +49,7 @@ def _hash_python_lines(lines: list[str]) -> str: "videofolder": (videofolder.__name__, _hash_python_lines(inspect.getsource(videofolder).splitlines())), "pdffolder": (pdffolder.__name__, _hash_python_lines(inspect.getsource(pdffolder).splitlines())), "niftifolder": (niftifolder.__name__, _hash_python_lines(inspect.getsource(niftifolder).splitlines())), + "dicomfolder": (dicomfolder.__name__, _hash_python_lines(inspect.getsource(dicomfolder).splitlines())), "webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())), "xml": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())), "hdf5": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())), @@ -93,6 +95,8 @@ def _hash_python_lines(lines: list[str]) -> str: _EXTENSION_TO_MODULE.update({ext.upper(): ("pdffolder", {}) for ext in pdffolder.PdfFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext: ("niftifolder", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext.upper(): ("niftifolder", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS}) +_EXTENSION_TO_MODULE.update({ext: ("dicomfolder", {}) for ext in dicomfolder.DicomFolder.EXTENSIONS}) +_EXTENSION_TO_MODULE.update({ext.upper(): ("dicomfolder", {}) for ext in dicomfolder.DicomFolder.EXTENSIONS}) # Used to filter data files based on extensions given a module name _MODULE_TO_EXTENSIONS: dict[str, list[str]] = {} @@ -111,3 +115,4 @@ def _hash_python_lines(lines: list[str]) -> str: 
_MODULE_TO_METADATA_FILE_NAMES["videofolder"] = imagefolder.ImageFolder.METADATA_FILENAMES _MODULE_TO_METADATA_FILE_NAMES["pdffolder"] = imagefolder.ImageFolder.METADATA_FILENAMES _MODULE_TO_METADATA_FILE_NAMES["niftifolder"] = imagefolder.ImageFolder.METADATA_FILENAMES +_MODULE_TO_METADATA_FILE_NAMES["dicomfolder"] = imagefolder.ImageFolder.METADATA_FILENAMES diff --git a/src/datasets/packaged_modules/dicomfolder/__init__.py b/src/datasets/packaged_modules/dicomfolder/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/datasets/packaged_modules/dicomfolder/dicomfolder.py b/src/datasets/packaged_modules/dicomfolder/dicomfolder.py new file mode 100644 index 00000000000..5eb953cc545 --- /dev/null +++ b/src/datasets/packaged_modules/dicomfolder/dicomfolder.py @@ -0,0 +1,23 @@ +import datasets + +from ..folder_based_builder import folder_based_builder + + +logger = datasets.utils.logging.get_logger(__name__) + + +class DicomFolderConfig(folder_based_builder.FolderBasedBuilderConfig): + """BuilderConfig for DicomFolder.""" + + drop_labels: bool = None + drop_metadata: bool = None + + def __post_init__(self): + super().__post_init__() + + +class DicomFolder(folder_based_builder.FolderBasedBuilder): + BASE_FEATURE = datasets.Dicom + BASE_COLUMN_NAME = "dicom" + BUILDER_CONFIG_CLASS = DicomFolderConfig + EXTENSIONS: list[str] = [".dcm", ".dicom"] diff --git a/tests/features/test_dicom.py b/tests/features/test_dicom.py new file mode 100644 index 00000000000..538f98690e9 --- /dev/null +++ b/tests/features/test_dicom.py @@ -0,0 +1,104 @@ +from pathlib import Path + +import pytest + +from datasets import Dataset, Dicom, Features +from src.datasets.features.dicom import encode_pydicom_dataset + +from ..utils import require_pydicom + + +@require_pydicom +@pytest.mark.parametrize( + "build_example", + [ + lambda dicom_path: dicom_path, + lambda dicom_path: Path(dicom_path), + lambda dicom_path: open(dicom_path, "rb").read(), + lambda dicom_path: 
{"path": dicom_path}, + lambda dicom_path: {"path": dicom_path, "bytes": None}, + lambda dicom_path: {"path": dicom_path, "bytes": open(dicom_path, "rb").read()}, + lambda dicom_path: {"path": None, "bytes": open(dicom_path, "rb").read()}, + lambda dicom_path: {"bytes": open(dicom_path, "rb").read()}, + ], +) +def test_dicom_feature_encode_example(tmp_path, build_example): + import pydicom + from pydicom import examples + + # Save example DICOM to temp file + dicom_path = str(tmp_path / "test_dicom.dcm") + ds = examples.ct + ds.save_as(dicom_path, write_like_original=False) + + dicom = Dicom() + encoded_example = dicom.encode_example(build_example(dicom_path)) + assert isinstance(encoded_example, dict) + assert encoded_example.keys() == {"bytes", "path"} + assert encoded_example["bytes"] is not None or encoded_example["path"] is not None + decoded_example = dicom.decode_example(encoded_example) + assert isinstance(decoded_example, pydicom.dataset.FileDataset) + + +@require_pydicom +def test_dataset_with_dicom_feature(tmp_path): + import pydicom + from pydicom import examples + + # Save example DICOM to temp file + dicom_path = str(tmp_path / "test_dicom.dcm") + ds = examples.ct + ds.save_as(dicom_path, write_like_original=False) + + data = {"dicom": [dicom_path]} + features = Features({"dicom": Dicom()}) + dset = Dataset.from_dict(data, features=features) + item = dset[0] + assert item.keys() == {"dicom"} + assert isinstance(item["dicom"], pydicom.dataset.FileDataset) + batch = dset[:1] + assert len(batch) == 1 + assert batch.keys() == {"dicom"} + assert isinstance(batch["dicom"], list) and all( + isinstance(item, pydicom.dataset.FileDataset) for item in batch["dicom"] + ) + column = dset["dicom"] + assert len(column) == 1 + assert all(isinstance(item, pydicom.dataset.FileDataset) for item in column) + + # from bytes + with open(dicom_path, "rb") as f: + data = {"dicom": [f.read()]} + dset = Dataset.from_dict(data, features=features) + item = dset[0] + assert 
item.keys() == {"dicom"} + assert isinstance(item["dicom"], pydicom.dataset.FileDataset) + + +@require_pydicom +def test_encode_pydicom_dataset(tmp_path): + import pydicom + from pydicom import examples + + # Save example DICOM to temp file + dicom_path = str(tmp_path / "test_dicom.dcm") + ds = examples.ct + ds.save_as(dicom_path, write_like_original=False) + + # Load and encode + img = pydicom.dcmread(dicom_path) + encoded_example = encode_pydicom_dataset(img) + dicom = Dicom() + assert isinstance(encoded_example, dict) + assert encoded_example.keys() == {"bytes", "path"} + assert encoded_example["path"] is not None and encoded_example["bytes"] is None + decoded_example = dicom.decode_example(encoded_example) + assert isinstance(decoded_example, pydicom.dataset.FileDataset) + + # test bytes only (when no filename) + img.filename = None + encoded_example_bytes = encode_pydicom_dataset(img) + assert encoded_example_bytes["bytes"] is not None + assert encoded_example_bytes["path"] is None + decoded_example_bytes = dicom.decode_example(encoded_example_bytes) + assert isinstance(decoded_example_bytes, pydicom.dataset.FileDataset) diff --git a/tests/utils.py b/tests/utils.py index b796641a290..7f050269856 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -221,6 +221,18 @@ def require_nibabel(test_case): return test_case +def require_pydicom(test_case): + """ + Decorator marking a test that requires pydicom. + + These tests are skipped when pydicom isn't installed. + + """ + if not config.PYDICOM_AVAILABLE: + test_case = unittest.skip("test requires pydicom")(test_case) + return test_case + + def require_transformers(test_case): """ Decorator marking a test that requires transformers. 
From 92518d2edfb500edd7995f87a094a1dc38d3c5e3 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 28 Oct 2025 06:54:16 +0100 Subject: [PATCH 02/10] update dicom tests to use more variety in examples --- src/datasets/features/dicom.py | 20 +++++++++++++------- tests/features/test_dicom.py | 4 ++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/datasets/features/dicom.py b/src/datasets/features/dicom.py index a4adc5e537d..072a36576b5 100644 --- a/src/datasets/features/dicom.py +++ b/src/datasets/features/dicom.py @@ -2,7 +2,7 @@ from dataclasses import dataclass, field from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, TypedDict, Union import pyarrow as pa @@ -13,6 +13,11 @@ from ..utils.py_utils import string_to_dict +class DicomDict(TypedDict): + bytes: Optional[bytes] + path: Optional[str] + + if TYPE_CHECKING: import pydicom @@ -107,13 +112,14 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "pydicom.File f"A dicom sample should be a string, bytes, Path, pydicom FileDataset, or dict, but got {type(value)}." ) - def decode_example(self, value: dict, token_per_repo_id=None) -> "pydicom.FileDataset": + def decode_example( + self, value: DicomDict, token_per_repo_id: Optional[Dict[str, Union[str, bool]]] = None + ) -> "pydicom.FileDataset": """Decode example DICOM file into pydicom FileDataset object. Args: - value (`str` or `dict`): - A string with the absolute DICOM file path, a dictionary with - keys: + value (`dict`): + A dictionary with keys: - `path`: String with absolute or relative DICOM file path. - `bytes`: The bytes of the DICOM file. 
@@ -160,8 +166,8 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "pydicom.FileDa with xopen(path, "rb", download_config=download_config) as f: dicom = pydicom.dcmread(f) else: - bio = BytesIO(bytes_) - dicom = pydicom.dcmread(bio) + bytesio = BytesIO(bytes_) + dicom = pydicom.dcmread(bytesio) return dicom diff --git a/tests/features/test_dicom.py b/tests/features/test_dicom.py index 538f98690e9..e3263983b8f 100644 --- a/tests/features/test_dicom.py +++ b/tests/features/test_dicom.py @@ -47,7 +47,7 @@ def test_dataset_with_dicom_feature(tmp_path): # Save example DICOM to temp file dicom_path = str(tmp_path / "test_dicom.dcm") - ds = examples.ct + ds = examples.mr ds.save_as(dicom_path, write_like_original=False) data = {"dicom": [dicom_path]} @@ -82,7 +82,7 @@ def test_encode_pydicom_dataset(tmp_path): # Save example DICOM to temp file dicom_path = str(tmp_path / "test_dicom.dcm") - ds = examples.ct + ds = examples.rt_ss ds.save_as(dicom_path, write_like_original=False) # Load and encode From 2fd00e91a03be4f831909e68d747007c56f25a37 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 28 Oct 2025 07:01:06 +0100 Subject: [PATCH 03/10] update tests and name dicom test file according to source --- setup.py | 2 +- tests/features/data/test_dicom_693_J2KI.dcm | Bin 0 -> 3590 bytes tests/features/test_dicom.py | 29 ++++++++++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 tests/features/data/test_dicom_693_J2KI.dcm diff --git a/setup.py b/setup.py index d1ae398a00d..2dc3be785cb 100644 --- a/setup.py +++ b/setup.py @@ -210,7 +210,7 @@ NIBABEL_REQUIRE = ["nibabel>=5.3.2"] -PYDICOM_REQUIRE = ["pydicom>=2.3.0"] +PYDICOM_REQUIRE = ["pydicom>=3.0.1"] EXTRAS_REQUIRE = { "audio": AUDIO_REQUIRE, diff --git a/tests/features/data/test_dicom_693_J2KI.dcm b/tests/features/data/test_dicom_693_J2KI.dcm new file mode 100644 index 0000000000000000000000000000000000000000..adca15ec41730073363ed11e059f92e001661514 GIT binary patch 
literal 3590 zcmc&$d011&7N3MIAYcGd6p#y21(a~_&CLQTCX%oyA!KoniUJjtO%N#Ht`+Kv2#O0T z)}^&bRjd|wqzaE#D!zh6g|cXovWN>75aFF0W3}u1=Y8LsB;Pr6=FFV!H)n#-|940N zd}K^;VOpR*>IBL}3?!dSQy2^;&}ji95WyiiLJH>)IF1WBJPyI3;0IDz*8>gF8x4}r z0UZvPFT`<*3wsCx2%qFq6e$#Xdg4MpP6)XIK2IQ|NSxq%3O)HkK9{8UOvHwBhW^14 zZI|MB5=>+XtN1@zA#ee>2X5h9Fw3JI=1@#zg;f3~$WP)U!*r6NNGxTcMPZ>)p)t{k zEM4sW41`$70Qsm)P=HE4KR}5=uq=@IsD>hmR2eW`D#0|t$^e;IIZGp+86Xa1fmaeS zz0V4%Wi%c7U$VtQHvf}su@I)SJW5A#od;YU4TiXBk3ztICh}3UP%H|R(~=An97I!Z zcA(r8@jQ4@QCM)~g2mAotbvSPvv5?Y*(fq5K0Xwm2#Lb}Xon3p=^G{l%7#yHkSX#}@NnGS zNA1ofa5mJFuEy_WxVE-zB!U_uz}iBf`e>F^h1uxNivZNxASaNUAT1;@nOq{ZLBU9( zVj&&}t}6^1K!F5y5)=+Q(}6cZB(7ToNrqMesr*eJ(_$kuCN4ZOWu?YN--dV`0@0kzD9GbSyf1v_Ev@Y)h3t3#ruNAc0z(Pt;$TPs>vRO#MB$pZCHf2IPdZ>n1PEawxeP;MwgmG3=2RD5j?Mvfq2CVh ziUFKHAk-Ae1dHk`%K za^}D>wVWAndRmUQgfUd0hj+W6VDO8HDBJ}pfCKbH&@M=+_12xhNeX5vtquX4S{sl5 zH1Nxj#v@L+K(~{q*oey&a`-|45FSpDo}@?@qbX1kGKN~{BXN*CsDAYud|cVbK=rHH zNIv6Trvz2#IjG*$5M=TJ1#+b&^cze6-i|-a8QU3*0VSgC=ph4reZ&~hAS=|4pFOhdODb^w`h?e_9crJ3b3`&sXC7q9iS#(Jkj8kO~JdR#4BQ{IyM{h;Ie z<;8IcnVU8b#>SB2r`Q+nTVzs9ZRtDs{LGIJyDt_M+SP0-`7mEFxQwEZw1ixXHKV%|X3xpmAZzRUIdSci8y}hddMY{d^2i54$Lc?gJ5;-D zeA;}8WA46=bn93DGJdoyY1ELC%gsjHQ{uBGez3NJj0sT~-#O=6EK_gI>s;q&U?g)i z8JD*Ea!JUco?b`pyt_7e6Xi{FN?i8M?>%i_U07zVmNjlpBKDPYMWu%Qf@!-KsAszBWt#=vxK4w;(ey;#=ja1LijQPs{CzB^zZ6 zt)6u@?GoJBeMH%*~+Ti_(zv!=a8n$&xLLK=BDkJp%yHA_w7fnwtH>KEFS(?b9MM)hg(JoSV8}i zC-KPdc|l^==i@Jr>nP4peezV!$+_t;Q)Kj2>=|>(gzl1rf|dTgb*{O$)T`!QHNNQU z{yJqxRKBq}o?&t@!Xf8kKue00Se8@As(vb`(o{}nYnM}Zw~mhKsCpgQ_+x2z=z*Vl zByQ~T_|*PcQoH7)M*Zv)wpOn)g-@o>y1zer-K0kk z!Y=fUn{albVeY*mH`_h?J2U=mxc7%$HNL|SyY?*)k59Hdc6jXZCa;>{-u*Q^D{Q1@ zQRH-vvSiS$JE{v&>lcr>-&`$!Hn7HSng5EcO4m_|vwEH-sXzTi;pkwLnp>S$DGITv z#Wrj;Z&{WfP=9&Uh=^lOtfD@*$DL2@kBm2Ooa52pkv@T6uPN$wDVWzcu@0gKg+1_QNUk|ZN_l?pFZcO-~r8b@= z@i^L+-e&RO;jpYbxW^Mg!}I=?UUgn zo$E`*lEB`hGqYXA%(}_A>Ie4cFDjUJZIT83`t5UrWe8wB4QdV%xzykC&0RZ?3IL wxS`iz>||8B!lh|rP1~=;SFAH}J7T*^R2}QHy62h9NPFArX*y8z+yBu209sd47ytkO literal 0 HcmV?d00001 diff --git 
a/tests/features/test_dicom.py b/tests/features/test_dicom.py index e3263983b8f..5eed8efac7c 100644 --- a/tests/features/test_dicom.py +++ b/tests/features/test_dicom.py @@ -27,7 +27,7 @@ def test_dicom_feature_encode_example(tmp_path, build_example): from pydicom import examples # Save example DICOM to temp file - dicom_path = str(tmp_path / "test_dicom.dcm") + dicom_path = str(tmp_path / "test_example_dicom.dcm") ds = examples.ct ds.save_as(dicom_path, write_like_original=False) @@ -46,7 +46,7 @@ def test_dataset_with_dicom_feature(tmp_path): from pydicom import examples # Save example DICOM to temp file - dicom_path = str(tmp_path / "test_dicom.dcm") + dicom_path = str(tmp_path / "test_example_dicom.dcm") ds = examples.mr ds.save_as(dicom_path, write_like_original=False) @@ -75,13 +75,36 @@ def test_dataset_with_dicom_feature(tmp_path): assert isinstance(item["dicom"], pydicom.dataset.FileDataset) +@require_pydicom +def test_dataset_cast_dicom_column(shared_datadir): + """Test the example from the Dicom docstring using shared_datadir""" + import pydicom + + # File take from: https://github.com/robyoung/dicom-test-files/blob/master/data/pydicom/693_J2KI.dcm + dicom_path = str(shared_datadir / "test_dicom_693_J2KI.dcm") + + # Test with decode=True (default) + ds = Dataset.from_dict({"dicom": [dicom_path]}).cast_column("dicom", Dicom()) + assert ds.features["dicom"] == Dicom(decode=True, id=None) + assert isinstance(ds[0]["dicom"], pydicom.dataset.FileDataset) + + # Test with decode=False + ds = ds.cast_column("dicom", Dicom(decode=False)) + assert ds.features["dicom"] == Dicom(decode=False, id=None) + decoded_item = ds[0]["dicom"] + assert isinstance(decoded_item, dict) + assert decoded_item.keys() == {"bytes", "path"} + assert decoded_item["path"] == dicom_path + assert decoded_item["bytes"] is None + + @require_pydicom def test_encode_pydicom_dataset(tmp_path): import pydicom from pydicom import examples # Save example DICOM to temp file - dicom_path = 
str(tmp_path / "test_dicom.dcm") + dicom_path = str(tmp_path / "test_example_dicom.dcm") ds = examples.rt_ss ds.save_as(dicom_path, write_like_original=False) From 4b3ca52912d3f60118c282f1aa41f458abe8d496 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 28 Oct 2025 07:03:52 +0100 Subject: [PATCH 04/10] refactor commenting --- src/datasets/features/dicom.py | 3 --- tests/features/test_dicom.py | 10 +++------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/datasets/features/dicom.py b/src/datasets/features/dicom.py index 072a36576b5..55e7c4f0fbe 100644 --- a/src/datasets/features/dicom.py +++ b/src/datasets/features/dicom.py @@ -94,14 +94,11 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "pydicom.File elif isinstance(value, (bytes, bytearray)): return {"path": None, "bytes": value} elif pydicom is not None and isinstance(value, pydicom.dataset.FileDataset): - # pydicom FileDataset object - try to get path or convert to bytes return encode_pydicom_dataset(value) elif isinstance(value, dict): if value.get("path") is not None and os.path.isfile(value["path"]): - # we set "bytes": None to not duplicate the data if they're already available locally return {"bytes": None, "path": value.get("path")} elif value.get("bytes") is not None or value.get("path") is not None: - # store the dicom bytes, and path is used to infer the format using the file extension return {"bytes": value.get("bytes"), "path": value.get("path")} else: raise ValueError( diff --git a/tests/features/test_dicom.py b/tests/features/test_dicom.py index 5eed8efac7c..16ea0b27900 100644 --- a/tests/features/test_dicom.py +++ b/tests/features/test_dicom.py @@ -26,7 +26,6 @@ def test_dicom_feature_encode_example(tmp_path, build_example): import pydicom from pydicom import examples - # Save example DICOM to temp file dicom_path = str(tmp_path / "test_example_dicom.dcm") ds = examples.ct ds.save_as(dicom_path, write_like_original=False) @@ -45,7 +44,6 @@ def 
test_dataset_with_dicom_feature(tmp_path): import pydicom from pydicom import examples - # Save example DICOM to temp file dicom_path = str(tmp_path / "test_example_dicom.dcm") ds = examples.mr ds.save_as(dicom_path, write_like_original=False) @@ -83,12 +81,12 @@ def test_dataset_cast_dicom_column(shared_datadir): # File take from: https://github.com/robyoung/dicom-test-files/blob/master/data/pydicom/693_J2KI.dcm dicom_path = str(shared_datadir / "test_dicom_693_J2KI.dcm") - # Test with decode=True (default) + # decode=True (default) ds = Dataset.from_dict({"dicom": [dicom_path]}).cast_column("dicom", Dicom()) assert ds.features["dicom"] == Dicom(decode=True, id=None) assert isinstance(ds[0]["dicom"], pydicom.dataset.FileDataset) - # Test with decode=False + # decode=False ds = ds.cast_column("dicom", Dicom(decode=False)) assert ds.features["dicom"] == Dicom(decode=False, id=None) decoded_item = ds[0]["dicom"] @@ -103,12 +101,10 @@ def test_encode_pydicom_dataset(tmp_path): import pydicom from pydicom import examples - # Save example DICOM to temp file dicom_path = str(tmp_path / "test_example_dicom.dcm") ds = examples.rt_ss ds.save_as(dicom_path, write_like_original=False) - # Load and encode img = pydicom.dcmread(dicom_path) encoded_example = encode_pydicom_dataset(img) dicom = Dicom() @@ -118,7 +114,7 @@ def test_encode_pydicom_dataset(tmp_path): decoded_example = dicom.decode_example(encoded_example) assert isinstance(decoded_example, pydicom.dataset.FileDataset) - # test bytes only (when no filename) + # test bytes only img.filename = None encoded_example_bytes = encode_pydicom_dataset(img) assert encoded_example_bytes["bytes"] is not None From 97d5359ef18195c081dfaae0b02b7bc6b8178701 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 28 Oct 2025 11:16:48 +0100 Subject: [PATCH 05/10] add test to allow force --- src/datasets/features/dicom.py | 10 +++++++--- tests/features/data/test_dicom_no_meta.dcm | Bin 0 -> 38871 bytes tests/features/test_dicom.py | 
18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 tests/features/data/test_dicom_no_meta.dcm diff --git a/src/datasets/features/dicom.py b/src/datasets/features/dicom.py index 55e7c4f0fbe..f68c74eccfe 100644 --- a/src/datasets/features/dicom.py +++ b/src/datasets/features/dicom.py @@ -44,6 +44,9 @@ class Dicom: decode (`bool`, defaults to `True`): Whether to decode the DICOM data. If `False`, returns the underlying dictionary in the format `{"path": dicom_path, "bytes": dicom_bytes}`. + force (`bool`, defaults to `False`): + Force reading files missing DICOM File Meta Information header or 'DICM' prefix. + Passed to `pydicom.dcmread(force=...)`. Examples: @@ -62,6 +65,7 @@ class Dicom: """ decode: bool = True + force: bool = False id: Optional[str] = field(default=None, repr=False) # Automatically constructed @@ -146,7 +150,7 @@ def decode_example( raise ValueError(f"A dicom should have one of 'path' or 'bytes' but both are None in {value}.") else: if is_local_path(path): - dicom = pydicom.dcmread(path) + dicom = pydicom.dcmread(path, force=self.force) else: source_url = path.split("::")[-1] pattern = ( @@ -161,10 +165,10 @@ def decode_example( token = None download_config = DownloadConfig(token=token) with xopen(path, "rb", download_config=download_config) as f: - dicom = pydicom.dcmread(f) + dicom = pydicom.dcmread(f, force=self.force) else: bytesio = BytesIO(bytes_) - dicom = pydicom.dcmread(bytesio) + dicom = pydicom.dcmread(bytesio, force=self.force) return dicom diff --git a/tests/features/data/test_dicom_no_meta.dcm b/tests/features/data/test_dicom_no_meta.dcm new file mode 100644 index 0000000000000000000000000000000000000000..3eb05a6a53c9fa6647f08ed55bcfbd16a1975591 GIT binary patch literal 38871 zcmeFacUTtJy7xb8X4X7_il|6YQLrJ3^lC>GOH@=eiN?hA-gbJjQ%u?ErWm7%iLoU1 zuCW*F4Y4bB0TFuvR4l0P=N{aioV@2;*YExJcWvi-V4j&ZYpuI|@3PilsF$IQ^ei8v zTUO@aZhb>zV`97vFRla_nSHx;>DDW)$DrPQyY)=#J77@S{oT@fco{W~PHA37Tue+t 
zOl)kbmr={;*V8d#lH-!&W4#Q2qkp%WMr>4kR8mxIR05yG)TFqmcrPP>J9W4d7nPC_ z6U9I=DN%`c+HjRogJ==+e?J>bqsrsogSvy!M_hcIJQw=68HKsuV^dR;V-n(_r{Uke zH~QQ6nE#@H*0Jul*x0z_1gMMq_qWveBrhZW&bN3b?`8Bhvbq|K|1kVAjr4wA#>+;J z%$i1*&Y?X!cj}g&)+01)KvuubJ+r)wA?}^-8TW+3O!(R*G`(}Lex3Vz8Q&Sbdr2?K z%P4|`gTq0ot4?Cv$x?# z-&Gm4XV&0uojUg#924tpDBAdPZE%;)y*g)g%Nm@K5$bK2^lH1kdK)(9RgJFCWWD^v z6OV_c_lthU+X(l|>R}u6BcF=*W<;UR+lamMyO`gm5tHa`obc;UtDX(5R^G;`yHEHr zkKVlkjP#dYc=?qlUVZh6$Fm-J;pOL_c=e(5ezBo`NJ%J&<%^#Yg9HrPcW^nE$z5jF z$1oVsxOLmeugCAoC%lZbj_I9{bb=ppOwY27ex2{{_xCn(U)ue(4fF9!qdkf?fD!#V zA%&PIzVXk`m>t%mpJCKsl)Jp^oz|}#yyKc*7Vh%q-X)Cz)PjQ<52{P9^8LQbtG~&Ea69k^BAM@(wl{ z^WDdYrMr}b#AJD*i94n;f{mswy(kpt;qG%#=l%;en%{XYB`OB;;*x@m7F?-n#HXev zM8&11#>XbnSESoRb~EWA%Dvw*IkxhVM7baB_L7>CaJS!7cXU76ceu-iw7WCvc=uZR z-Lp>g;qv{?%(~od-Q7KN=SH!!1MW3YR!T>2k& z&*ymL`XlanGe^!J{Y#r?-Ck_2KIfkGvx1G6-813lKkkn4iaX=~lF6%o`~KSBzQ6vr z?{EC=`+>VXAN<>$Lw83%eE00ge`^zJRKt5k{!iX34)66pdah8zmuI5>r)T0_kN3Yk z<%i#?MAA?YtQ%Mr8ybpVma{m{YTdmS2hub&TE-@(M8(D@ro_Y#Y8jiH8Wj_tn4FqC zsAY2E9Xtp%l4+SFR_Fd3^lzsA4=4Xu8%FH=ztJH9!2e3U_)WtIy@PpKy+VyNcsiA7iYt9P_ecX*!oU*2ZPN%d^c`OnW_Vm* z6%oB)aF|isuRr&F9||)nkwIDp-EA4{wv2mnMP!(f?Dpix{qCTG!SChI99>$9*P}@o4Fu%h*ls;Gvv>b9ZLcL@2v^9_*gWT$_M%;@89fI+lba zj1?|p@r$_J$d(IoU=O4P~p?6!v+!-m+y<*Z1EFy;J z78;JEg?ImaderK0qoaEsA0NLe+(^H3UNMD5Y17+1-?HUh#_i*tUwP^z z=O7-l@4@V|+_N;!`rW0d|DCgDXrSnsGMX90T3~t4%wC!4UHfMC>>L-`48F_VA>6HT z_l$Tbt0vJ&Qc`R}QVK>_c|vPLNozk^tGj2WdsdxmmU|{6g(}hgn5)g0Wv49S$k>>; z1fI|0o8Uvlf0uV}(X9xH0q^KY1UB>6m>~jP`0w8;fd#elFef^m0R zU?=qQ5c9xFm@#gOzUy+;M^x|5zjUz(+R432duYa4WrPrS2SL;C*n~WiPJH)QUZxvi zcb}Iip(?r}RGu}@ndi-Pc-Wt7h~Blt${vW!$qu3c-&XLenO|l{gP4=P$q@MU*D#(q zGA8kzR&C#vC;wmnr5h1$-~L|ilPY~%Uw<=?{G|B*wWt5I{|^WL!-4;B;6EJr4+s9k zf&ag7Af1YZ#2!SZ|DR)zbZQz|)D|RGlYi;d6gy@46QjDRp@d+mQI*A{&~$PHiPqon zk4sO#lO=ro@r#|)$rAo6hNNn7H=gX|o=eV>PL?3oE6+3BbBat{r+x$x>OZcxg z-Q4raIaO9U6s;?(9FO*KGla^kz1%ql{~D2&|AC*8DM9fN!Ijorpsy4F&}O@_%$UdZoyG>* z>@#i{$7r9!m0iX;<3aU+nQE=DLTuaivFq7YY=66!ebzc=?X>1uldUm)He08yqt-#I 
z(7IsNuv6@Q_A~ZKdy+lY{>h$c&$lPrqwEp(1be7G#-3>ZX8+2!dG>6+jpH-M{@i|> z&*%30_Luf|_K!UEH9tSF@3Hq=*Ufj#e3h&Y8KaHi#;?$pYb-auH$LUtQsXf5+{T=j z7z?@o1E0~x@5U-)2eV%VT?>rG##)!goyI13uz~jHj5zgxx?er2x~f2R);PfZ(?&H_ zMV&Ja!nITIYX@A}X5<+cjkneJW|lS8QmBwO95DDO`-FAK+HLKy_E-z8-{H3C3JwFSGmI7-o!zw$0Eq-&kQ>Fcu;IY201Tx0UYv zvyF{t!Fp)UGYXOIIykWbuIxa%6^6fxRK3)_d{R_>6~WoVs;6qLLRGNxQOAr!JiQTa zoHf2wE6qosvCz6?8TJ{gx*ciPh7*_he$G0M7Ob=OTY1(dYYiM(Zym9M?9TQp$nrP) zPpJIS9s(_+;lpBR|AS+q`>VE$qg2 zGCw2iaJ!~$*|*?Csa1fEEJFUft(tZx`+56c_B4AD6n~4X$0GZmu!kx9UTrT$@{5r2 zSo;(ELo8slJq!+wheK1jHW@A~pv_F$e_`Ke54Sd((Pq5*#rPVES0VL}u<|ibwF27y zFn%_s7?=3A#MOixV-06(X}6TKIkcE=tTu|#fMV?W3i3RPc5Fu<&KbcVKtGkFuJL@C zQA*Nx3%+b~@8%f=aPf-KNky7z)~C?+CvrY(owR)HV7tDZ!O`CCV|TSv?Y4FU`zHFa zz}jLRrd^qpXg`F;i|$Y2n8BIY|F5nM%!m5fP$-%{41O$t-tnA|;r!8`oY^}yY*D>Q7w0(u;FG7>|BLC6QKZm*Jf>ryWZ3EwC(NgSTsmr5H#(Be0 zLU}Q5a=A8(yYtbbi$+U05TedOeF-*nnL3A~PM{+)P6-;|QD3R)=4Z%$wRI862ihTa zJEYqd`tHTbUq*h<*bl&kP`lLHX05WatqCAN6FU>F{}PI3+RN19orvbe96yA&@dH^ zp8`e4plBn=RR&KsAotbSz((|ADLgq(rRFSFk&OjRz;-6XnYlc(2-NdawUiCTd(nr( z;NErPrf~v3oq;!ne4bV*W=|~o6u4d+Z4b1gnbC{(XLh!|-9BPh(a~DzNyuq{orGr) z6j+50_}e|8W(Kky&F`_`o#?~4!fFAo zy=JeqZQVvct4|Am}nD-5G9bsH~BkgzXG<%x0!91hFI37e=Cz$mZ^nNBZOh@}i z@^dZn6Kmbg?;U9P0Vuj|Ts5?+%VDcyTwM+NZ-qN6Ia>{n1S5BX2u)OgD(6fPVJ+9L z8nvKb!RH2Qj#_9Ax2{={$ks->U7_H*{juJrkL#oQWqr*qv47B4bg^EcpVY_f!B|0s zRcxKH8rmI^vv~MXP(Kli9?kg_{J|vTyaZYoLGuKzjNzIbbI}E{`Wf(OELt&*_TSiV z;s5?=eQf^0oT@>K5B6!PB4#=n%D*(e;pb8iYd5pchPHLkA=Z1{XhG|8<0NuCgdVKI z1004cdB}M)?S&H?i4F=4UsV$hm4gn)!2DBK{SBjvs-fc5IQ2BTzm0e%%uWR9zOZ-N z8G5xoqSxtVdbsYRz4T@~M0eM3>J8f8*`&|d&)P9WNG1_dWAx!iBsh$C<~#IXuuYI~ zG?*`lFaALM-KXfn0`y=MTv}!?L}zC6do4OJ5?y=H)^;PSvH2KY=`cqbUaO(mb?~;uft=VM*2BWw*~5o(Bn&J^FC~OFOpw^hVSQlF6V+2yTOWWJeg;d6Ze?# z^cLJX=z7O9(BDA4sA`$V&5?MvFr=D>)_Utn+T&DlzSb$ar>?0@9jR~G=j~PYVY?qX zQQeuNef1z(2Y?fi;Noy-{1Lo+7b_4S_XR%T8{&cq&^Z-}FN7yUu!1Fg#-sbo;6#qS z8xOgHV~qV0TC~&JYZj{Z>MDp_ZrIRX6D!yb%C8{SnG6C>fufa2t{mBIK?_#kA=g3a zQuwn1`cFaW3g}*l9xR1#^JtSx%lYVx*hz@0?rOkZ@bDm9tU!03P{-77<{&GVh^HPh 
zeZ@XzkJiUI;m7cuRRpxa33 z9!Hx`pnW)+@I6#dh99CE!?BWK=*2XyjfVQA9HRFl;msJdDajsZeP>Qq&6J z#Ahv*k#8sHu6m??Ur)6gL4OdQ`w@KFmr(N!^iK!<#lqz?6Abv3L-;Wc$&UsN=3)U7 z_XsNd2Dg64x2D=xtY%gfv!gnU=iUMx8<5ZjxUd}RMnK&fXkCG1v(ffs)ft;V!~JFG z`gW+?NQ?Qj$%R(Y`vdT9D>^S&cZ8mfVg*H5{wY3X_&i_i-pjT9N$PDg)cVfaPJHXw zkJ*8GmR_P?&|UN;d#k;{MXpKs^;~Q?8|==33)exzL3)$!tB0TgSBL@Hf?pqCIln{W zFKGI==!4KU7DSloO8j?do{cSJBk%E0KO0Ug05@0B=1J&ZV2v{;su*>SS?`5v$z@hS z?@W%dXz~_5i=lTh(OgSn`8McsE>@9^hDZdwg3oS_T&SN7?k_`cCKCUw0qu(6KoJOd z9{Tsw|31+20x@8HawF3`ZT?^tS+?Dh=w_4sFP*2K(}8&VlhC>koiF6K%=%5}dJlch zv3J<}?WX!GJzqD|nZ$zzIM|yUW+W5|>J8&~1C96*nM-D{09vP_4I=Y-*wa!lZ-%{w z^U+-Y#O^{wRnMv*PYqL-pmZ&=o&yioK+7_|??Q%!NO=(!@H_LrWHcv(X-|xF08P(B zy16dyZ{*v0aB>mS-v$pPt33k7Ra1f(`&``L=0*UkK+jU5;`h~Hvm24jRXl%h^t2wb zU#MfSdSB#kfN~q{1NI(pYA$!jqq)<;uGQ$lO}mNyS+CSriIa{I2e(CwzXbt@K=(wb z`W|^N=9q~-Om=O3k-ZT58ZhUU-1B9`C{ zKfzxN#)3YuhMPaBmdMGQPYK@dAQW$O{n$xEsoThN9`alP{Zp{?U$E*V>Kd=9{%~vu zIIt4A7s7{~v^kEeE z+YjkabZxyA?OY7a7wrqobrBl8$KGof*yrp#W}eM?F4CQ0TlzJ<1{;XB4_a5Oc(R0N zk?0@p=TBNnP2?9oQ{dSGI58TzFF=zP!=q8O`kYwWw7;-+nO9X7*@;A>CjR*(TDuGS z^T^74RkUh~Puz`+7vc%Vp%V+Z(-!*vP!q{%qmgqt@qIk{W6(#qdQR<7#p)B4K?HFI zzKsJLS8<%e7EHK#08SMU|7U>xoyg@cStefWS3818FIndy?4nOfL5smjpi5!#jE%}oo5!)<1A?Z z3cSp+FIjI{MP|795_7)P_{z8qcQH zbzz3dx&t0Rmsx+xOcx^eqe!~I_UH=kjz@BWe22(~#D_2DTY=q|Sm`JglG#*8JVb*L z*z+4;;aBkC3;UnopX9~MiFv+5>J!{3LE@x0$u9G)w=F;GDRUaOdWERQ;-^Tk2u@Tb z`cW!Qy`?@NZ{GpU5(mwP3p1$%%t9|dR(Wb4Qtk?edQsJgrCp}#qn_v5eMsI>7eT;W z*Jmt;2TOQ1ar_tg1=zk-A`r(%kp#P;(`QRi$N0c_!f3b6jb z?|aDSK0?EjbUFHc9Q|L8{{95L$Lves);5mqb{QD99Nb!pPROx^vsLyDyPKY-x7o#1 zghul@gErM6CLTat@N=@-pOE`d7XyCfr)2qAc$#wSTdT3P#%yOU1H%Hb+p5eqg8Eb- z)D%O_MXa^CnyJoEm+(=GiR)&8a=GB!Sp54Eqb>-SOt#n`Y}m@3g?Pr9c#d7fH|L2F zs=>)-9HDUa7QGcf{S|nzhkN<7UP5K`ezKm$M12kH{@AFWepXi}&sm4&OU>pdC@+Ri z@pT@397@l?kG*JsRBdL^Y7_M5arYnEPuI0KT63*w^mhOrT(=sb2YvVq1Q9+5=|4du zo`Lod+p!N>Z(0qlIcA{wC8*^K>RpAN4q#v}zK4M6Cy`rq^{MhTeau(5zYB>S#&7L` z{ta$qi`AP$sg}JBzoe!i)xcfgVjg^xiryZ0vL9=&0uQRf0f`ZIf&jVjK_bgz#%Q(G 
ze2W^M4>hem$Tdud>J=c}n?N|u%eFLihfHoh%S~h|a z59)vFkL;sVg-2U+u!EK8R8{iA7^3=2G~sP{&>!y*Nd=Z^B0Z^9OO1D}%L%PktL>)0^*Z!lLYE~{*a;t` zwk7%Fc3Mj8um@R}<5T?LvOktmfE5S=Y&Qaxm)Q{OzlimRf>YP+_T)Gdp=&NQ4}$|E zunD1F_))-djVjPu>|iZ=FHw!;{Nv!#6uiI$eGX5SYmKv}vqrFp+M%|qQ9F(&H%PG4 z?QUcWet3yt)}vNUYrXkAb^a$*Q|!G7{wm6?PQI+>sSq>NT!6h7Be{2#w;5xuW#&7e zP-t8V{;fybcOt!y)Kasp^{)A(I)|h%I9U=z2hWt-M_g?5<21>5P4;F)d zbEu7dh1dL@IN<}`UbnT6<4f0ATd5tNBqsJ{?Ig@@$PrC!eaQOU>SfijPMhDGwTX86 zK({|w)s5pJeEB@Q*(nummZ-N?V>GshTBpKHkNFn7*ox%WVE?nhz@un+KK|=j^{jcu zyw`jI4Jw8^hoE%{6mR8xKf1pLP1%Aq?Bz}|Ud0c;auvFdpdlyV!Jq1sIf~ar9ZF4N5juYkoiBrSiQ!Cr1PokFTu_WYECSiy!N*PKe2{)ikFn2L z8#vArP1xjxz8ob)md9B$7-zM!j+#H4@0s1q6GUfmF4hO)rJ7*x@vi3ou13L!9#Eg6 z2C0$6rU%tr)+q{+t=RW$WV#w_ImrA^8!76r$~1422_A&*$w*wP;3qipLAfOyo85L7 z$oLMRFBMq9WuD!I9i24ltF~q)lDbX(E)M}VIJsa)cMIEhzPYJyr01G~`;z9b?!TMWR z^bpkvly6Kt-hyne1^&t+N>9eh>tf0KIlPE$N2#CDgExq9+k^g_-K>5b`P?F8c?g;A zFrw6-*he-xeHhy3;TJa|*-%waUF7~^tmQnE=fREroNq#Ju7Uqs;ma2I^S)Yd{*x+y zfmLoLApbGM^3Cx6xoG|hbo?Mbe+mBm7}DQrm)jM{IM=?6mYl`vm*5Z2+E>8-86d|O zc=bH{6{=(1$pqF}OIagZNNso%Yi4t-cdT00mu8uo1~r|jHraT%7DzpaPeU@eeB^Nv zeUw$GHOTBVHJN{^cZp<@R9iUl0P*`aYJcClYd%|<`(dP6RXw6ssC@7;4O=;Zg!e%8 zIioTB3RmS&?J(9c>?IptC-uTj+%2QkMzkVBbu{}h$0ODaD;VkrfpTq#`^%tK;=bij z>ZNx;{~mnYa{DCuUWk>OWQO&1K8U{`L^y#y6p|5*#2>7tYSK?nw!Q4btdved->0xT z@CEVRz1AvoJ9F}=Q(%wadLWz0i*rSgk8# z&ZD7zE!JC=n6Q9Y|6w&kO~qF{3x*uyX;}wYO}nZ*F^!e32=}-^@9T)17NZ|?;M;ku zU-V*%+GBo*r{0U622s0s7}?g-tI_VY)TbmjEyb&EC;D3h&K*b7&-2rp8sHTW;kI2v zp8^^8(*79uxRWez2GK)`-T>C^v6evlIBOzYcpvK9SYMcLsH2RYk9Vq#-Am=k7rmEk zufmmm89#3r-gtqw9mGGo(F@56FH;Z5N0;^x1x>~V zmLmTlAbzCvIcQ&jUZ2M^Ujm1&x?Z{fY3AY0R^hEK8MmO`hbTA({cVnJ97Sh$Q?G1? 
zhZIyeVQBQZG2gDUO1nmV!M6&uJq=r`s&2r&0_c~O&ur}HUUh{t$#KuZvzut0LF>zm zd(~)8)azxf1^27qkpuC4kKn@^=vK%$4~jQ~^Eps`h0XgOYCCow{89flTo z^d+QUa>qNAU>(cQj3(OAuT#mHPZT(h&mYzg*2`8;Yn{0YjTf5~&m?#xet9Fb&qu3g zp#7p5mpP(Ah=w49#Aoy1OECN})eg|kgHE)?iWGRU6JHsJ73>4gT4FDu>NfmZihb;G z@x2rJUrlX-yHZKqLGS0l{#*2M175tNzBRuj=II6H&1?^!2SBtq{UG>Xi#oqpeJS3+ zPX`bUUq%ylAp1+8!Wa04!}cwZzuc~?o5P6)#0-JDg1EQEg`m@BN3X6RTxl7$6j9S(eH=YfEsvsTgO1c5kpmH@_(&)lIDCnw=s&KQG`8;)i#l~-7h^+l(b8MjPMV{V) zq<16fe9o&-_lSd!VqwRz92>-`kB*#WMMayzg+b0mgb|OlNr*3!7ep>&7il;>c zv;)n*<|`Kb@Cp^Ri|ipNCK}p|HMB9Kk;pz|w2v8=q65dFu7>N) zW>Mi^hbB}5wTj)CMJnI(L54y!-y1aBPb3^~wl<5^NBDy)Sh`dLmT`zTlN|x`=|Sq> zg~jT%OZZlSaEz1Vb3 zwE7aVl{()}j%oJOR4ra7+TV?zOQJ}yjM zx_}PkqH*WVp-4$~=14|z9C^<|zRRIAo0u+}D|O(-PVVkPk{i)$S<}x&Q~dE~M^%m* zttJx@FHrBn#}(`k`o-N5unv3ABf44UTGR&4W?~r^;H3rqyP#b1M`^T5q(NPRDwb`tMmtM#hH{FRveKBARTNOB>l^(HHRiNx?5>{xe|KL7;q zBgSijZr8>Z{Lq6K;<_O0dOMNOdhFp0SzryFsN-~JoesYqX0;>(PWkBj^%$Objy2I* z^m~=qvIY5H0lQ^RlW80<-&0$lf4wVN!9VeF3(?sd*k~!y%6=^P78)-rU|aEIB}O%RJJn(Kc)v&o-5uMq?Q)TG-CA9t!`jd-h70~koY6QB! 
z!-^tf`x=Q4L7E$&yf!=BUO~pau-zE^S54&E7}*9O*?aUadWW9NI$tvJjs@l&!y7i# zx3T#a=zbIIAd*$WIDXf`BGUDH=>PNVhe)%VBYi)rD)+E^tOO(|vR<*$%m}p$eH5>> zo%za6j3d-O{E2spIL}6M+nBGcdtNeXA?+uraQ{Fi(3&XkJakG7>kT4ohoUoZ;~H2m zadjScBRSBYJbwV`Uk6RfT-z_KKC|N{{ZS!SM6WG{|weMlUeW2((h=~Y3}SqC+1NXZVRFfK$1`4 z+dlvm@6%uEhwv43v_Ek|U4GWr^{|xoAjep$TruF@X(V2R21J1bx8VC_qU~?3&&-FB zqu|m`^jm6`yOEddzd6J=!Llt}_rlg|sH<2)9i)2(j#%VF)xa}Z=iF>OqmEe)?M{hp6qWU)#d2|Col z8tQ_YvHD9iEXdx(9>gKLO{(K2KxmWl$p=koLIDrL-4ZNy8gZhhBE#&qVI{z3pIRJgm zq!Qhib&4b;ddhmn-lmhBgF06~%);~V#42a zrjz32vR<05U()0Bi^R_#QBAsKop5){UL(hisTo zM56H%^A=gL6+2*pLNiLcx>8_@2#<)r*&PYgL7SP)XHh*yhr_`Iof`Ji01^g zg%jw%3FjKaudA#GTto{tQ~f=NT=TH|^+;?zMHQV+hs{TI~h=A%fWav$S|tUz=}I zj5>T3cKSu){VgJu(e|%8&}rawa-IhDmV)3-T-^4D%0zh8j^8bytr`0wW2xnO!HX6k z!*DxLe~i@>>za6qiR|B8OhxTH)>fTe^M!D}D%=i(%NMAE9;V*3mz;2gIb7X@nq$mf zymSF)=fLdC$o#Zx5wbf`c0pZ)y3NomRWD!A(3=QxE|T8i+V?4*y9niLxGz3l_C-qd zTh{ol7%!<8%x9QO2J29tfq9>jzdj0{C4$HfS$Y-L-Iaaax@@(kzV)%5qnGPFP|}n2 z{06S9k5Ru2#E)6D41|hExKfATO`tyjn&R0VI*oiNSohWgh!mUJzmebdy_KAAv7I;H@@KHaqjMg+SORY zSu&xz_(a(cv=rnMyOF(ZQr}w3v6W*Z=y?>5ZihQxt53|w7(EWXe~@`TX5Ww1huAmV znVo09hr%!KXa8;$^5E-sSJ3MbR&jm;^#>x`#?Tpz#G_qn58zDnzA3be^~=*W>8%|R z#B6fH^Hc(tF~ULez1<-FGIakmns){ZD0TH-xU!bJL#@xK3EVWVQ~fMK>StWaZoLr36EC%h4BI$kbLTZSz7iC7 z)HmkqAWjCfz3T2?PD5hXc!Oa(QaMRHa*KJDK}8IjvBvhnn|C1Yc>!%cz?FemepjNm zH1IJ2x}!nE6tq9o{Um^!4P9Mm45p5yB0e8W_!w*{f%aW!Kp}YRfO#IY#3m9fK?`n^ z4IYLwv#E$rwn8m`vll+S1Q~Bf!$r=rhhrVo$m&!+JP_R%%|6PxXi7bzp+-tPL=o05 z=qZ)(O&qdYLf#BHLw%<{E5@76Da78LiA8#`^3TqO=b*S8$}5P?z0v3HM8Y|4 zM$aA&VwnLT`LA3}Km#&(QZzmZ%xKQ{By=K#Rjk_ZEC#En%Mn2=`4aX2?}?2DF;)Z| z*iX)T!qQmHW%yhbZ=kqZ9c}VP`%a+;2jS5$>#TWAbyOFSmPF~Ye={H3-2>GHpui3A z>moRp2VL7hMycQUfB@xa!*;I9o@dz!yB7bDgV)>2^RgGVm#Sh~>_kXqy)hBA3U)nE zHTxp6&%qwOnNI-H?MDXiDf2#M`%&cyA@)lrifgG$!H7mY6X0T9UF<%V_^2%!(9o`m z?#qtXTJTcdkGV>3DOlC#L_}}nTL!`JrtAu?4nEZ3$?N2OSHQg!e5&!>9=9sGjM!)! 
zwXs*ME2gF{l!M*wbt#w3Xfx6-!x{qB4Sbl?ZjZVaaSX0pfh#M?{N~{y=AbDXXf1o* zkHQVf1dp*M*41nc|Jz{mZJ9$geEVta;41oFhBaP93#-DJcR>6lVAo2hZ%*V|i#YB9 zqWW~sK65QU2rJj`E4+Z^R*(9C0pAl>Jp)fO;Z=3`Q-hzP z{|=lo>7zNi7(ynzf?bvyuz_8C##rxKrDl-XU7bL}m6Dcq-lNb_1_Bmg)q-r26|Urd zIqTs~z{EwgUXJ7?ODcjR)rbaegLrkwa2J~25^KvGtJ){AIpKajap*}XKhLK+KI;2$NC}^?@ zORun!Kntl2J%HVGq*XPJhVbSvch6F72!L;0p+C?*4eh++#mbm38d8dfn82#wt5im0 z--|-xCFsEcXg!4`pX5w(#lujU&)r63ftkdBvU~9WAKBX{dEXxFAm3=iT38)COb7Ha zk})e>sUKmCJ;WT#nd=#=4s(>%?K!OWjbfd192`hyAMYIYbgyOK+aY$nE+q3?Onm&X zyI!Ay)Y~HcFwnjlGWEdMs`MTJg@s5k8?9=^`oREbf112%AW!wex_V+Y4Y8GQclBGm z$!%nC0{O2&8avQQAGE}S2Re*5$Tbg;)d#2wyqtK1o5XMxQ0!psSBPZfU6vhSf~CS# zDS9vU(Ht;Oc1g*KLRG2+|1bxFwkeEj^B!F;I4@p!KDM?KYr8;%H7S#ArFRPAS)!n-knA2vLe0#3%PAPqH3Bx(AWy; zrZK})P%Q83ZDfUKE@#z=^Cz-Kqo`AdgYwYKrZFmU?{bku_8>qzl1^WTMCU9$RN`?IJC*{mLH(l>OOK7lSh zLM-$;(m%_r7J+#yk^MEWP_({)YipR>Ar}>o5EY#Vlg_XoW*U3n`;z6E6 z70{p*{TEBvLPRL*Ld(H{a<1NDwcrVLOHDPOM6*Ans?!g>EW!hCAOc?tzYCak61e<3 zHGQe*)^Kyx##HV*lIf3RuJxP&&P?ZkQ|g?A3ynPSp1Pj3PL{Kcb^dtdCu>AOAlMBM z^eXyw4oY^?a}JV=BdR(^?Dwc1p|9$RPJ7QIo-|KQPab+ygg^Ne9l8z%hF6jU@^i4`-IcnLR{ch4fm&_W-Mv`4KsDjEm4 z_oO4+0!KNto=ltcPX{L)vV7oAXjaU2K3f%vupJy zy-?m=KSb9;ymW8bZUDZ@Vx7(=lKE-T;zEVXcq{Q2g8d7;GuyZJx*MA%?iWg z&0sZb3bleaoFz`8=MB&Eo`IfD@Su@ry%XpJ=y%W`n`h4P#9mfKS3u)bYn1h|HJDwG zqw#@Lk=0J5wI5xoN+$LsZvbSwEB>WOSYZL)sy5mnnNTryUSFL?^BbsdRULDt*?^vE zV8zX-z3s#XKL&T=KH2~8P`}{t$rV^pUitb`^ZQL;+*#pFbaJqu zTxYhk(P`zG;OXbtM?Tn|4C@>gZ~^_kMJ>n+KVXCUyYU>dN;;fua5}4n-?LV}*!7JS zpz4FxB(sz?yrv+5h3qTWajsMOtiyLhy=09*mx{onjrIdnUyiX(KZ7;O5oCAwvx8?H z5&V-xq;IjZGlXL(cXL?dYN?;lKjP);I`QEBYtC@zeW#maI)-x@J$_%$)+fnj*X!=k z(a{s(sqINYpIczj1Ds;sia4l4oJXB5&OH4&Uh|NC#(CRm>{RFmaP6q}brPNKa$*RX>IF50VuYp#vAdykfk;HZ)P*(;CW(V-~A$rL0L@LBGcmmECJsV;v@&h-^6V zo$O*7h8@gj563Q0q9N7#x%!;G2J)|><~vPm=K-|-Wv4cNZx;9Z5eF{O^_@A+R_CA- z=*jfF;~DAs&hwJzAr z*E3!ZdEM(Z)azTX?p`N6pL@D`&cmOvR7|$(Y`s%gb$U4Op-+pPi&T)F;=QpAAVdcC z7>LJPM&0jg>^%pIT>>8tV&{2S`w?^^8y@`0PJwyQKi>KcO!$II^#b^^l6_M(tlwGp 
z+suBBd%^zNM1K++9E3w&VDf0zVrGzg9Kf?ZLzX>qdtY_|HAM}e< z0dE;c(4j)&k;kz3*T~mCB%Vt_2R?%*@7bf+k1cQR zSC@RWoH(=;Y)ZzD`#7)S2R{N63wZ-59GidKc^=>PinG+Y<#>bfRXo)_rsoDc+2njd zoaE!I)KlTs3SGvkCQBR zHS+JvTFiKAy6f$1{F}UG_y^WM0aRScex~_c&qV`1XAgr^^wP*>TEGLTXO+RtOIBMd zI1_X%)DCjmQ3-v4DCT`Q@)i-{Ok}&!8R0zVJcQl91X?`pq&ptR%ekdJVCjp_3}>XX zRbRp46>l^Y63-ncrYYo*9QYQIlgGW|vCkv_JoJ765!zv5U#Y)IhAR1~RAdei59}s# zD?|^nv76ucIf>}7tJT7~Y%Vk3H=CRL(4d;+;;A6~_q=1Z9qfzLN6CBw(Et8a;s1rl z8b)0H8o%ErH+=z1e-53G1COiY&2Qq7uEFo?%=0)il>LhJz{`R9u=YldX;AqTbUh6x zGSP%Ys0?wg=^A*%HcqnB-f0Q-b*PsTBTP zOgAEy;aET+(b0WmZ%t`^8?Jfi*PoG^q7RY$Zou=+LF*{8`aWcDHK_sy5DEIQ5;6`e zco9@ACNut#SaAreL=~Vz44L9_IB}B$8{LPLC+lKE_Bxf?XEH7 zYJ5M4N0^1TTZF_D*kB}@u~Rp}9^NKnKF)hjFR}W)j*(5QJ|7z@ zheDCF7ZUeFlJ($t1Na>5{+6m*ea7~2N4kQISj?vslq^FFFTjmrWV{jl-;b7_hF@h+ zy&bez&rjJ&v=B6$Yo*b%@W_{*yr5lr6`V~#?>!VDg&SDlaYo5!Twmt+BwTxiHNi-%ya>s^#x93_?V`__wRH zlYIg~=uQ&&(H1JAIqD*jP|oW@V-uc}zd|InI0b$R@>k(bLsxGD7{#Bls?uu|o>1OQ z{1v%>jqeY~E6D2f8L+Q`D}yll2<2H>)d)v!er}Cc>`3^jurXP)QtsW?Y=5GmI6PZV z=P(@jfwj@=y0&u^u3RHZYDtv!GoB<2{`+931~c?^XD)hBgBjO^t_XBM*8dbTzQ$G2 zLT&FtmWA+oHyU>WOf-;SYgSIX^XY{L%>YeOkXbYM5RBy^PKf?a2B@gW|K$f=xL4uh7t5>}Z(+612qYMKYsuJckd|1|qctw5I^+mvfu~ zPXZXNAzW-j`y{;U6#VLH{ixHI{9qGjMY;`m7v=;zdsyq<2FkQ!781*ta6f>b9(oWO z1K>&slB>=&#VA)n_-uUne6)BCC^nUxXP%o4Y((ET;qxS>sbCIXaIykE`MYyzj+9cF zc?)zPgjtuPTL-Oj&|vABL$xHMczLU{WPN1wVsTqF0z$*8t9ro z&&EKV%&!HrmbD6lu_EAoUwUW<=fmi=0o-YW9^I>(JMTIj9m_eP8xsxuNR00|Z?Z}< zi01`4WS>(xt4P7v@-4LIj4P)z^jm^AILNi@T$PAl-Uuk={thg9F+7-rcVC1r$c7Uu zxw0DXe*#-PO{**P>4VLR&PXLO5M8Rvc#Y7jT&hoJ(S&ofxr_#tF@_KK!TJn8+W@|qXphaTZX&rj z#=Qymytxv{d}OaoB(o2KpLx(HHGplP%yl9HdEYDi&yPK#CMWRr=Ok*7v%Ao-9#y$i>^8O!Ex2JFOc8Ze6nv=2gZ;-i9@ zPg5k>lp_c`yoGnH&vW$|Qz9MtOA7<2tF%IcTO-FdNLJn~Ok<6u4N{jCx+vzD1SeX< zi$;tn@m>ID9)31NZUOFCva;J6eGttIg5M!pne}H2U7T&}4ea+o5M2E#s$jaSK z5TgiWDJ19Ef$x#O;}--~y|9;7WQpCG&j4cG9?;pG=UPHdELPT%F=Tg$$fT(|YAmy? 
zg6$OY7hsOEb1IQJwM2X6Nm&Ir@)Zg=%PfWe#|b45u@OMjGIO8;`R$(he%gLRm@VAlXQCrD;akFzq>30`+$CW4fq%vM&h1s|emnT)-N#U^l`iPZW) zMN5uQxONl%U9~dVJ@XX&l|3VpD>ZcGPch{Y0#CzZ0d4db})s&FOzxOXo&cZXzq7JlON*qFj|!g zm&HPw(1#$d>|GH&JB>!yW9FgcZa1)GvHvKp$Ud5DR*GwZ55VKD%%>UGWK~PD7Fpkt zJgo|PU6q-MrPP7u9o+Y0hOOYI9Br6II(j`2t?Yz^A4US5Ig4i|vThU%rzFbsLfRKW zsxZcHjvPh){w~KG(^ot*Y=sVHlJ)hZ$G&J|J2brq=W%YoY20r`&wbI0F5F3hBFO_K z3Xs@DR(VSCF5=0>D#GDUCaqiYO(g9FZAtXkAAa4(Na9Pv+<7*FYq1jq?1^u2L1Wa zo5(d1$p;}D!4|RmZj711^PTBMu)7Tz%nQ_G(wR*v^b3E&c%}vYrt^uW?+ADp&OPzY z(Tp!>7{QZL>z9?(6gb$0`x3`nXu)}Wjd-=DP!qung{B+${EK*|G9((pA^Xc@?@VVn z70PId&@2)Y|5_goNE9asRh=GMqQjE)X`Yk(C6GhBvUn58Pa4rf3I1aavGQuH`N7!ih~Oto)XU=#Pw$MEK#xK`|;2w9hdklwUMT9 zs5R}<=t(?**q!6f>^5^NXUwgvQj8{NpNNl>8p=5;dE({kFk6YFYs0e#kh$y;2!}6~ zwk4TgALdep5${E(yEBgLCW?W-9rz~uizGj6jdUdPi{y~qbq#6J0_&1JXMVglJ&U|y z8<^n>cHY9zM>3k|Wq0>Ek%z?V4sqLlq5}`@Br|9Uw{N4})sTKes1222!8eEuOVGiEM5*~yCN(@4tCt*Ka!>giu_w{c zAo`GOMB?2nc#(;nh!)GP2iXg3&^ijc5gYG|Cwvqel^V3{#Ega;X?(j6eI3NJS#YK! zyc3@nhc4`3{6gm914W_GWiWOyRCpl?*=tc7dV-+!3e;EUx6R$=(2zpU4ftfzUtjut zke;)UZ7n{6&61VZeHDCIXT_m;nEt$Req-x;eWsV9c zs=-}}z*4~EKFp>V&KtB8zb3T+$v4UwPhu*`cqEz==}O-c!N~qJ@c>ECB|E@cLz7sF zKOdN1{hFP2>h2SrDu zrX@5~fm7A_S<1bMtoJRV#aa46xcIBbk-#HpQ)fnz9803C3OvJ2J`yp=`~n%L1^Cqi zYl%iz1fLoq@5bmg&{Xp1UtZygW5wDcM(H9$%3Vwhq(I6z0$@d26qwF%M=3+>U zi$tQo@^-&?*LT^~GuX8N$?+rMkK~(i%-A29Z-yohBQ!JDfU#AnfIdq za;UyRymg9b#h>{%XAS|}4WO?o&=pFr!SoTu2(6h%Ev$dC`xgPKGM4Nr7OWFLP>Z$_ z`}RY_dvmQR8q}C4B(95uqGSmDa_;? 
z^hd!}v4}df_s6p29iQ6tC4M2E{w1Q9$SQ;$gy-p;6`}L80&y7xY0B6(xVw#b;3m`t zlRa&qhW-v?+=NfNX?qfFk^ENbN^Rk)=wg2mEE73)r_Xri>*4Pktzr#h7?mIKkX4aq z98c8d@W=8b3lxSH?)-dLZ)_&+f2ma3LJb#^Qs}jG9nkpc7X?VyRlk zBbih^dwD%a#2&NpPhuK(}RJsh;dxbFPd(eqW3#bReY{the!qd*+ zxg+3B9yRJ*VmI0M6+uNcmtC?GvAJ42FO|5X&?A4_qdxPI8lNvnCs|!(L?XWE2z9QB z>=FAHJAN0yMTfad^}X`1rpk`kK%^iUlVpYgaHflkRV|^YD|71)FFRq8C3yYS%Ts6t zQUPoauS=*z{Rrhhu@_B{M)u^${7+&3=a_E?){@nkpW(G<|)CkRJhTP<6g9}Cbn_}Uvr6{dFV(4Gn1Uq z8(q|JzXY@`;3K=E&bZR80d{R>#d$K^T7jHo@03)cBvO<7Ok(eDtZz(WkA>_o5pPfq zR}NFNI>jONj9_%21Co$vTaZnvvh_f2S@T^%jc+XaUktw`n->2f>-$^Kv1*{3Xsu{b z6&LNjpf8&FN-idOhHW39hlTVrnln=xsWfe~LfNrI}_$RhFjeh14%l!ep@)ztk)B7*{z4tlH^%PG@`>Rk?1#Ol!K_573 z6A4@+Tb1{q0^pxdnV3f6te4fawG8?Ey zpJ%x)NG;e|hMq{Keu7UCV>^uKk5x#%p;#%H41UOdq2JNzmFUqaOX6?wVVPt~-O*sd z>m+`ra^4o5mQ}8U>^ND(F^lz*xllY4`e!1GN%S|I>hlb2%xD@e?)u4uNK3Z3q+oWk-O~bEr&-{SUV}<`FxHeaJYm+R&93@Dda)%0uW~=+}-AS z_sz({8~Q_#kD!>0cosaA45BKzg^9E!ju76-o3pak;fD-lzuX^Gish|>lZ<-{o;g^F zWM~>$NZu#)P1&Ix1;4H{BhmGnP$O7#1C;cjiL&E-GuW^KO<0EpEQA{i;iat0)dj&N zS_=Z{Br8ruv&2iqpwBVP?GSBeA;mLr?3g7Ln2s)HOWY~8Ap5F|T&voLBzJ-^m3*JZ zb8FFpUC=O%y&baur7ELI>?e_zWZ|`mEVnVAWzZ(oBw6(dg5y$A^d$n3_(OKlOB^p% z1B#2C_-A2TV19*N3p!B43$*N2ylIi!jvwl90oijZOf65GvD$c)6d zieG96#m&(0P&BDMT9pJ(B)=|U{^A$b^Xz)GR#qv!n1fU(B=?l{5{We>r;@s=4+wOE zBNtxJ1otMRdvl3^w!x(ew5I|N$sV6lXp@-uI{Z6@UIoHei8b56eX(cBns4Gyq#hxj zU1ZXZL;h-oM37+|lA+4(REez~0;MFrlI$q}{?=rUvK}YZbJ=|$zEbMIY4n!FY^9XQ zQs0&ejzo!rcw_rD@K5UM_v0VBfcm|O9{S-AdJ)|{!~IwI)*Bt|%BLf9liap)ouW29 zO5I!{o-S0!9wj1tllt(h^xN0Y>CeH{ezdcqd+XJKCRzJ*-5($KzdxqYdlfjd&h^mu`7l)~3XFSJr}L z)u1h$6i?TKZ1D+3x(DeeLvtE)&wws@7txPrWR0vE(vn(9b7mpBF1bxAZNzup&rEyr zjATAhXr}DpuE(sUKUrmL!#&~K1JKeRp0$Thk#I}uZ;}cm{5|lk5SuGN<1e^&e1j{coXZZO8`!?) 
z$%b$sf}c`bk!oK{WIKq+GLsp~EbryKBbwVB+NEM4`wcpBR)^7>GY|3bvW9&C37o;A zJa9u+0#Z2S&5UF;N>*YjtC4M>uK|bfCIZR%G6Ts3Lg`uTLRONb-nfzZpX3mH-$l=| z)+RnzymSpLL{^f;<^mbNDID#<(ThyG3p^AI2?VpmUr61eDNnWLZco~@M9Q)cpSp|{ZQL-*BYg%Fj;xD95mP!j*7p=`)C0Y=VA@u=y@1`MKiiT%B zp;+E^&2sbg#$1y+Lk9dvM%wM@Ro-iCOs{o8fHCnPeS`fWWF1#U5Y=O zh(A9IB~p>@2(99C+tTMiG^!ms*bb_b&>)FYBd}+&zBbHM;?~~iWG9Y$m`^uk+0U&9 z$$rg8snk7+{@usXmli!hlXPgS1$PDg1ZhMgB|jI0m38cv%vi9uHq^?yY2hxuwdHxS zm%;GA19ZwBk>((wj4##fHasmhYQd)(*mZSi?}_%m41c8RlE8Skz%SW}I10=fNi_T` z5tHoF&Bjap0djA~qUEhtk(xu+B=rEv`s3(bxG7#tY69ZnWL>2>9G5DS?93MbB5#sO z-XOkS)_B@LZ3eUL!01wKlvqZp@)F^7LVjK7HJrIiy;36GfzZ~BZz=RAI4*5vl};+I z?U73p`V}uC8r+nh(JpRE9Mg!NWOck9t;Ao&yHZM^RRjFzB`7(8jjkthSp-gv0=vH< za{mRX%Rcp`oPEgOa{CS}SPZtw8y2#Ly&V*n{8IkMBw5jsoTddmi_Xg1P*PX*A|8?0 zJ9A|q za=s5Ol3I2;W63&9JbVp?8{s_nJh~t?63MDETz!>#dK{y+XLixF3B;ECT*_sgJ{OES zP24<(XyrQ)e+{U;28qu>?tkFTr3y0w?#u_tKjp6*je{c#@OARP2Kc)gCF@8cn*>Hn zfv>XScGb-|OWZ0-5w;_5-D>ngc76pSyUy+*Rr(a36AcnAmfBGU^t6R5t>K2)gVe00 zM%t7+1E75%v`dw}9ek5IUI5Z=$Gr#Pji9&Wgt7)BIhF^zu6$!t_NB^-oFC&~L*j?P z>77{lMx?olpR2I*Nof3^M2b@LSq+VUkRQn!?o!T{!jJFBXvL?002j8R|01()%tY3@ zM5|?=Q)N7M4NH-=i+X5eYq&19Tb&w(>@cW>rZ$GL*C064wYkx{Ffu^u~=`)~l8C36t{!uQEegY9C zR~e12tm3Kfv4k<)nFE*p1Qq_pw;9CB7r~L6MD!s@L@-UN&w_ArCadP+8=7!UsyBk! 
zaxVdi)@8<0O_DtyN&IZW)ifv(j!F%?GaexoepQERsan_JPD`GZRsIlq7T+uD*wR{J z4Dlp>@XsQPIq$|`Qpeqjwo7ET01C!I^K@pvgn5s~w&#KRW0Cw%(EJI1!R~9gwaK*# zd2eGeHop{25Zf5X{rA8Dp?Ee=O+|ag@N)v*V+MOv|6ru!WN|U@KLaYnGGr%sGbol- zAd!n8yl95}g^1Q@XKxOvXG?85#pRNC))3dDN{-P5t%{|md*FlYHLbz3(X{S?59kU0 zq;WO?3zK~h!Jyk^qP1cuKTQmi%e+>CZwIKn$bPG}NLTW+sc7_g=$^*xCn4vNe7^oa zP2EXsT~`?gaD!2W5CSS8V1Za5B*qOD1$DuKGPDvykyx-oVgXV>sJIYlprlC}$MNtS zKie@L-m{;_=Xs3dx=EbDv|pK$H|EG&ulI|bVhy|CR*j{32XO9L zSo#i>hr%>`C$|!T)i({*STsHIjdhc6aC?nU3tYTfps((!BG!-{O?rwwBzKtmMTuOW#9N+&`Ixgjo$z3p7k(SxU*|XXiCZ6a^;_S-y*;-#YB8Ptx^Zxe-i<}O*FK74oio+F3sLh9r7 zeK;Rr`MXwd7jYl?Jk4L?9Xnt)S^l#k{-4;<0se=Ke|1M<<%Ax_i}fdBB)jso-tvDE zuR4E)FC@G6FZtw8&~Gx<@y@L3zkvSS#rho0M(dx!f9^r%PSAVV|5r)iv#jQ8jzsxS z7m4J_r|cZap7wTkDqbS<%l!Ix?EAJg+a>*96RQsS?FC#u#hcHdu}*iVo%N$}kLB0I zy%{v#<|8*rVpFTkd?rX^9?u7A_thtpmiw>=cz+r6E)S^kgYe!B#{|5CMGmXVzp^{; z1PL$baTsLY3ce-wCLf=>;kj3OKOHAyaUcKxu(PaLva)F9I&x_RX!AUeJcM@)H*++7rBE78E$K74?AY{A+ke2Sdue@@7WYYG3%$pAZwGr1 zb(22BtbB}JOp9q%dTy|UO0o0_c5{hz4qa{e`W5&W3hW5j9{%$b-iOgJ1!4(~9`zGS zhdl8LO{|mj20O?z?8$%R4#O+#;h$E)x#RjpvdFV?8S&nY-#o?fh`OwH^Y-w}ChpMn z-`HQWNjpU^$>+|Zeb6WQh0ISwPp@(>&b!f7$IGUx>-6|XpUZT3iT2|C-HzTucXs5S zF}v?8zJ7y+HQp}F$kk|FrJ^WTGh2GyhR zh2GP!#h#mmuFTPrZCci2^jT4^`|N4=DlKf0Sv+JLM}z3wSCi;=v0T|9lLM4);Hu@# ziF9MA9b_Y&O6zb<7T#l7!M^NR!p*AM)cwr6oxaun-S!{0Ki2+ZbHg5WRaY9W50mt9 z5?+A+v{J9W>uQ%-W5&$PsOl(4t|%*x74#gkmApsnLgA@(Z>ou+TKf^Q?9yV21TXQ& zT`l6diSTRW5n5|S&RNNJ@qj0M_wa%r={a$&4t_@aT|HGD=hS!c*q$=B4Xf$K^8y^< zbAvs+Dk?0HLk@YKLEW9Lsm^o+Q$Y)90PQowe@!K!0c?BM0@U6MsBTj<}^ue zt6Rm}yf)*?Yw$LFHsQSj!xYNSli4}7**SdHrwV(OJRfIyPvGMSo_h2>%VSz^Vjcn; z>YBJd!VZqI)DE&3LB%Fcw!r0kNt;Eh&s`^?*;fCH4 zIU3G3(D|G>&|mev0_S+~u`a#le&hz*tH2wnCo2#S44x@hqo3^V9$_V?NHsoI1)C@{ zskW=|IYd(VuQ3CTMflg16TzBbBB|uz5{c$n@{6KE-m9M25&cXT3Y(yYz{DHI)Ma%t zp{7#eZ>#+vNuMP3tmg;Nd_qL%P%mt=V5K1nRcjC-;Su zjx*oPJ%dE{{O9$|$Xd2E#2!}Q?bE73dN2;os1CM$n9Z(OgyvnYUde_d|T|y3%9=FNw{GQHf8@!a*JUtn!Z9 zy+Z3a4WyLXi+&dhM{D4+l9+?3AGh5;{piYl!i*W~@NN_vh^KUt+!+2sb;IwRD-G8t 
zSx?6C^I%3mMC0Sd9=dRK%JrHSP14PZW&l)jc^RQIl4Pb7Jees(%Ofb7Lg5}R=lRAl zQk%lzU+6n?=)|F$tnL=RZ;|sYK6u@6plzL2w&A&4AQL0vJIf`y=b8T*=gA-}p{O+z z>PJmxELmB_suES=eK~f?Z$;l(d&TQAlRJ&Z*h=KphnG>GZsphCc>Rq>9&BrSvaPM{ z9o~uXwzjt@l|O0NQhuvf%idM?Utdf8djPe+Sq|Jica`sb6aL{3wD~U$E%?B^?=C#= M|9`pd?WMc_0gd#p&j0`b literal 0 HcmV?d00001 diff --git a/tests/features/test_dicom.py b/tests/features/test_dicom.py index 16ea0b27900..577309da3fa 100644 --- a/tests/features/test_dicom.py +++ b/tests/features/test_dicom.py @@ -96,6 +96,24 @@ def test_dataset_cast_dicom_column(shared_datadir): assert decoded_item["bytes"] is None +@require_pydicom +def test_dicom_force_parameter(shared_datadir): + """Test loading DICOM file that requires force=True""" + import pydicom + + # File from: https://github.com/pydicom/pydicom/blob/main/src/pydicom/data/test_files/no_meta.dcm + # This file is missing DICOM File Meta Information header but can be read using force=True + dicom_path = str(shared_datadir / "test_dicom_no_meta.dcm") + + ds_no_force = Dataset.from_dict({"dicom": [dicom_path]}).cast_column("dicom", Dicom(force=False)) + with pytest.raises(pydicom.errors.InvalidDicomError): + item = ds_no_force[0] + + ds_with_force = Dataset.from_dict({"dicom": [dicom_path]}).cast_column("dicom", Dicom(force=True)) + item = ds_with_force[0] + assert isinstance(item["dicom"], pydicom.dataset.FileDataset) + + @require_pydicom def test_encode_pydicom_dataset(tmp_path): import pydicom From 092a94f8b95f65222c167c22732a00698a2fbe4e Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 28 Oct 2025 11:30:53 +0100 Subject: [PATCH 06/10] allow dicom force=True setting --- src/datasets/features/dicom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/features/dicom.py b/src/datasets/features/dicom.py index f68c74eccfe..839fa4722a5 100644 --- a/src/datasets/features/dicom.py +++ b/src/datasets/features/dicom.py @@ -54,7 +54,7 @@ class Dicom: >>> from datasets import Dataset, Dicom >>> ds = 
Dataset.from_dict({"dicom": ["path/to/file.dcm"]}).cast_column("dicom", Dicom()) >>> ds.features["dicom"] - Dicom(decode=True, id=None) + Dicom(decode=True, force=False, id=None) >>> ds[0]["dicom"] >>> ds = ds.cast_column("dicom", Dicom(decode=False)) From 7b23de6a0efc3d42126106e97c406cc72c58cb96 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 28 Oct 2025 12:33:58 +0100 Subject: [PATCH 07/10] update documentation and add dicom_dataset.mdx --- docs/source/_toctree.yml | 2 + docs/source/dicom_dataset.mdx | 121 ++++++++++++++++++ .../package_reference/loading_methods.mdx | 6 + .../source/package_reference/main_classes.mdx | 4 + 4 files changed, 133 insertions(+) create mode 100644 docs/source/dicom_dataset.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index cc6b7195fe2..08ba233ce8d 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -90,6 +90,8 @@ title: Create a document dataset - local: nifti_dataset title: Create a medical imaging dataset + - local: dicom_dataset + title: Create a medical dataset, containing images, signals or videos and additional metadata title: "Vision" - sections: - local: nlp_load diff --git a/docs/source/dicom_dataset.mdx b/docs/source/dicom_dataset.mdx new file mode 100644 index 00000000000..728690a4f9d --- /dev/null +++ b/docs/source/dicom_dataset.mdx @@ -0,0 +1,121 @@ +# Create a DICOM dataset + +This page shows how to create and share a dataset of medical data in DICOM format (.dcm / .dicom) using the `datasets` library. + +You can share a dataset with your team or with anyone in the community by creating a dataset repository on the Hugging Face Hub: + +```py +from datasets import load_dataset + +dataset = load_dataset("/my_dicom_dataset") +``` + +There are two common ways to create a DICOM dataset: + +- Create a dataset from local DICOM files in Python and upload it with `Dataset.push_to_hub`.
+- Use a folder-based convention (one file per example) and a small helper to convert it into a `Dataset`. + +> [!TIP] +> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information. + +## Local files + +If you already have a list of file paths to DICOM files, the easiest workflow is to create a `Dataset` from that list and cast the column to the `Dicom` feature. + +```py +from datasets import Dataset, Dicom + +# simple example: create a dataset from file paths +files = ["/path/to/file_001.dcm", "/path/to/file_002.dcm"] +ds = Dataset.from_dict({"dicom": files}).cast_column("dicom", Dicom()) +``` + +Pydicom, the library used to handle DICOM files, supports loading DICOM files missing File Meta Information header or 'DICM' prefix using the `force=True` parameter, which defaults to `force=False`. + +```py +from datasets import Dataset, Dicom + +ds = Dataset.from_dict({"dicom": ["/path/to/file_without_meta.dcm"]}).cast_column("dicom", Dicom(force=True)) +img = ds[0]["dicom"] +arr = img.pixel_array +``` + +After preparing the dataset you can push it to the Hub: + +```py +ds.push_to_hub("/my_dicom_dataset") +``` + +This will create a dataset repository containing your DICOM dataset with a `data/` folder of parquet shards. + +## Folder conventions and metadata + +If you organize your dataset in folders you can create splits automatically (train/test/validation) by following a structure like: + +``` +dataset/train/scan_0001.dcm +dataset/train/scan_0002.dcm +dataset/validation/scan_1001.dcm +dataset/test/scan_2001.dcm +``` + +If you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` in the folder so files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field with the relative path to the DICOM file next to the metadata file. 
+ +Example `metadata.csv`: + +```csv +file_name,patient_id,age,diagnosis +scan_0001.dcm,P001,45,healthy +scan_0002.dcm,P002,59,disease_x +``` + +## Converting to PyTorch tensors + +The numerical data (signals, images or videos) are stored under `dicom_object.pixel_array`. Note that not all DICOM files need to contain these. You can convert these to PyTorch tensors on-the-fly using a dataset transformation. + +Use the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset: + +```py +import torch +import pydicom +import numpy as np + +def transform_to_pytorch(example): + example["dicom_torch"] = [torch.tensor(ex.pixel_array) for ex in example["dicom"]] + return example + +ds.set_transform(transform_to_pytorch) + +``` +Accessing elements now (e.g. `ds[0]`) will yield torch tensors in the `"dicom_torch"` key. + + +## Usage of Pydicom + +The DICOM files are loaded using the [pydicom](https://pydicom.github.io/) library. Therefore, you can use all functionality of pydicom to access metadata and pixel data. 
+ +```python +from datasets import load_dataset +dicom_ds = load_dataset("/my_dicom_dataset") +for dicom_img in dicom_ds: + dicom_object = dicom_img["dicom"] + print(dicom_object.PatientID) + print(dicom_object.StudyDate) + pixel_array = dicom_object.pixel_array + print(pixel_array.shape) +``` + +You can visualize the DICOM images using matplotlib as follows: + +```Python +import matplotlib.pyplot as plt +from datasets import load_dataset +dicom_ds = load_dataset("/my_dicom_dataset") +for dicom_img in dicom_ds: + dicom_object = dicom_img["dicom"] + plt.imshow(dicom_object.pixel_array, cmap=plt.cm.gray) + plt.show() +``` + +For further reading we refer to the [pydicom documentation](https://pydicom.github.io/pydicom/stable/) and [tutorials](https://pydicom.github.io/pydicom/stable/tutorials/index.html) +--- diff --git a/docs/source/package_reference/loading_methods.mdx b/docs/source/package_reference/loading_methods.mdx index 4792d1b88f7..dbe840deeea 100644 --- a/docs/source/package_reference/loading_methods.mdx +++ b/docs/source/package_reference/loading_methods.mdx @@ -109,6 +109,12 @@ load_dataset("csv", data_dir="path/to/data/dir", sep="\t") [[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolder +### Dicom + +[[autodoc]] datasets.packaged_modules.dicomfolder.DicomFolderConfig + +[[autodoc]] datasets.packaged_modules.dicomfolder.DicomFolder + ### WebDataset [[autodoc]] datasets.packaged_modules.webdataset.WebDataset diff --git a/docs/source/package_reference/main_classes.mdx b/docs/source/package_reference/main_classes.mdx index 84e651f9171..873a8c72029 100644 --- a/docs/source/package_reference/main_classes.mdx +++ b/docs/source/package_reference/main_classes.mdx @@ -275,6 +275,10 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable [[autodoc]] datasets.Nifti +### Dicom + +[[autodoc]] datasets.Dicom + ## Filesystems [[autodoc]] datasets.filesystems.is_remote_filesystem From de0787fc4291cba5c205a2586df14e170f30ec56 Mon Sep 17 
00:00:00 2001 From: Tobias Pitters Date: Wed, 5 Nov 2025 18:39:47 +0100 Subject: [PATCH 08/10] add medical imaging dataset markdown file as unification for dicom and nifti --- docs/source/dicom_dataset.mdx | 121 ------------------ ...ataset.mdx => medical_imaging_dataset.mdx} | 100 ++++++++++++--- 2 files changed, 84 insertions(+), 137 deletions(-) delete mode 100644 docs/source/dicom_dataset.mdx rename docs/source/{nifti_dataset.mdx => medical_imaging_dataset.mdx} (51%) diff --git a/docs/source/dicom_dataset.mdx b/docs/source/dicom_dataset.mdx deleted file mode 100644 index 728690a4f9d..00000000000 --- a/docs/source/dicom_dataset.mdx +++ /dev/null @@ -1,121 +0,0 @@ -# Create a DICOM dataset - -This page shows how to create and share a dataset of medical data in DICOM format (.dcm / .dicom) using the `datasets` library. - -You can share a dataset with your team or with anyone in the community by creating a dataset repository on the Hugging Face Hub: - -```py -from datasets import load_dataset - -dataset = load_dataset("/my_dicom_dataset") -``` - -There are two common ways to create a DICOM dataset: - -- Create a dataset from local DICOM files in Python and upload it with `Dataset.push_to_hub`. -- Use a folder-based convention (one file per example) and a small helper to convert it into a `Dataset`. - -> [!TIP] -> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information. - -## Local files - -If you already have a list of file paths to DICOM files, the easiest workflow is to create a `Dataset` from that list and cast the column to the `Dicom` feature. 
- -```py -from datasets import Dataset, Dicom - -# simple example: create a dataset from file paths -files = ["/path/to/file_001.dcm", "/path/to/file_002.dcm"] -ds = Dataset.from_dict({"dicom": files}).cast_column("dicom", Dicom()) -``` - -Pydicom, the library used to handle DICOM files, supports loading DICOM files missing File Meta Information header or 'DICM' prefix using the `force=True` parameter, which defaults to `force=False`. - -```py -from datasets import Dataset, Dicom - -ds = Dataset.from_dict({"dicom": ["/path/to/file_without_meta.dcm"]}).cast_column("dicom", Dicom(force=True)) -img = ds[0]["dicom"] -arr = img.pixel_array -``` - -After preparing the dataset you can push it to the Hub: - -```py -ds.push_to_hub("/my_dicom_dataset") -``` - -This will create a dataset repository containing your DICOM dataset with a `data/` folder of parquet shards. - -## Folder conventions and metadata - -If you organize your dataset in folders you can create splits automatically (train/test/validation) by following a structure like: - -``` -dataset/train/scan_0001.dcm -dataset/train/scan_0002.dcm -dataset/validation/scan_1001.dcm -dataset/test/scan_2001.dcm -``` - -If you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` in the folder so files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field with the relative path to the DICOM file next to the metadata file. - -Example `metadata.csv`: - -```csv -file_name,patient_id,age,diagnosis -scan_0001.dcm,P001,45,healthy -scan_0002.dcm,P002,59,disease_x -``` - -## Converting to PyTorch tensors - -The numerical data (signals, images or videos) are stored under `dicom_object.pixel_array`. Note that not all DICOM files need to contain these. You can convert these to PyTorch tensors on-the-fly using a dataset transformation. 
- -Use the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset: - -```py -import torch -import pydicom -import numpy as np - -def transform_to_pytorch(example): - example["dicom_torch"] = [torch.tensor(ex.pixel_array) for ex in example["dicom"]] - return example - -ds.set_transform(transform_to_pytorch) - -``` -Accessing elements now (e.g. `ds[0]`) will yield torch tensors in the `"dicom_torch"` key. - - -## Usage of Pydicom - -The DICOM files are loaded using the [pydicom](https://pydicom.github.io/) library. Therefore, you can use all functionality of pydicom to access metadata and pixel data. - -```python -from datasets import load_dataset -dicom_ds = load_dataset("/my_dicom_dataset") -for dicom_img in dicom_ds: - dicom_object = dicom_img["dicom"] - print(dicom_object.PatientID) - print(dicom_object.StudyDate) - pixel_array = dicom_object.pixel_array - print(pixel_array.shape) -``` - -You can visualize the DICOM images using matplotlib as follows: - -```Python -import matplotlib.pyplot as plt -from datasets import load_dataset -dicom_ds = load_dataset("/my_dicom_dataset") -for dicom_img in dicom_ds: - dicom_object = dicom_img["dicom"] - plt.imshow(dicom_object.pixel_array, cmap=plt.cm.gray) - plt.show() -``` - -For further reading we refer to the [pydicom documentation](https://pydicom.github.io/pydicom/stable/) and [tutorials](https://pydicom.github.io/pydicom/stable/tutorials/index.html) ---- diff --git a/docs/source/nifti_dataset.mdx b/docs/source/medical_imaging_dataset.mdx similarity index 51% rename from docs/source/nifti_dataset.mdx rename to docs/source/medical_imaging_dataset.mdx index 2770460fbaf..04fedcbb11a 100644 --- a/docs/source/nifti_dataset.mdx +++ b/docs/source/medical_imaging_dataset.mdx @@ -1,26 +1,34 @@ -# Create a NIfTI dataset +## Medical Imaging Dataset Guide -This page shows how to create and share a dataset of medical images in NIfTI format (.nii / .nii.gz) using the `datasets` library. 
+There are a couple of formats commonly used for medical imaging data, including DICOM and NIfTI. This guide covers how to create and share datasets in both formats using the `datasets` library. + +These are typically used for: + - NIfTI: Storing MRI, fMRI, CT, PET scans in research settings. NIfTI stands for Neuroimaging Informatics Technology Initiative. + - DICOM: Storing medical images in clinical settings, including metadata about patients and imaging procedures. DICOM stands for Digital Imaging and Communications in Medicine. + +### Create a NIfTI or DICOM dataset + +This page shows how to create and share a dataset of medical images in NIfTI format (.nii / .nii.gz) or DICOM format (.dcm) using the `datasets` library. You can share a dataset with your team or with anyone in the community by creating a dataset repository on the Hugging Face Hub: ```py from datasets import load_dataset -dataset = load_dataset("/my_nifti_dataset") +dataset = load_dataset("/my_nifti_or_dicom_dataset") ``` -There are two common ways to create a NIfTI dataset: +There are two common ways to create a NIfTI or DICOM dataset: -- Create a dataset from local NIfTI files in Python and upload it with `Dataset.push_to_hub`. +- Create a dataset from local files in Python and upload it with `Dataset.push_to_hub`. - Use a folder-based convention (one file per example) and a small helper to convert it into a `Dataset`. > [!TIP] > You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information. -## Local files +### Local files -If you already have a list of file paths to NIfTI files, the easiest workflow is to create a `Dataset` from that list and cast the column to the `Nifti` feature. +If you already have a list of file paths to medical imaging files, the easiest workflow is to create a `Dataset` from that list and cast the column to the matching feature (`Nifti` or `Dicom`).
```py from datasets import Dataset @@ -35,7 +43,17 @@ ds = Dataset.from_dict({"nifti": files}).cast_column("nifti", Nifti()) # or a dict {'bytes': None, 'path': '...'} when decode=False ``` -The `Nifti` feature supports a `decode` parameter. When `decode=True` (the default), it loads the NIfTI file into a `nibabel.nifti1.Nifti1Image` object. You can access the image data as a numpy array with `img.get_fdata()`. When `decode=False`, it returns a dict with the file path and bytes. +For DICOM files, use the `Dicom` feature: +```python +from datasets import Dataset, Dicom + +# simple example: create a dataset from file paths +files = ["/path/to/file_001.dcm", "/path/to/file_002.dcm"] +ds = Dataset.from_dict({"dicom": files}).cast_column("dicom", Dicom()) +``` + +The `Nifti` and `Dicom` features both support a `decode` parameter. When `decode=True` (the default), a NIfTI file is loaded into a `nibabel.nifti1.Nifti1Image` object and a DICOM file into a `pydicom.dataset.FileDataset` object. For NIfTI files you can access the image data as a NumPy array with `img.get_fdata()`; for DICOM files use `img.pixel_array`. +When `decode=False`, both features return a dict with the file path and bytes. ```py from datasets import Dataset, Nifti @@ -45,15 +63,23 @@ img = ds[0]["nifti"] # instance of: nibabel.nifti1.Nifti1Image arr = img.get_fdata() ``` +```python +from datasets import Dataset, Dicom + +ds = Dataset.from_dict({"dicom": ["/path/to/file_without_meta.dcm"]}).cast_column("dicom", Dicom(decode=True)) +img = ds[0]["dicom"] +arr = img.pixel_array +``` + After preparing the dataset you can push it to the Hub: ```py -ds.push_to_hub("/my_nifti_dataset") +ds.push_to_hub("/my_nifti_or_dicom_dataset") ``` -This will create a dataset repository containing your NIfTI dataset with a `data/` folder of parquet shards. +This will create a dataset repository containing your medical imaging dataset with a `data/` folder of parquet shards.
-## Folder conventions and metadata +### Folder conventions and metadata If you organize your dataset in folders you can create splits automatically (train/test/validation) by following a structure like: @@ -64,7 +90,7 @@ dataset/validation/scan_1001.nii dataset/test/scan_2001.nii ``` -If you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` in the folder so files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field with the relative path to the NIfTI file next to the metadata file. +If you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` in the folder so files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field with the relative path to the NIfTI/DICOM file next to the metadata file. Example `metadata.csv`: @@ -74,7 +100,7 @@ scan_0001.nii.gz,P001,45,healthy scan_0002.nii.gz,P002,59,disease_x ``` -The `Nifti` feature works with zipped datasets too — each zip can contain NIfTI files and a metadata file. This is useful when uploading large datasets as archives. +The `Nifti` feature works with zipped datasets too — each zip can contain NIfTI files and a metadata file. This is useful when uploading large datasets as archives. NOTE: This is not supported for DICOM files. This means your dataset structure could look like this (mixed compressed and uncompressed files): ``` dataset/train/scan_0001.nii.gz @@ -83,7 +109,7 @@ dataset/validation/scan_1001.nii.gz dataset/test/scan_2001.nii ``` -## Converting to PyTorch tensors +### Converting to PyTorch tensors Use the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset: @@ -99,10 +125,23 @@ def transform_to_pytorch(example): ds.set_transform(transform_to_pytorch) ``` -Accessing elements now (e.g. `ds[0]`) will yield torch tensors in the `"nifti_torch"` key. 
+ +```py +import torch +import pydicom +import numpy as np + +def transform_to_pytorch(example): + example["dicom_torch"] = [torch.tensor(ex.pixel_array) for ex in example["dicom"]] + return example + +ds.set_transform(transform_to_pytorch) + +``` +Accessing elements now (e.g. `ds[0]`) will yield torch tensors in the `"nifti_torch"` or `"dicom_torch"` key, respectively. -## Usage of NifTI1Image +### Usage of Nifti1Image NifTI is a format to store the result of 3 (or even 4) dimensional brain scans. This includes 3 spatial dimensions (x,y,z) and optionally a time dimension (t). Furthermore, the given positions here are only relative to the scanner, therefore @@ -127,4 +166,33 @@ for epi_img in nifti_ds: ``` For further reading we refer to the [nibabel documentation](https://nipy.org/nibabel/index.html) and especially [this nibabel tutorial](https://nipy.org/nibabel/coordinate_systems.html) + +### Usage of pydicom + +The DICOM files are loaded using the [pydicom](https://pydicom.github.io/) library. Therefore, you can use all functionality of pydicom to access metadata and pixel data.
+ +```python +from datasets import load_dataset +dicom_ds = load_dataset("/my_dicom_dataset") +for dicom_img in dicom_ds: + dicom_object = dicom_img["dicom"] + print(dicom_object.PatientID) + print(dicom_object.StudyDate) + pixel_array = dicom_object.pixel_array + print(pixel_array.shape) +``` + +You can visualize the DICOM images using matplotlib as follows: + +```Python +import matplotlib.pyplot as plt +from datasets import load_dataset +dicom_ds = load_dataset("/my_dicom_dataset") +for dicom_img in dicom_ds: + dicom_object = dicom_img["dicom"] + plt.imshow(dicom_object.pixel_array, cmap=plt.cm.gray) + plt.show() +``` + +For further reading we refer to the [pydicom documentation](https://pydicom.github.io/pydicom/stable/) and [tutorials](https://pydicom.github.io/pydicom/stable/tutorials/index.html) --- From ba1d8b62e244e8d5634b64022d493f56f1d59644 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 15 Nov 2025 12:16:22 +0100 Subject: [PATCH 09/10] add embed_storage function to dicom --- src/datasets/features/dicom.py | 42 +++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/datasets/features/dicom.py b/src/datasets/features/dicom.py index 839fa4722a5..a82b7336a9d 100644 --- a/src/datasets/features/dicom.py +++ b/src/datasets/features/dicom.py @@ -10,7 +10,7 @@ from ..download.download_config import DownloadConfig from ..table import array_cast from ..utils.file_utils import is_local_path, xopen -from ..utils.py_utils import string_to_dict +from ..utils.py_utils import no_op_if_value_is_null, string_to_dict class DicomDict(TypedDict): @@ -221,6 +221,46 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.BinaryA storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null()) return array_cast(storage, self.pa_type) + def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray: + """Embed DICOM files into the Arrow 
array. + + Args: + storage (`pa.StructArray`): + PyArrow array to embed. + + Returns: + `pa.StructArray`: Array in the DICOM Arrow storage type, that is + `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. + """ + if token_per_repo_id is None: + token_per_repo_id = {} + + @no_op_if_value_is_null + def path_to_bytes(path): + source_url = path.split("::")[-1] + pattern = ( + config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL + ) + source_url_fields = string_to_dict(source_url, pattern) + token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None + download_config = DownloadConfig(token=token) + with xopen(path, "rb", download_config=download_config) as f: + return f.read() + + bytes_array = pa.array( + [ + (path_to_bytes(x["path"]) if x["bytes"] is None else x["bytes"]) if x is not None else None + for x in storage.to_pylist() + ], + type=pa.binary(), + ) + path_array = pa.array( + [os.path.basename(path) if path is not None else None for path in storage.field("path").to_pylist()], + type=pa.string(), + ) + storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null()) + return array_cast(storage, self.pa_type) + def encode_pydicom_dataset(dicom_ds: "pydicom.FileDataset") -> dict[str, Optional[Union[str, bytes]]]: """ From a784445c48a90d0296f013129727b718ef8bab93 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 15 Nov 2025 20:17:49 +0100 Subject: [PATCH 10/10] add test for embed_storage --- tests/features/test_dicom.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/features/test_dicom.py b/tests/features/test_dicom.py index 577309da3fa..10cf1cafd35 100644 --- a/tests/features/test_dicom.py +++ b/tests/features/test_dicom.py @@ -1,5 +1,6 @@ from pathlib import Path +import pyarrow as pa import pytest from datasets import Dataset, Dicom, Features @@ -114,6 +115,27 @@ def
test_dicom_force_parameter(shared_datadir): assert isinstance(item["dicom"], pydicom.dataset.FileDataset) +@require_pydicom +def test_embed_storage(shared_datadir): + from io import BytesIO + + import pydicom + + dicom_path = str(shared_datadir / "test_dicom_693_J2KI.dcm") + img = pydicom.dcmread(dicom_path) + dicom = Dicom() + bytes_array = pa.array([None], type=pa.binary()) + path_array = pa.array([dicom_path], type=pa.string()) + storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"]) + embedded_storage = dicom.embed_storage(storage) + embedded_bytes = embedded_storage[0]["bytes"].as_py() + bio = BytesIO(embedded_bytes) + dicom_file = pydicom.dcmread(bio) + assert embedded_bytes is not None + for key in img.keys(): + assert dicom_file[key] == img[key] + + @require_pydicom def test_encode_pydicom_dataset(tmp_path): import pydicom