From 4e18df164eec0a4472cb7703ef224e213b1e428d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:55:56 +0200 Subject: [PATCH 01/52] Add parquet scan options and docs (#7801) parquet scan options and docs --- .../packaged_modules/parquet/parquet.py | 56 ++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/src/datasets/packaged_modules/parquet/parquet.py b/src/datasets/packaged_modules/parquet/parquet.py index 10797753657..9921a2be6b9 100644 --- a/src/datasets/packaged_modules/parquet/parquet.py +++ b/src/datasets/packaged_modules/parquet/parquet.py @@ -15,12 +15,63 @@ @dataclass class ParquetConfig(datasets.BuilderConfig): - """BuilderConfig for Parquet.""" + """ + BuilderConfig for Parquet. + + Args: + batch_size (`int`, *optional*): + Size of the RecordBatches to iterate on. + The default is the row group size (defined by the first row group). + columns (`list[str]`, *optional*) + List of columns to load, the other ones are ignored. + All columns are loaded by default. + features: (`Features`, *optional*): + Cast the data to `features`. + filters (`Union[pyarrow.dataset.Expression, list[tuple], list[list[tuple]]]`, *optional*): + Return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the partition information + or internal metadata found in the data source, e.g. Parquet statistics. + Otherwise filters the loaded RecordBatches before yielding them. + fragment_scan_options (`pyarrow.dataset.ParquetFragmentScanOptions`) + Scan-specific options for Parquet fragments. + This is especially useful to configure buffering and caching. + + Example: + + Load a subset of columns: + + ```python + >>> ds = load_dataset(parquet_dataset_id, columns=["col_0", "col_1"]) + ``` + + Stream data and efficiently filter data, possibly skipping entire files or row groups: + + ```python + >>> filters = [("col_0", "==", 0)] + >>> ds = load_dataset(parquet_dataset_id, streaming=True, filters=filters) + ``` + + Increase the minimum request size when streaming from 32MiB (default) to 128MiB and enable prefetching: + + ```python + >>> import pyarrow + >>> import pyarrow.dataset + >>> fragment_scan_options = pyarrow.dataset.ParquetFragmentScanOptions( + ... cache_options=pyarrow.CacheOptions( + ... prefetch_limit=1, + ... range_size_limit=128 << 20 + ... ), + ... 
) + >>> ds = load_dataset(parquet_dataset_id, streaming=True, fragment_scan_options=fragment_scan_options) + ``` + + """ batch_size: Optional[int] = None columns: Optional[list[str]] = None features: Optional[datasets.Features] = None filters: Optional[Union[ds.Expression, list[tuple], list[list[tuple]]]] = None + fragment_scan_options: Optional[ds.ParquetFragmentScanOptions] = None def __post_init__(self): super().__post_init__() @@ -84,9 +135,10 @@ def _generate_tables(self, files): if isinstance(self.config.filters, list) else self.config.filters ) + parquet_file_format = ds.ParquetFileFormat(default_fragment_scan_options=self.config.fragment_scan_options) for file_idx, file in enumerate(itertools.chain.from_iterable(files)): with open(file, "rb") as f: - parquet_fragment = ds.ParquetFileFormat().make_fragment(f) + parquet_fragment = parquet_file_format.make_fragment(f) if parquet_fragment.row_groups: batch_size = self.config.batch_size or parquet_fragment.row_groups[0].num_rows try: From cfcdfce542f7ea6a0b98fafa1fb8a78d65c960b5 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 9 Oct 2025 12:01:43 +0200 Subject: [PATCH 02/52] More Parquet streaming docs (#7803) * more parquet stream arg docs * minor * minor --- docs/source/stream.mdx | 17 ++++++++++++++++- .../packaged_modules/parquet/parquet.py | 4 +++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/source/stream.mdx b/docs/source/stream.mdx index 67f1ff420cd..aa72faff50b 100644 --- a/docs/source/stream.mdx +++ b/docs/source/stream.mdx @@ -19,7 +19,8 @@ For example, the English split of the [HuggingFaceFW/fineweb](https://huggingfac >>> from datasets import load_dataset >>> dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True) >>> print(next(iter(dataset))) -{'text': "How AP reported in all formats from tornado-stricken regionsMarch 8, 2012\nWhen the first serious bout of tornadoes of 2012 blew through middle America in the middle of the night, they touched down in places hours from any AP bureau... +{'text': 'How AP reported in all formats from tornado-stricken regionsMarch 8, 2012\nWhen the first serious bout of tornadoes of 2012 blew through middle America in the middle of the night, they touched down in places hours from any AP bureau...', ..., + 'language_score': 0.9721424579620361, 'token_count': 717} ``` Dataset streaming also lets you work with a dataset made of local files without doing any conversion. @@ -29,6 +30,7 @@ This is especially helpful when: - You don't want to wait for an extremely large local dataset to be converted to Arrow. - The converted files size would exceed the amount of available disk space on your computer. - You want to quickly explore just a few samples of a dataset. +- You want to load only certain columns or efficiently filter a Parquet dataset. For example, you can stream a local dataset of hundreds of compressed JSONL files like [oscar-corpus/OSCAR-2201](https://huggingface.co/datasets/oscar-corpus/OSCAR-2201) to use it instantly: @@ -40,6 +42,19 @@ For example, you can stream a local dataset of hundreds of compressed JSONL file {'id': 0, 'text': 'Founded in 2015, Golden Bees is a leading programmatic recruitment platform dedicated to employers, HR agencies and job boards. The company has developed unique HR-custom technologies and predictive algorithms to identify and attract the best candidates for a job opportunity.', ... 
``` +Parquet is a columnar format that allows you to stream and load only a subset of columns and ignore unwanted columns. Parquet also stores metadata such as column statistics (at the file and row group level), enabling efficient filtering. Use the `columns` and `filters` arguments of [`datasets.packaged_modules.parquet.ParquetConfig`] to stream Parquet datasets, select columns, and apply filters: + +```py +>>> from datasets import load_dataset +>>> dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True, columns=["url", "date"]) +>>> print(next(iter(dataset))) +{'url': 'http://%20jwashington@ap.org/Content/Press-Release/2012/How-AP-reported-in-all-formats-from-tornado-stricken-regions', 'date': '2013-05-18T05:48:54Z'} +>>> dataset = load_dataset('HuggingFaceFW/fineweb', split='train', streaming=True, filters=[("language_score", ">=", 0.99)]) +>>> print(next(iter(dataset))) +{'text': 'Everyone wishes for something. And lots of people believe they know how to make their wishes come true with magical thinking.\nWhat is it? "Magical thinking is a belief in forms of causation, with no known physical basis," said Professor Emily Pronin of Princeton...', ..., + 'language_score': 0.9900368452072144, 'token_count': 716} +``` + Loading a dataset in streaming mode creates a new dataset type instance (instead of the classic [`Dataset`] object), known as an [`IterableDataset`]. This special type of dataset has its own set of processing methods shown below. diff --git a/src/datasets/packaged_modules/parquet/parquet.py b/src/datasets/packaged_modules/parquet/parquet.py index 9921a2be6b9..c6ef7b075c9 100644 --- a/src/datasets/packaged_modules/parquet/parquet.py +++ b/src/datasets/packaged_modules/parquet/parquet.py @@ -32,10 +32,12 @@ class ParquetConfig(datasets.BuilderConfig): If possible the predicate will be pushed down to exploit the partition information or internal metadata found in the data source, e.g. Parquet statistics. Otherwise filters the loaded RecordBatches before yielding them. - fragment_scan_options (`pyarrow.dataset.ParquetFragmentScanOptions`) + fragment_scan_options (`pyarrow.dataset.ParquetFragmentScanOptions`, *optional*) Scan-specific options for Parquet fragments. This is especially useful to configure buffering and caching. + + Example: Load a subset of columns: From 02ee330625d3276b57b88cd513ab628bf38d1a3e Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 9 Oct 2025 16:01:55 +0200 Subject: [PATCH 03/52] Less api calls when resolving data_files (#7805) less api calls when resolving data_files --- src/datasets/data_files.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 9fefd4a4c69..9710bc84a8e 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -503,6 +503,18 @@ def _get_origin_metadata( max_workers: Optional[int] = None, ) -> list[SingleOriginMetadata]: max_workers = max_workers if max_workers is not None else config.HF_DATASETS_MULTITHREADING_MAX_WORKERS + if all("hf://" in data_file for data_file in data_files): + # No need for multithreading here since the origin metadata of HF files + # is (repo_id, revision) and is cached after first .info() call. 
+ return [ + _get_single_origin_metadata(data_file, download_config=download_config) + for data_file in hf_tqdm( + data_files, + desc="Resolving data files", + # set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached + disable=len(data_files) <= 16 or None, + ) + ] return thread_map( partial(_get_single_origin_metadata, download_config=download_config), data_files, From 5eec91a1a8634f9a1bd5bbbb19ed116d6bffc6e1 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:04:33 +0200 Subject: [PATCH 04/52] Parquet: add `on_bad_file` argument to error/warn/skip bad files (#7806) add on_bad_file --- .../packaged_modules/parquet/parquet.py | 50 ++++++++++++++----- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/src/datasets/packaged_modules/parquet/parquet.py b/src/datasets/packaged_modules/parquet/parquet.py index c6ef7b075c9..a583d31c50d 100644 --- a/src/datasets/packaged_modules/parquet/parquet.py +++ b/src/datasets/packaged_modules/parquet/parquet.py @@ -1,6 +1,6 @@ import itertools from dataclasses import dataclass -from typing import Optional, Union +from typing import Literal, Optional, Union import pyarrow as pa import pyarrow.dataset as ds @@ -36,6 +36,13 @@ class ParquetConfig(datasets.BuilderConfig): Scan-specific options for Parquet fragments. This is especially useful to configure buffering and caching. + + on_bad_file (`Literal["error", "warn", "skip"]`, *optional*, defaults to "error") + Specify what to do upon encountering a bad file (a file that can't be read). Allowed values are : + * 'error', raise an Exception when a bad file is encountered. + * 'warn', raise a warning when a bad file is encountered and skip that file. + * 'skip', skip bad files without raising or warning when they are encountered. + Example: @@ -74,6 +81,7 @@ class ParquetConfig(datasets.BuilderConfig): features: Optional[datasets.Features] = None filters: Optional[Union[ds.Expression, list[tuple], list[list[tuple]]]] = None fragment_scan_options: Optional[ds.ParquetFragmentScanOptions] = None + on_bad_file: Literal["error", "warn", "skip"] = "error" def __post_init__(self): super().__post_init__() @@ -109,9 +117,22 @@ def _split_generators(self, dl_manager): # Infer features if they are stored in the arrow schema if self.info.features is None: for file in itertools.chain.from_iterable(files): - with open(file, "rb") as f: - self.info.features = datasets.Features.from_arrow_schema(pq.read_schema(f)) - break + try: + with open(file, "rb") as f: + self.info.features = datasets.Features.from_arrow_schema(pq.read_schema(f)) + break + except pa.ArrowInvalid as e: + if self.config.on_bad_file == "error": + logger.error(f"Failed to read schema from '{file}' with error {type(e).__name__}: {e}") + raise + elif self.config.on_bad_file == "warn": + logger.warning(f"Skipping bad schema from '{file}'. {type(e).__name__}: {e}`") + else: + logger.debug(f"Skipping bad schema from '{file}'. 
{type(e).__name__}: {e}`") + if self.info.features is None: + raise ValueError( + f"At least one valid data file must be specified, all the data_files are invalid: {self.config.data_files}" + ) splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) if self.config.columns is not None and set(self.config.columns) != set(self.info.features): self.info.features = datasets.Features( @@ -139,11 +160,11 @@ def _generate_tables(self, files): ) parquet_file_format = ds.ParquetFileFormat(default_fragment_scan_options=self.config.fragment_scan_options) for file_idx, file in enumerate(itertools.chain.from_iterable(files)): - with open(file, "rb") as f: - parquet_fragment = parquet_file_format.make_fragment(f) - if parquet_fragment.row_groups: - batch_size = self.config.batch_size or parquet_fragment.row_groups[0].num_rows - try: + try: + with open(file, "rb") as f: + parquet_fragment = parquet_file_format.make_fragment(f) + if parquet_fragment.row_groups: + batch_size = self.config.batch_size or parquet_fragment.row_groups[0].num_rows for batch_idx, record_batch in enumerate( parquet_fragment.to_batches( batch_size=batch_size, @@ -158,6 +179,11 @@ def _generate_tables(self, files): # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}") # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows))) yield f"{file_idx}_{batch_idx}", self._cast_table(pa_table) - except ValueError as e: - logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") - raise + except (pa.ArrowInvalid, ValueError) as e: + if self.config.on_bad_file == "error": + logger.error(f"Failed to read file '{file}' with error {type(e).__name__}: {e}") + raise + elif self.config.on_bad_file == "warn": + logger.warning(f"Skipping bad file '{file}'. {type(e).__name__}: {e}`") + else: + logger.debug(f"Skipping bad file '{file}'. {type(e).__name__}: {e}`") From fd8d287a98c4970d4634f92f5d7b74eb73e7dca4 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:06:58 +0200 Subject: [PATCH 05/52] typo (#7807) --- src/datasets/packaged_modules/parquet/parquet.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/datasets/packaged_modules/parquet/parquet.py b/src/datasets/packaged_modules/parquet/parquet.py index a583d31c50d..52a675d41c7 100644 --- a/src/datasets/packaged_modules/parquet/parquet.py +++ b/src/datasets/packaged_modules/parquet/parquet.py @@ -37,7 +37,7 @@ class ParquetConfig(datasets.BuilderConfig): This is especially useful to configure buffering and caching. - on_bad_file (`Literal["error", "warn", "skip"]`, *optional*, defaults to "error") + on_bad_files (`Literal["error", "warn", "skip"]`, *optional*, defaults to "error") Specify what to do upon encountering a bad file (a file that can't be read). Allowed values are : * 'error', raise an Exception when a bad file is encountered. * 'warn', raise a warning when a bad file is encountered and skip that file. 
@@ -81,7 +81,7 @@ class ParquetConfig(datasets.BuilderConfig): features: Optional[datasets.Features] = None filters: Optional[Union[ds.Expression, list[tuple], list[list[tuple]]]] = None fragment_scan_options: Optional[ds.ParquetFragmentScanOptions] = None - on_bad_file: Literal["error", "warn", "skip"] = "error" + on_bad_files: Literal["error", "warn", "skip"] = "error" def __post_init__(self): super().__post_init__() @@ -122,10 +122,10 @@ def _split_generators(self, dl_manager): self.info.features = datasets.Features.from_arrow_schema(pq.read_schema(f)) break except pa.ArrowInvalid as e: - if self.config.on_bad_file == "error": + if self.config.on_bad_files == "error": logger.error(f"Failed to read schema from '{file}' with error {type(e).__name__}: {e}") raise - elif self.config.on_bad_file == "warn": + elif self.config.on_bad_files == "warn": logger.warning(f"Skipping bad schema from '{file}'. {type(e).__name__}: {e}`") else: logger.debug(f"Skipping bad schema from '{file}'. {type(e).__name__}: {e}`") @@ -180,10 +180,10 @@ def _generate_tables(self, files): # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows))) yield f"{file_idx}_{batch_idx}", self._cast_table(pa_table) except (pa.ArrowInvalid, ValueError) as e: - if self.config.on_bad_file == "error": + if self.config.on_bad_files == "error": logger.error(f"Failed to read file '{file}' with error {type(e).__name__}: {e}") raise - elif self.config.on_bad_file == "warn": + elif self.config.on_bad_files == "warn": logger.warning(f"Skipping bad file '{file}'. {type(e).__name__}: {e}`") else: logger.debug(f"Skipping bad file '{file}'. {type(e).__name__}: {e}`") From 7e1350b41d19eabc961e7de9e26f1aa09f18e110 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:11:07 +0200 Subject: [PATCH 06/52] release: 4.2.0 (#7808) --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0dee50b1f42..9ef73c44590 100644 --- a/setup.py +++ b/setup.py @@ -228,7 +228,7 @@ setup( name="datasets", - version="4.1.2.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.2.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index 77f14553a3e..2d94e71b34a 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "4.1.2.dev0" +__version__ = "4.2.0" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From f25661f5993a74108d4214ab08df94205e8321d2 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:19:31 +0200 Subject: [PATCH 07/52] Set dev version (#7809) set dev version --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9ef73c44590..8641202a908 100644 --- a/setup.py +++ b/setup.py @@ -228,7 +228,7 @@ setup( name="datasets", - version="4.2.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.2.1.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index 2d94e71b34a..f5b5a8c9892 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "4.2.0" +__version__ = "4.2.1.dev0" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From 88d53e2a46f4e73c5a9445c4661624e8cb96c04e Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:35:14 +0200 Subject: [PATCH 08/52] fix conda deps (#7810) --- .github/conda/meta.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/conda/meta.yaml b/.github/conda/meta.yaml index 4b4a416d220..59a16cda78b 100644 --- a/.github/conda/meta.yaml +++ b/.github/conda/meta.yaml @@ -20,6 +20,7 @@ requirements: - dill - pandas - requests >=2.19.0 + - httpx <1.0.0 - tqdm >=4.66.3 - dataclasses - multiprocess @@ -35,6 +36,7 @@ requirements: - dill - pandas - requests >=2.19.0 + - httpx <1.0.0 - tqdm >=4.66.3 - dataclasses - multiprocess From 63c933ae552fdd7c535e6930d03d291ebc6c1d10 Mon Sep 17 00:00:00 2001 From: Marko Bakovic Date: Fri, 10 Oct 2025 17:04:20 +0100 Subject: [PATCH 09/52] Add pyarrow's binary view to features (#7795) This commit will be squashed. 
--- src/datasets/features/features.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index dbc3818e224..54d84ef33e2 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -106,6 +106,8 @@ def _arrow_to_datasets_dtype(arrow_type: pa.DataType) -> str: return "binary" elif pyarrow.types.is_large_binary(arrow_type): return "large_binary" + elif pyarrow.types.is_binary_view(arrow_type): + return "binary_view" elif pyarrow.types.is_string(arrow_type): return "string" elif pyarrow.types.is_large_string(arrow_type): @@ -508,6 +510,7 @@ class Value: - `decimal256(precision, scale)` - `binary` - `large_binary` + - `binary_view` - `string` - `large_string` - `string_view` From aa7f2a9aee0dc8e3636c69be10d13fd0b00d7e71 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:39:47 +0200 Subject: [PATCH 10/52] Fix polars cast column image (#7800) * fix polars cast_column issue * remove debug statements * cast large_strings to string for image handling --- src/datasets/features/image.py | 10 ++++++++++ tests/features/test_image.py | 12 ++++++++++++ tests/test_download_manager.py | 2 +- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/datasets/features/image.py b/src/datasets/features/image.py index fecc2fc5ccd..cb746b9219d 100644 --- a/src/datasets/features/image.py +++ b/src/datasets/features/image.py @@ -215,6 +215,7 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr The Arrow types that can be converted to the Image pyarrow storage type are: - `pa.string()` - it must contain the "path" data + - `pa.large_string()` - it must contain the "path" data (will be cast to string if possible) - `pa.binary()` - it must contain the image bytes - `pa.struct({"bytes": pa.binary()})` - `pa.struct({"path": pa.string()})` @@ -229,6 +230,15 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr `pa.StructArray`: Array in the Image arrow storage type, that is `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. """ + if pa.types.is_large_string(storage.type): + try: + storage = storage.cast(pa.string()) + except pa.ArrowInvalid as e: + raise ValueError( + f"Failed to cast large_string to string for Image feature. " + f"This can happen if string values exceed 2GB. 
" + f"Original error: {e}" + ) from e if pa.types.is_string(storage.type): bytes_array = pa.array([None] * len(storage), type=pa.binary()) storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null()) diff --git a/tests/features/test_image.py b/tests/features/test_image.py index 68e6f4b91cc..136b7ee9f6b 100644 --- a/tests/features/test_image.py +++ b/tests/features/test_image.py @@ -320,6 +320,18 @@ def test_dataset_cast_to_image_features(shared_datadir, build_data): assert isinstance(item["image"], PIL.Image.Image) +def test_dataset_cast_to_image_features_polars(shared_datadir): + import PIL.Image + + pl = pytest.importorskip("polars") + image_path = str(shared_datadir / "test_image_rgb.jpg") + df = pl.DataFrame({"image_path": [image_path]}) + dataset = Dataset.from_polars(df) + item = dataset.cast_column("image_path", Image())[0] + assert item.keys() == {"image_path"} + assert isinstance(item["image_path"], PIL.Image.Image) + + @require_pil def test_dataset_concatenate_image_features(shared_datadir): # we use a different data structure between 1 and 2 to make sure they are compatible with each other diff --git a/tests/test_download_manager.py b/tests/test_download_manager.py index 08eb77366c1..457bd9de49b 100644 --- a/tests/test_download_manager.py +++ b/tests/test_download_manager.py @@ -131,7 +131,7 @@ def test_download_manager_delete_extracted_files(xz_file): assert extracted_path == dl_manager.extracted_paths[xz_file] extracted_path = Path(extracted_path) parts = extracted_path.parts - # import pdb; pdb.set_trace() + assert parts[-1] == hash_url_to_filename(str(xz_file), etag=None) assert parts[-2] == extracted_subdir assert extracted_path.exists() From 3e13d30823f8ec498d56adbc18c6880a5463b313 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Mon, 13 Oct 2025 17:28:49 +0200 Subject: [PATCH 11/52] Allow streaming hdf5 files (#7814) allow streaming hdf5 files --- src/datasets/packaged_modules/hdf5/hdf5.py | 34 ++++++++++++---------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/datasets/packaged_modules/hdf5/hdf5.py b/src/datasets/packaged_modules/hdf5/hdf5.py index fb9100e1a0a..1b0e80aa6a8 100644 --- a/src/datasets/packaged_modules/hdf5/hdf5.py +++ b/src/datasets/packaged_modules/hdf5/hdf5.py @@ -61,8 +61,9 @@ def _split_generators(self, dl_manager): # Infer features from first file if self.info.features is None: for first_file in itertools.chain.from_iterable(files): - with h5py.File(first_file, "r") as h5: - self.info.features = _recursive_infer_features(h5) + with open(first_file, "rb") as f: + with h5py.File(f, "r") as h5: + self.info.features = _recursive_infer_features(h5) break splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files})) return splits @@ -73,22 +74,23 @@ def _generate_tables(self, files): batch_size_cfg = self.config.batch_size for file_idx, file in enumerate(itertools.chain.from_iterable(files)): try: - with h5py.File(file, "r") as h5: - # Infer features and lengths from first file - if self.info.features is None: - self.info.features = _recursive_infer_features(h5) - num_rows = _check_dataset_lengths(h5, self.info.features) - if num_rows is None: - logger.warning(f"File {file} contains no data, skipping...") - continue - effective_batch = batch_size_cfg or self._writer_batch_size or num_rows - for start in range(0, num_rows, effective_batch): - end = min(start + effective_batch, num_rows) - pa_table = _recursive_load_arrays(h5, 
self.info.features, start, end) - if pa_table is None: + with open(file, "rb") as f: + with h5py.File(f, "r") as h5: + # Infer features and lengths from first file + if self.info.features is None: + self.info.features = _recursive_infer_features(h5) + num_rows = _check_dataset_lengths(h5, self.info.features) + if num_rows is None: logger.warning(f"File {file} contains no data, skipping...") continue - yield f"{file_idx}_{start}", cast_table_to_features(pa_table, self.info.features) + effective_batch = batch_size_cfg or self._writer_batch_size or num_rows + for start in range(0, num_rows, effective_batch): + end = min(start + effective_batch, num_rows) + pa_table = _recursive_load_arrays(h5, self.info.features, start, end) + if pa_table is None: + logger.warning(f"File {file} contains no data, skipping...") + continue + yield f"{file_idx}_{start}", cast_table_to_features(pa_table, self.info.features) except ValueError as e: logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") raise From 12f5acab3546abaa0e58b7d67740d5dcfdcb2bf6 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 17 Oct 2025 11:51:35 +0200 Subject: [PATCH 12/52] Retry open hf file (#7822) retry open hf file --- src/datasets/config.py | 2 + src/datasets/utils/file_utils.py | 64 ++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/datasets/config.py b/src/datasets/config.py index 908befa8c69..5e61e7bc015 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -247,6 +247,8 @@ # Streaming STREAMING_READ_MAX_RETRIES = 20 STREAMING_READ_RETRY_INTERVAL = 5 +STREAMING_OPEN_MAX_RETRIES = 20 +STREAMING_OPEN_RETRY_INTERVAL = 5 # Datasets repositories exploration DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200 diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 7a07f8cd267..01335a0c0dc 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -61,6 +61,14 @@ class _AiohttpClientError(Exception): T = TypeVar("T", str, Path) +CONNECTION_ERRORS_TO_RETRY = ( + _AiohttpClientError, + asyncio.TimeoutError, + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + httpx.RequestError, +) + def is_remote_url(url_or_filename: str) -> bool: return urlparse(url_or_filename).scheme != "" and not os.path.ismount(urlparse(url_or_filename).scheme + ":/") @@ -813,13 +821,7 @@ def read_with_retries(*args, **kwargs): try: out = read(*args, **kwargs) break - except ( - _AiohttpClientError, - asyncio.TimeoutError, - requests.exceptions.ConnectionError, - requests.exceptions.Timeout, - httpx.RequestError, - ) as err: + except CONNECTION_ERRORS_TO_RETRY as err: disconnect_err = err logger.warning( f"Got disconnected from remote data host. Retrying in {config.STREAMING_READ_RETRY_INTERVAL}sec [{retry}/{max_retries}]" @@ -930,23 +932,37 @@ def xopen(file: str, mode="r", *args, download_config: Optional[DownloadConfig] # add headers and cookies for authentication on the HF Hub and for Google Drive file, storage_options = _prepare_path_and_storage_options(file_str, download_config=download_config) kwargs = {**kwargs, **(storage_options or {})} - try: - file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open() - except ValueError as e: - if str(e) == "Cannot seek streaming HTTP file": - raise NonStreamableDatasetError( - "Streaming is not possible for this dataset because data host server doesn't support HTTP range " - "requests. 
You can still load this dataset in non-streaming mode by passing `streaming=False` (default)" - ) from e - else: - raise - except FileNotFoundError: - if file.startswith(config.HF_ENDPOINT): - raise FileNotFoundError( - file + "\nIf the repo is private or gated, make sure to log in with `huggingface-cli login`." - ) from None - else: - raise + + max_retries = config.STREAMING_OPEN_MAX_RETRIES + + disconnect_err = None + for retry in range(1, max_retries + 1): + try: + file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open() + break + except CONNECTION_ERRORS_TO_RETRY as err: + disconnect_err = err + logger.warning( + f"Failed to connect to remote data host. Retrying in {config.STREAMING_OPEN_RETRY_INTERVAL}sec [{retry}/{max_retries}]" + ) + time.sleep(config.STREAMING_OPEN_RETRY_INTERVAL) + except ValueError as e: + if str(e) == "Cannot seek streaming HTTP file": + raise NonStreamableDatasetError( + "Streaming is not possible for this dataset because data host server doesn't support HTTP range " + "requests. You can still load this dataset in non-streaming mode by passing `streaming=False` (default)" + ) from e + else: + raise + except FileNotFoundError: + if file.startswith(config.HF_ENDPOINT): + raise FileNotFoundError( + file + "\nIf the repo is private or gated, make sure to log in with `huggingface-cli login`." + ) from None + else: + raise + else: + raise ConnectionError("Server Disconnected") from disconnect_err file_obj = _add_retries_to_file_obj_read_method(file_obj) return file_obj From 0b2a4c2dbf6a556ecca174a5ec2ecddbfbce2d86 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 17 Oct 2025 11:59:16 +0200 Subject: [PATCH 13/52] Keep hffs cache in workers when streaming (#7820) * keep hffs cache in workers when streaming * bonus: reorder hffs args to improve caching --- src/datasets/download/download_config.py | 2 +- src/datasets/iterable_dataset.py | 13 ++++++++++++- src/datasets/utils/file_utils.py | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/datasets/download/download_config.py b/src/datasets/download/download_config.py index b9e539fb053..6efae72b671 100644 --- a/src/datasets/download/download_config.py +++ b/src/datasets/download/download_config.py @@ -75,7 +75,7 @@ def copy(self) -> "DownloadConfig": def __setattr__(self, name, value): if name == "token" and getattr(self, "storage_options", None) is not None: if "hf" not in self.storage_options: - self.storage_options["hf"] = {"token": value, "endpoint": config.HF_ENDPOINT} + self.storage_options["hf"] = {"endpoint": config.HF_ENDPOINT, "token": value} elif getattr(self.storage_options["hf"], "token", None) is None: self.storage_options["hf"]["token"] = value super().__setattr__(name, value) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 7d16baa7d0d..e438b901694 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -26,7 +26,15 @@ import pandas as pd import pyarrow as pa import pyarrow.parquet as pq -from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi +from huggingface_hub import ( + CommitInfo, + CommitOperationAdd, + CommitOperationDelete, + DatasetCard, + DatasetCardData, + HfApi, + HfFileSystem, +) from huggingface_hub.hf_api import RepoFile from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError from multiprocess import Pool @@ -2151,6 +2159,7 @@ def __init__( 
self._token_per_repo_id: dict[str, Union[str, bool, None]] = token_per_repo_id or {} self._epoch: Union[int, "torch.Tensor"] = _maybe_share_with_torch_persistent_workers(0) self._starting_state_dict: Optional[dict] = None + self.__hffs_cache = HfFileSystem._cache # keep the cache on pickling (e.g. for dataloader workers) self._prepare_ex_iterable_for_iteration() # set state_dict _maybe_add_torch_iterable_dataset_parent_class(self.__class__) # subclass of torch IterableDataset @@ -2299,6 +2308,8 @@ def __setstate__(self, d): self.__dict__ = d # Re-add torch shared memory, since shared memory is not always kept when pickling self._epoch = _maybe_share_with_torch_persistent_workers(self._epoch) + # Re-add the cache to keep on pickling (e.g. for dataloader workers) + self.__hffs_cache = HfFileSystem._cache # Re-add torch iterable dataset as a parent class, since dynamically added parent classes are not kept when pickling _maybe_add_torch_iterable_dataset_parent_class(self.__class__) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 01335a0c0dc..37d79640d3c 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -897,8 +897,8 @@ def _prepare_single_hop_path_and_storage_options( storage_options["headers"] = {"Accept-Encoding": "identity", **headers} elif protocol == "hf": storage_options = { - "token": token, "endpoint": config.HF_ENDPOINT, + "token": token, **storage_options, } if storage_options: From 74c7154db6ca6fd714f8977240c4d2e993217ec2 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 20 Oct 2025 15:49:24 +0200 Subject: [PATCH 14/52] Fix batch_size default description in to_polars docstrings (#7824) --- src/datasets/arrow_dataset.py | 2 +- src/datasets/iterable_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c46733e71ee..0836a04ebe9 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -5189,7 +5189,7 @@ def to_polars( Args: batch_size (`int`, *optional*): The size (number of rows) of the batches if `batched` is `True`. - Defaults to `genomicsml.datasets.config.DEFAULT_MAX_BATCH_SIZE`. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. batched (`bool`): Set to `True` to return a generator that yields the dataset as batches of `batch_size` rows. Defaults to `False` (returns the whole datasets once). diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index e438b901694..5c60567cc17 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -3671,7 +3671,7 @@ def to_polars( Args: batch_size (`int`, *optional*): The size (number of rows) of the batches if `batched` is `True`. - Defaults to `genomicsml.datasets.config.DEFAULT_MAX_BATCH_SIZE`. + Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`. batched (`bool`): Set to `True` to return a generator that yields the dataset as batches of `batch_size` rows. Defaults to `False` (returns the whole datasets once). 
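As a usage note for the docstring fixed above, a small sketch of batched `to_polars` (illustrative, assumes `polars` is installed):

```python
from datasets import Dataset

ds = Dataset.from_dict({"x": list(range(10))})

# With batched=True, to_polars yields polars DataFrames of batch_size rows;
# when batch_size is None it falls back to datasets.config.DEFAULT_MAX_BATCH_SIZE.
for df in ds.to_polars(batched=True, batch_size=4):
    print(df.shape)  # (4, 1), (4, 1), then (2, 1)
```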
From fb445ff7979bcd5619de8aba2a32ba7ce3582a4e Mon Sep 17 00:00:00 2001 From: Ethan Knights Date: Mon, 20 Oct 2025 15:03:52 +0100 Subject: [PATCH 15/52] docs: document_dataset PDFs & OCR (#7812) * Update document_dataset.mdx * Update document_dataset.mdx OCR --- docs/source/document_dataset.mdx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/document_dataset.mdx b/docs/source/document_dataset.mdx index 30cc1bd3121..bc2a8a229ef 100644 --- a/docs/source/document_dataset.mdx +++ b/docs/source/document_dataset.mdx @@ -1,13 +1,13 @@ # Create a document dataset -This guide will show you how to create a document dataset with `PdfFolder` and some metadata. This is a no-code solution for quickly creating a document dataset with several thousand pdfs. +This guide will show you how to create a document dataset with `PdfFolder` and some metadata. This is a no-code solution for quickly creating a document dataset with several thousand PDFs. > [!TIP] > You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information about how to enable this feature on the Hub. ## PdfFolder -The `PdfFolder` is a dataset builder designed to quickly load a document dataset with several thousand pdfs without requiring you to write any code. +The `PdfFolder` is a dataset builder designed to quickly load a document dataset with several thousand PDFs without requiring you to write any code. > [!TIP] > 💡 Take a look at the [Split pattern hierarchy](repository_structure#split-pattern-hierarchy) to learn more about how `PdfFolder` creates dataset splits based on your dataset repository structure. @@ -72,14 +72,14 @@ file_name,additional_feature or using `metadata.jsonl`: ```jsonl -{"file_name": "0001.pdf", "additional_feature": "This is a first value of a text feature you added to your pdfs"} -{"file_name": "0002.pdf", "additional_feature": "This is a second value of a text feature you added to your pdfs"} -{"file_name": "0003.pdf", "additional_feature": "This is a third value of a text feature you added to your pdfs"} +{"file_name": "0001.pdf", "additional_feature": "This is a first value of a text feature you added to your PDFs"} +{"file_name": "0002.pdf", "additional_feature": "This is a second value of a text feature you added to your PDFs"} +{"file_name": "0003.pdf", "additional_feature": "This is a third value of a text feature you added to your PDFs"} ``` Here the `file_name` must be the name of the PDF file next to the metadata file. More generally, it must be the relative path from the directory containing the metadata to the PDF file. -It's possible to point to more than one pdf in each row in your dataset, for example if both your input and output are pdfs: +It's possible to point to more than one PDF in each row in your dataset, for example if both your input and output are pdfs: ```jsonl {"input_file_name": "0001.pdf", "output_file_name": "0001_output.pdf"} @@ -87,7 +87,7 @@ It's possible to point to more than one pdf in each row in your dataset, for exa {"input_file_name": "0003.pdf", "output_file_name": "0003_output.pdf"} ``` -You can also define lists of pdfs. In that case you need to name the field `file_names` or `*_file_names`. Here is an example: +You can also define lists of PDFs. In that case you need to name the field `file_names` or `*_file_names`. 
Here is an example: ```jsonl {"pdfs_file_names": ["0001_part1.pdf", "0001_part2.pdf"], "label": "urgent"} @@ -95,9 +95,9 @@ You can also define lists of pdfs. In that case you need to name the field `file {"pdfs_file_names": ["0003_part1.pdf", "0002_part2.pdf"], "label": "normal"} ``` -### OCR (Optical character recognition) +### OCR (Optical Character Recognition) -OCR datasets have the text contained in a pdf. An example `metadata.csv` may look like: +OCR datasets have the text contained in a PDF. An example `metadata.csv` may look like: ```csv file_name,text @@ -106,7 +106,7 @@ file_name,text 0003.pdf,Attention is all you need. Abstract. The ... ``` -Load the dataset with `PdfFolder`, and it will create a `text` column for the pdf captions: +Load the dataset with `PdfFolder`, and it will create a `text` column for the PDF captions: ```py >>> dataset = load_dataset("pdffolder", data_dir="/path/to/folder", split="train") From d10e8468b442438f524d7045dd6c817146a4dee7 Mon Sep 17 00:00:00 2001 From: Mikhail Moskovchenko <43753582+simonreise@users.noreply.github.com> Date: Thu, 23 Oct 2025 20:25:57 +0400 Subject: [PATCH 16/52] Add custom fingerprint support to `from_generator` (#7533) * Add custom suffix support to from_generator * Renamed a new arg to fingerprint * Changed name to config_id in builder * Change version * Added a test * Version update * Update version * Update tests/test_arrow_dataset.py * Rename config_id to fingerprint in generator.py * Apply suggestions from code review * Update src/datasets/io/generator.py * Apply suggestions from code review --------- Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> --- src/datasets/arrow_dataset.py | 8 ++++++++ src/datasets/builder.py | 13 ++++++++----- src/datasets/io/generator.py | 5 +++++ tests/test_arrow_dataset.py | 10 ++++++++++ 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 0836a04ebe9..43301d23041 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1120,6 +1120,7 @@ def from_generator( gen_kwargs: Optional[dict] = None, num_proc: Optional[int] = None, split: NamedSplit = Split.TRAIN, + fingerprint: Optional[str] = None, **kwargs, ): """Create a Dataset from a generator. @@ -1146,6 +1147,12 @@ def from_generator( Split name to be assigned to the dataset. + fingerprint (`str`, *optional*): + Fingerprint that will be used to generate dataset ID. + By default `fingerprint` is generated by hashing the generator function and all the args which can be slow + if it uses large objects like AI models. + + **kwargs (additional keyword arguments): Keyword arguments to be passed to :[`GeneratorConfig`]. 
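An illustrative sketch of the `fingerprint` argument described above (not part of the diff; the generator is a placeholder):

```python
from datasets import Dataset

def gen():
    # placeholder generator; in practice it might close over a large object
    # such as a model, making the default hash-based fingerprint slow to compute
    yield {"text": "hello"}
    yield {"text": "world"}

# An explicit fingerprint skips hashing the generator and its arguments
ds = Dataset.from_generator(gen, fingerprint="my-fixed-fingerprint")
print(ds._fingerprint)  # expected: "my-fixed-fingerprint"
```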
@@ -1183,6 +1190,7 @@ def from_generator( gen_kwargs=gen_kwargs, num_proc=num_proc, split=split, + fingerprint=fingerprint, **kwargs, ).read() diff --git a/src/datasets/builder.py b/src/datasets/builder.py index e63960dcabf..b88aa0bf8f9 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -313,6 +313,7 @@ def __init__( data_dir: Optional[str] = None, storage_options: Optional[dict] = None, writer_batch_size: Optional[int] = None, + config_id: Optional[str] = None, **config_kwargs, ): # DatasetBuilder name @@ -343,6 +344,7 @@ def __init__( self.config, self.config_id = self._create_builder_config( config_name=config_name, custom_features=features, + config_id=config_id, **config_kwargs, ) @@ -502,7 +504,7 @@ def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> st return legacy_relative_data_dir def _create_builder_config( - self, config_name=None, custom_features=None, **config_kwargs + self, config_name=None, custom_features=None, config_id=None, **config_kwargs ) -> tuple[BuilderConfig, str]: """Create and validate BuilderConfig object as well as a unique config id for this config. Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None. @@ -570,10 +572,11 @@ def _create_builder_config( ) # compute the config id that is going to be used for caching - config_id = builder_config.create_config_id( - config_kwargs, - custom_features=custom_features, - ) + if config_id is None: + config_id = builder_config.create_config_id( + config_kwargs, + custom_features=custom_features, + ) is_custom = (config_id not in self.builder_configs) and config_id != "default" if is_custom: logger.info(f"Using custom data configuration {config_id}") diff --git a/src/datasets/io/generator.py b/src/datasets/io/generator.py index b10609cac23..6c1eaee9b0f 100644 --- a/src/datasets/io/generator.py +++ b/src/datasets/io/generator.py @@ -16,6 +16,7 @@ def __init__( gen_kwargs: Optional[dict] = None, num_proc: Optional[int] = None, split: NamedSplit = Split.TRAIN, + fingerprint: Optional[str] = None, **kwargs, ): super().__init__( @@ -32,8 +33,10 @@ def __init__( generator=generator, gen_kwargs=gen_kwargs, split=split, + config_id="default-fingerprint=" + fingerprint if fingerprint else None, **kwargs, ) + self.fingerprint = fingerprint def read(self): # Build iterable dataset @@ -56,4 +59,6 @@ def read(self): dataset = self.builder.as_dataset( split=self.builder.config.split, verification_mode=verification_mode, in_memory=self.keep_in_memory ) + if self.fingerprint: + dataset._fingerprint = self.fingerprint return dataset diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 4661e8c6dd7..8e76952d6ca 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -4114,6 +4114,16 @@ def test_dataset_from_generator_split(split, data_generator, tmp_path): _check_generator_dataset(dataset, expected_features, expected_split) +@pytest.mark.parametrize("fingerprint", [None, "test-dataset"]) +def test_dataset_from_generator_fingerprint(fingerprint, data_generator, tmp_path): + cache_dir = tmp_path / "cache" + expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"} + dataset = Dataset.from_generator(data_generator, cache_dir=cache_dir, fingerprint=fingerprint) + _check_generator_dataset(dataset, expected_features, NamedSplit("train")) + if fingerprint: + assert dataset._fingerprint == fingerprint + + @require_not_windows @require_dill_gt_0_3_2 @require_pyspark From 
93326496dc4f67d28e521f29d91277a5118baecd Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 23 Oct 2025 18:29:06 +0200 Subject: [PATCH 17/52] picklable batch_fn (#7826) --- src/datasets/iterable_dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 5c60567cc17..7f53bc7372a 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -3581,15 +3581,12 @@ def batch(self, batch_size: int, drop_last_batch: bool = False) -> "IterableData ``` """ - def batch_fn(unbatched): - return {k: [v] for k, v in unbatched.items()} - if self.features: features = Features({col: List(feature) for col, feature in self.features.items()}) else: features = None return self.map( - batch_fn, batched=True, batch_size=batch_size, drop_last_batch=drop_last_batch, features=features + _batch_fn, batched=True, batch_size=batch_size, drop_last_batch=drop_last_batch, features=features ) def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Union[dict, Iterator[dict]]: @@ -4659,3 +4656,7 @@ async def _apply_async(pool, func, x): return future.get() else: await asyncio.sleep(0) + + +def _batch_fn(unbatched): + return {k: [v] for k, v in unbatched.items()} From 41c05299348a499807432ab476e1cdc4143c8772 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 23 Oct 2025 18:31:56 +0200 Subject: [PATCH 18/52] release: 4.3.0 (#7827) --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8641202a908..ab95c8bbd03 100644 --- a/setup.py +++ b/setup.py @@ -228,7 +228,7 @@ setup( name="datasets", - version="4.2.1.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.3.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index f5b5a8c9892..2beccf63bda 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "4.2.1.dev0" +__version__ = "4.3.0" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From 159a6451f103fb886b90c5ff1457a8f5c943b783 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Thu, 23 Oct 2025 18:34:50 +0200 Subject: [PATCH 19/52] set dev version (#7828) --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ab95c8bbd03..8dc56ef518d 100644 --- a/setup.py +++ b/setup.py @@ -228,7 +228,7 @@ setup( name="datasets", - version="4.3.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.3.1.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index 2beccf63bda..afa6bf0c9c0 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "4.3.0" +__version__ = "4.3.1.dev0" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From 51388765829087b7c7dfea3eb688571865e23ba1 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Fri, 24 Oct 2025 15:22:17 +0200 Subject: [PATCH 20/52] Add nifti support (#7815) * Add nifti support * update docs * update nifti after testing locally and from remote hub * update setup.py to add nibabel and update docs * add nifti_dataset * fix nifti dataset documentation * add nibabel to test dependency * Add section for creating a medical imaging dataset --------- Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> --- docs/source/_toctree.yml | 2 + docs/source/nifti_dataset.mdx | 130 ++++++++++ .../package_reference/loading_methods.mdx | 6 + .../source/package_reference/main_classes.mdx | 4 + setup.py | 4 + src/datasets/config.py | 1 + src/datasets/features/__init__.py | 2 + src/datasets/features/features.py | 6 + src/datasets/features/nifti.py | 243 ++++++++++++++++++ src/datasets/packaged_modules/__init__.py | 5 + .../packaged_modules/niftifolder/__init__.py | 0 .../niftifolder/niftifolder.py | 23 ++ tests/features/data/test_nifti.nii | Bin 0 -> 128352 bytes tests/features/data/test_nifti.nii.gz | Bin 0 -> 33209 bytes tests/features/test_nifti.py | 91 +++++++ tests/utils.py | 12 + 16 files changed, 529 insertions(+) create mode 100644 docs/source/nifti_dataset.mdx create mode 100644 src/datasets/features/nifti.py create mode 100644 src/datasets/packaged_modules/niftifolder/__init__.py create mode 100644 src/datasets/packaged_modules/niftifolder/niftifolder.py create mode 100644 tests/features/data/test_nifti.nii create mode 100644 tests/features/data/test_nifti.nii.gz create mode 100644 tests/features/test_nifti.py diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 861925a7d99..cc6b7195fe2 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -88,6 +88,8 @@ title: Load document data - local: document_dataset title: Create a document dataset + - local: nifti_dataset + title: Create a medical imaging dataset title: "Vision" - sections: - local: nlp_load diff --git 
a/docs/source/nifti_dataset.mdx b/docs/source/nifti_dataset.mdx new file mode 100644 index 00000000000..2770460fbaf --- /dev/null +++ b/docs/source/nifti_dataset.mdx @@ -0,0 +1,130 @@ +# Create a NIfTI dataset + +This page shows how to create and share a dataset of medical images in NIfTI format (.nii / .nii.gz) using the `datasets` library. + +You can share a dataset with your team or with anyone in the community by creating a dataset repository on the Hugging Face Hub: + +```py +from datasets import load_dataset + +dataset = load_dataset("/my_nifti_dataset") +``` + +There are two common ways to create a NIfTI dataset: + +- Create a dataset from local NIfTI files in Python and upload it with `Dataset.push_to_hub`. +- Use a folder-based convention (one file per example) and a small helper to convert it into a `Dataset`. + +> [!TIP] +> You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information. + +## Local files + +If you already have a list of file paths to NIfTI files, the easiest workflow is to create a `Dataset` from that list and cast the column to the `Nifti` feature. + +```py +from datasets import Dataset +from datasets import Nifti + +# simple example: create a dataset from file paths +files = ["/path/to/scan_001.nii.gz", "/path/to/scan_002.nii.gz"] +ds = Dataset.from_dict({"nifti": files}).cast_column("nifti", Nifti()) + +# access a decoded nibabel image (if decode=True) +# ds[0]["nifti"] will be a nibabel.Nifti1Image object when decode=True +# or a dict {'bytes': None, 'path': '...'} when decode=False +``` + +The `Nifti` feature supports a `decode` parameter. When `decode=True` (the default), it loads the NIfTI file into a `nibabel.nifti1.Nifti1Image` object. You can access the image data as a numpy array with `img.get_fdata()`. When `decode=False`, it returns a dict with the file path and bytes. + +```py +from datasets import Dataset, Nifti + +ds = Dataset.from_dict({"nifti": ["/path/to/scan.nii.gz"]}).cast_column("nifti", Nifti(decode=True)) +img = ds[0]["nifti"] # instance of: nibabel.nifti1.Nifti1Image +arr = img.get_fdata() +``` + +After preparing the dataset you can push it to the Hub: + +```py +ds.push_to_hub("/my_nifti_dataset") +``` + +This will create a dataset repository containing your NIfTI dataset with a `data/` folder of parquet shards. + +## Folder conventions and metadata + +If you organize your dataset in folders you can create splits automatically (train/test/validation) by following a structure like: + +``` +dataset/train/scan_0001.nii +dataset/train/scan_0002.nii +dataset/validation/scan_1001.nii +dataset/test/scan_2001.nii +``` + +If you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` in the folder so files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field with the relative path to the NIfTI file next to the metadata file. + +Example `metadata.csv`: + +```csv +file_name,patient_id,age,diagnosis +scan_0001.nii.gz,P001,45,healthy +scan_0002.nii.gz,P002,59,disease_x +``` + +The `Nifti` feature works with zipped datasets too — each zip can contain NIfTI files and a metadata file. This is useful when uploading large datasets as archives. 
+This means your dataset structure could look like this (mixed compressed and uncompressed files): +``` +dataset/train/scan_0001.nii.gz +dataset/train/scan_0002.nii +dataset/validation/scan_1001.nii.gz +dataset/test/scan_2001.nii +``` + +## Converting to PyTorch tensors + +Use the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset: + +```py +import torch +import nibabel +import numpy as np + +def transform_to_pytorch(example): + example["nifti_torch"] = [torch.tensor(ex.get_fdata()) for ex in example["nifti"]] + return example + +ds.set_transform(transform_to_pytorch) + +``` +Accessing elements now (e.g. `ds[0]`) will yield torch tensors in the `"nifti_torch"` key. + + +## Usage of NifTI1Image + +NifTI is a format to store the result of 3 (or even 4) dimensional brain scans. This includes 3 spatial dimensions (x,y,z) +and optionally a time dimension (t). Furthermore, the given positions here are only relative to the scanner, therefore +the dimensions (4, 5, 6) are used to lift this to real world coordinates. + +You can visualize nifti files for instance leveraging `matplotlib` as follows: +```python +import matplotlib.pyplot as plt +from datasets import load_dataset + +def show_slices(slices): + """ Function to display row of image slices """ + fig, axes = plt.subplots(1, len(slices)) + for i, slice in enumerate(slices): + axes[i].imshow(slice.T, cmap="gray", origin="lower") + +nifti_ds = load_dataset("/my_nifti_dataset") +for epi_img in nifti_ds: + nifti_img = epi_img["nifti"].get_fdata() + show_slices([nifti_img[:, :, 16], nifti_img[26, :, :], nifti_img[:, 30, :]]) + plt.show() +``` + +For further reading we refer to the [nibabel documentation](https://nipy.org/nibabel/index.html) and especially [this nibabel tutorial](https://nipy.org/nibabel/coordinate_systems.html) +--- diff --git a/docs/source/package_reference/loading_methods.mdx b/docs/source/package_reference/loading_methods.mdx index 786679636e7..4792d1b88f7 100644 --- a/docs/source/package_reference/loading_methods.mdx +++ b/docs/source/package_reference/loading_methods.mdx @@ -103,6 +103,12 @@ load_dataset("csv", data_dir="path/to/data/dir", sep="\t") [[autodoc]] datasets.packaged_modules.pdffolder.PdfFolder +### Nifti + +[[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolderConfig + +[[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolder + ### WebDataset [[autodoc]] datasets.packaged_modules.webdataset.WebDataset diff --git a/docs/source/package_reference/main_classes.mdx b/docs/source/package_reference/main_classes.mdx index 299dd765d13..84e651f9171 100644 --- a/docs/source/package_reference/main_classes.mdx +++ b/docs/source/package_reference/main_classes.mdx @@ -271,6 +271,10 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable [[autodoc]] datasets.Pdf +### Nifti + +[[autodoc]] datasets.Nifti + ## Filesystems [[autodoc]] datasets.filesystems.is_remote_filesystem diff --git a/setup.py b/setup.py index 8dc56ef518d..06eee6717c8 100644 --- a/setup.py +++ b/setup.py @@ -186,6 +186,7 @@ "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced "torchcodec>=0.7.0", # minium version to get windows support + "nibabel>=5.3.1", ] NUMPY2_INCOMPATIBLE_LIBRARIES = [ @@ -207,6 +208,8 @@ PDFS_REQUIRE = ["pdfplumber>=0.11.4"] +NIBABEL_REQUIRE = ["nibabel>=5.3.2"] + EXTRAS_REQUIRE = { "audio": AUDIO_REQUIRE, "vision": VISION_REQUIRE, @@ -224,6 +227,7 @@ "benchmarks": BENCHMARKS_REQUIRE, "docs": DOCS_REQUIRE, "pdfs": 
PDFS_REQUIRE, + "nibabel": NIBABEL_REQUIRE, } setup( diff --git a/src/datasets/config.py b/src/datasets/config.py index 5e61e7bc015..3d3f12b008d 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -139,6 +139,7 @@ TORCHCODEC_AVAILABLE = importlib.util.find_spec("torchcodec") is not None TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None +NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None # Optional compression tools RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None diff --git a/src/datasets/features/__init__.py b/src/datasets/features/__init__.py index 36133ce5e5a..40a3568039a 100644 --- a/src/datasets/features/__init__.py +++ b/src/datasets/features/__init__.py @@ -15,10 +15,12 @@ "TranslationVariableLanguages", "Video", "Pdf", + "Nifti", ] from .audio import Audio from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value from .image import Image +from .nifti import Nifti from .pdf import Pdf from .translation import Translation, TranslationVariableLanguages from .video import Video diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index 54d84ef33e2..88259767ae0 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -42,6 +42,7 @@ from ..utils.py_utils import asdict, first_non_null_value, zip_dict from .audio import Audio from .image import Image, encode_pil_image +from .nifti import Nifti from .pdf import Pdf, encode_pdfplumber_pdf from .translation import Translation, TranslationVariableLanguages from .video import Video @@ -1270,6 +1271,7 @@ def __repr__(self): Image, Video, Pdf, + Nifti, ] @@ -1428,6 +1430,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni Image.__name__: Image, Video.__name__: Video, Pdf.__name__: Pdf, + Nifti.__name__: Nifti, } @@ -1761,6 +1764,9 @@ class Features(dict): - [`Pdf`] feature to store the absolute path to a PDF file, a `pdfplumber.pdf.PDF` object or a dictionary with the relative path to a PDF file ("path" key) and its bytes content ("bytes" key). This feature loads the PDF lazily with a PDF reader. + - [`Nifti`] feature to store the absolute path to a NIfTI neuroimaging file, a `nibabel.Nifti1Image` object + or a dictionary with the relative path to a NIfTI file ("path" key) and its bytes content ("bytes" key). + This feature loads the NIfTI file lazily with nibabel. - [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation. """ diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py new file mode 100644 index 00000000000..bac91e2af4b --- /dev/null +++ b/src/datasets/features/nifti.py @@ -0,0 +1,243 @@ +import os +from dataclasses import dataclass, field +from io import BytesIO +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union + +import pyarrow as pa + +from .. import config +from ..download.download_config import DownloadConfig +from ..table import array_cast +from ..utils.file_utils import is_local_path, xopen +from ..utils.py_utils import string_to_dict + + +if TYPE_CHECKING: + import nibabel as nib + + from .features import FeatureType + + +@dataclass +class Nifti: + """ + **Experimental.** + Nifti [`Feature`] to read NIfTI neuroimaging files. + + Input: The Nifti feature accepts as input: + - A `str`: Absolute path to the NIfTI file (i.e. 
random access is allowed).
+    - A `pathlib.Path`: path to the NIfTI file (i.e. random access is allowed).
+    - A `dict` with the keys:
+
+        - `path`: String with relative path of the NIfTI file in a dataset repository.
+        - `bytes`: Bytes of the NIfTI file.
+
+      This is useful for archived files with sequential access.
+
+    - A `nibabel` image object (e.g., `nibabel.nifti1.Nifti1Image`).
+
+    Args:
+        decode (`bool`, defaults to `True`):
+            Whether to decode the NIfTI data. If `False`, returns the underlying dictionary
+            in the format `{"path": nifti_path, "bytes": nifti_bytes}`.
+
+    Examples:
+
+    ```py
+    >>> from datasets import Dataset, Nifti
+    >>> ds = Dataset.from_dict({"nifti": ["path/to/file.nii.gz"]}).cast_column("nifti", Nifti())
+    >>> ds.features["nifti"]
+    Nifti(decode=True, id=None)
+    >>> ds[0]["nifti"]
+    <nibabel.nifti1.Nifti1Image object at 0x...>
+    >>> ds = ds.cast_column("nifti", Nifti(decode=False))
+    >>> ds[0]["nifti"]
+    {'bytes': None,
+     'path': 'path/to/file.nii.gz'}
+    ```
+    """
+
+    decode: bool = True
+    id: Optional[str] = field(default=None, repr=False)
+
+    # Automatically constructed
+    dtype: ClassVar[str] = "nibabel.nifti1.Nifti1Image"
+    pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
+    _type: str = field(default="Nifti", init=False, repr=False)
+
+    def __call__(self):
+        return self.pa_type
+
+    def encode_example(self, value: Union[str, bytes, bytearray, dict, "nib.Nifti1Image"]) -> dict:
+        """Encode example into a format for Arrow.
+
+        Args:
+            value (`str`, `bytes`, `nibabel.Nifti1Image` or `dict`):
+                Data passed as input to Nifti feature.
+
+        Returns:
+            `dict` with "path" and "bytes" fields
+        """
+        if config.NIBABEL_AVAILABLE:
+            import nibabel as nib
+        else:
+            nib = None
+
+        if isinstance(value, str):
+            return {"path": value, "bytes": None}
+        elif isinstance(value, Path):
+            return {"path": str(value.absolute()), "bytes": None}
+        elif isinstance(value, (bytes, bytearray)):
+            return {"path": None, "bytes": value}
+        elif nib is not None and isinstance(value, nib.spatialimages.SpatialImage):
+            # nibabel image object - try to get path or convert to bytes
+            return encode_nibabel_image(value)
+        elif isinstance(value, dict):
+            if value.get("path") is not None and os.path.isfile(value["path"]):
+                # we set "bytes": None to not duplicate the data if they're already available locally
+                return {"bytes": None, "path": value.get("path")}
+            elif value.get("bytes") is not None or value.get("path") is not None:
+                # store the nifti bytes, and path is used to infer the format using the file extension
+                return {"bytes": value.get("bytes"), "path": value.get("path")}
+            else:
+                raise ValueError(
+                    f"A nifti sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
+                )
+        else:
+            raise ValueError(
+                f"A nifti sample should be a string, bytes, Path, nibabel image, or dict, but got {type(value)}."
+            )
+
+    def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nifti1Image":
+        """Decode example NIfTI file into nibabel image object.
+
+        Args:
+            value (`str` or `dict`):
+                A string with the absolute NIfTI file path, or a dictionary with
+                keys:
+
+                - `path`: String with absolute or relative NIfTI file path.
+                - `bytes`: The bytes of the NIfTI file.
+
+            token_per_repo_id (`dict`, *optional*):
+                To access and decode NIfTI files from private repositories on
+                the Hub, you can pass a dictionary
+                repo_id (`str`) -> token (`bool` or `str`).
+
+        Returns:
+            `nibabel.Nifti1Image` object
+        """
+        if not self.decode:
+            raise NotImplementedError("Decoding is disabled for this feature.
Please use Nifti(decode=True) instead.") + + if config.NIBABEL_AVAILABLE: + import nibabel as nib + else: + raise ImportError("To support decoding NIfTI files, please install 'nibabel'.") + + if token_per_repo_id is None: + token_per_repo_id = {} + + path, bytes_ = value["path"], value["bytes"] + if bytes_ is None: + if path is None: + raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.") + else: + if is_local_path(path): + nifti = nib.load(path) + else: + source_url = path.split("::")[-1] + pattern = ( + config.HUB_DATASETS_URL + if source_url.startswith(config.HF_ENDPOINT) + else config.HUB_DATASETS_HFFS_URL + ) + try: + repo_id = string_to_dict(source_url, pattern)["repo_id"] + token = token_per_repo_id.get(repo_id) + except ValueError: + token = None + download_config = DownloadConfig(token=token) + with xopen(path, "rb", download_config=download_config) as f: + nifti = nib.load(f) + else: + import gzip + + if ( + bytes_[:2] == b"\x1f\x8b" + ): # gzip magic number, see https://stackoverflow.com/a/76055284/9534390 or "Magic number" on https://en.wikipedia.org/wiki/Gzip + bytes_ = gzip.decompress(bytes_) + + bio = BytesIO(bytes_) + fh = nib.FileHolder(fileobj=bio) + nifti = nib.Nifti1Image.from_file_map({"header": fh, "image": fh}) + + return nifti + + def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]: + """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.""" + from .features import Value + + return ( + self + if self.decode + else { + "bytes": Value("binary"), + "path": Value("string"), + } + ) + + def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.BinaryArray]) -> pa.StructArray: + """Cast an Arrow array to the Nifti arrow storage type. + The Arrow types that can be converted to the Nifti pyarrow storage type are: + + - `pa.string()` - it must contain the "path" data + - `pa.binary()` - it must contain the NIfTI bytes + - `pa.struct({"bytes": pa.binary()})` + - `pa.struct({"path": pa.string()})` + - `pa.struct({"bytes": pa.binary(), "path": pa.string()})` - order doesn't matter + + Args: + storage (`Union[pa.StringArray, pa.StructArray, pa.BinaryArray]`): + PyArrow array to cast. + + Returns: + `pa.StructArray`: Array in the Nifti arrow storage type, that is + `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. + """ + if pa.types.is_string(storage.type): + bytes_array = pa.array([None] * len(storage), type=pa.binary()) + storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null()) + elif pa.types.is_binary(storage.type): + path_array = pa.array([None] * len(storage), type=pa.string()) + storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null()) + elif pa.types.is_struct(storage.type): + if storage.type.get_field_index("bytes") >= 0: + bytes_array = storage.field("bytes") + else: + bytes_array = pa.array([None] * len(storage), type=pa.binary()) + if storage.type.get_field_index("path") >= 0: + path_array = storage.field("path") + else: + path_array = pa.array([None] * len(storage), type=pa.string()) + storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null()) + return array_cast(storage, self.pa_type) + + +def encode_nibabel_image(img: "nib.Nifti1Image") -> dict[str, Optional[Union[str, bytes]]]: + """ + Encode a nibabel image object into a dictionary. + + If the image has an associated file path, returns the path. 
Otherwise, serializes + the image content into bytes. + + Args: + img: A nibabel image object (e.g., Nifti1Image). + + Returns: + dict: A dictionary with "path" or "bytes" field. + """ + if hasattr(img, "file_map") and img.file_map is not None: + filename = img.file_map["image"].filename + return {"path": filename, "bytes": None} + + bytes_data = img.to_bytes() + return {"path": None, "bytes": bytes_data} diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index 515ff147b29..9d076df44b7 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -11,6 +11,7 @@ from .hdf5 import hdf5 from .imagefolder import imagefolder from .json import json +from .niftifolder import niftifolder from .pandas import pandas from .parquet import parquet from .pdffolder import pdffolder @@ -46,6 +47,7 @@ def _hash_python_lines(lines: list[str]) -> str: "audiofolder": (audiofolder.__name__, _hash_python_lines(inspect.getsource(audiofolder).splitlines())), "videofolder": (videofolder.__name__, _hash_python_lines(inspect.getsource(videofolder).splitlines())), "pdffolder": (pdffolder.__name__, _hash_python_lines(inspect.getsource(pdffolder).splitlines())), + "niftifolder": (niftifolder.__name__, _hash_python_lines(inspect.getsource(niftifolder).splitlines())), "webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())), "xml": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())), "hdf5": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())), @@ -89,6 +91,8 @@ def _hash_python_lines(lines: list[str]) -> str: _EXTENSION_TO_MODULE.update({ext.upper(): ("videofolder", {}) for ext in videofolder.VideoFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext: ("pdffolder", {}) for ext in pdffolder.PdfFolder.EXTENSIONS}) _EXTENSION_TO_MODULE.update({ext.upper(): ("pdffolder", {}) for ext in pdffolder.PdfFolder.EXTENSIONS}) +_EXTENSION_TO_MODULE.update({ext: ("niftifolder", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS}) +_EXTENSION_TO_MODULE.update({ext.upper(): ("niftifolder", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS}) # Used to filter data files based on extensions given a module name _MODULE_TO_EXTENSIONS: dict[str, list[str]] = {} @@ -106,3 +110,4 @@ def _hash_python_lines(lines: list[str]) -> str: _MODULE_TO_METADATA_FILE_NAMES["audiofolder"] = imagefolder.ImageFolder.METADATA_FILENAMES _MODULE_TO_METADATA_FILE_NAMES["videofolder"] = imagefolder.ImageFolder.METADATA_FILENAMES _MODULE_TO_METADATA_FILE_NAMES["pdffolder"] = imagefolder.ImageFolder.METADATA_FILENAMES +_MODULE_TO_METADATA_FILE_NAMES["niftifolder"] = imagefolder.ImageFolder.METADATA_FILENAMES diff --git a/src/datasets/packaged_modules/niftifolder/__init__.py b/src/datasets/packaged_modules/niftifolder/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/datasets/packaged_modules/niftifolder/niftifolder.py b/src/datasets/packaged_modules/niftifolder/niftifolder.py new file mode 100644 index 00000000000..c6d039419f1 --- /dev/null +++ b/src/datasets/packaged_modules/niftifolder/niftifolder.py @@ -0,0 +1,23 @@ +import datasets + +from ..folder_based_builder import folder_based_builder + + +logger = datasets.utils.logging.get_logger(__name__) + + +class NiftiFolderConfig(folder_based_builder.FolderBasedBuilderConfig): + """BuilderConfig for NiftiFolder.""" + + drop_labels: bool = None + drop_metadata: bool = None + + def __post_init__(self): + 
super().__post_init__() + + +class NiftiFolder(folder_based_builder.FolderBasedBuilder): + BASE_FEATURE = datasets.Nifti + BASE_COLUMN_NAME = "nifti" + BUILDER_CONFIG_CLASS = NiftiFolderConfig + EXTENSIONS: list[str] = [".nii", ".nii.gz"] diff --git a/tests/features/data/test_nifti.nii b/tests/features/data/test_nifti.nii new file mode 100644 index 0000000000000000000000000000000000000000..c1d560658c6c081d065b4b368ec0881b94e67973 GIT binary patch literal 128352 zcmeFa2fQUmaqmBK?%kYo&N(M-&QY85YID*CX{B9BtF+Q8M+C|NNg$Af1R@Dc&RBrK zWMd4^0sObI4Hz&^595HJ?f3mosXgc1y}MdT81whLb3W&snV#8_gIm1k#l z)Me}Ww9fJuKedVKvyMf;&GSF}p?@g8SvGKCZ5*%Ox47HJx6JpfRhCu!Z|qoq;gO?9 zF1upjnnMSV9N2$!;HGP@xbo1kD+Vq+aN&_dSMMJ zspmg--Oq1&;m5N+w&=V6{KBFw@A&Fxv+U*qXD!9E4a>4i#!RYB+H}>#fhG<#ap3BomU_eRWm+;@6}5A)uvAxCY|_H zK-T=lPkmPZE9PRtQ~RGY{PipE>fl{-zV%tl;@g}M@Ba54iF0+}5o5~&S8C>h0 zb>^yF)+Ote_26l#E#Y<53C`mDm2&%3(mC8|4u(zz$J+BhARCem$_8e=K_~~iAm_8^ zVA2=0_SK<`^lDrMZgqlx>RShR);sIRXF%4KPyehB;T1k-r+-Q#J_+gBrE83H>RU5p z+Y*`fLXxex?vV`zUuQn8z_ub=nvDRtwdbW;?7(9ki%;U9_-Z*BF;7KQ8=_?8UK{f7 zpN-C@X7l;>N3tW4aF1*Z=r(3Evz~+ueo@9}=7sr+?^ON;qFR)-Xd1h!&sq@N4qkQ7 zhGny|RoMnUtFlGef^2R!FPn&Lhh&qua}F5WzR1JVV+<6H%U9&5PuUo{9CYoFZkV!EtnSae;~U8w5!m;g~SU8r)=tr(~ zl(z-A9ACs)!kO?X;i%QMC6xCK4||ICw8N7?I3rt^U638+dk~~E!7(0WtF!aDzaYCb zyFNRS9pQQqdfMp}{z*GV79m&17UB#}+N*nn{NuL>XzJjWTCI=O z;1Y#~VO?nRW@XDjb1uL0!8QquTS0O)HF!0AdtdgY?B47R*%v^#l-TpL3E-=*CiI^Y zQ-*<{8Jg3k>LWt?#}vc=hTN(lSzP3_dE>#i03_R}4_mWMd@n@%PG-0Ac}@1_?9D^q-v)dox~MHLjjsZ}5pZO4q5p@_ zy^FwhU3M+soB7_6y)}D3pGT;_3#c`R(a|~Zuv3v+H9qrBvh%zp$hx7agD6WM^rRaI zqh?rll0Hg7$u@Qv+Ke7-6nU?Mf2*=RNcDJj8MXBM>|)L?1<~zj-nB@23S7P;yN!Eu z$+JtLe@4(l+PppB9?I z_n}4WXAN0R4>1is&4=nHK4M+P^}M3~ETV7QlN|&3474Iz+*-LL8P~(Te$?`*=)?-7 zGZma;N!^-K$vyR;#@ZLGhiJEeF}MYzfqtP@7k9zLMH8p4Mdr zyqyX!S~i-422u3pqq zYJuJkqKxuy1@&ZoA=6d-Z{m76A?u2kXB}5dkdyT=YHxMv%e+Dl?LFER?Q@5sg&9CP z`4%ludZwuMF*KxMyMWh>wQNbGUPl0#6Y1h=fz>#RB zR>B(GVP4TT%;9$?pQ+p#1@}_>VCrfP#I+dYOnh6SJC6SB zFGLi-&Wjo2%tz3R99k3{>_gizwBVv`^sJ&!nO*$OgRkRh!Mee{Rt2iyMQsjJRHkxD zx)owOCWzV;a`j$A-L0ikdlgcVCzi4!y|GVf=g^BxqLrHn-)8Vx0xt)H&gYa~Md@pK zmuQO5T4DdIg!4ile6o(vz}9%$L?iwqPN2)7R)>t+!5hayJ;5@B`ZB!e*T;f#3^k)C z=)2SAM(-(=;C(e+YfVTtMkqd4I~UQG(g*gs>gZ>AZp!EvogwO4jMjV_9|o?m;2g@i zGeou&jxps-NU&iBB9>F3gF!d+FKK1y#i=E0NU8Q(srYSGv{$zK&a63p>jlSbV{Kop zpI)9bw;px$pxdfM^ zoiV=jOZ__=3MDPiOdZb$j)2t2HXv$K;B!vU-YQy#+H@~{mC#jl^`%plF-(`zs^Qi) zkyzRo?S^(n>s6+ErDrdncaf745UCmyt-R9lJdg$ASBmQ8r%OCDWyruY6w^#g{7Wy*1;)&K!jR?yoxL<}{Bh+E34jGgo+C>dVTD$lAzBW=hpQFrA`OWv@PbG4B`dugLx%C4e!JXtE+2Tq`Kza9CQNN(eR-7UC2y2o zYCSD?$l5fi3}yN=pBZ1RX;EmuR!JR@=`dE%`-15lEGA~N+CR6jj+lydf;Q!++Fm7$ zYD!q4%ovC!wPW_zk{$p&@xM#RBbxOaZej^ zkAy1P3a`Ychk-RpiNxxDJHo=IM?MB$3j$~HqweV~qwPVzLZVvXXs@ju?0BpY24hpS z7Hl)%<2bH|Q39=jst<96OttpuiVo=0qNP?>(1wSLHva94`c<2@noeWquIpRo-q;&V z0M%TO^nq3vuuQ~)?Ic#H=X2H@8<_EhJwgxeHnemwPc2HqmMPY_t@XA^THEzRlY5{u zS`26*5b|#fPlToRr$Sy`NqG*Rso3~kk45a;vPZG(+=eCMHQ8g>?ruQV8?Z~g0ZWsS zXktIs`y1+@^tANR8bE)g*8Fya@&L*@f>KXFQ^Kc2{Y@+2rG}J9BTR|JDtR078;T{< z8mz&lp<(N?z1W$)8O!!tv)94BY1lno#no$&^9XEFw-hzHLk$PjVcUSNaA*h`(69I( z53Z4DiFRl6(HPGMa;F36!``?tE+)>2&~~{WavY!ClD#8)CVMSbu?OJQdhASZgqN3q zW<6G@H-U8{xjmA7fRZdm6SUASOK+`Ewt7L>3Tb0Lf|AXK+v6$M@@xSbvKTwZ#avA+ zS^@jc#_Z&y2bFeMs`N#&BhddtaO+ttogc^^1lf`7a^!g{R>qfsQM$9Ur?TI{BL7@U zF$a#ff@4Vw9p`8*Zhh!OjT!)-hGQeO2%VWs+SUAb=gtDGZ8wphejHk^*fKUeltdhk z0$P&St?1YhsObf8jNG5Z>iLb>72g8RE3qwJN-eo0dw2HR$aomm(2tY<7_it^BudS} z+W`!=o#Wxp1bB8H<(dOJy+_7@dMwvRkoFKt(y`#U9BOzZ%$DA|KP9^>`*8L#H12$~ zWIf&i$Kl#d{2qg+d!Tp)w(j?W_%ibPK=%KEbu4E+3hjv2)?Y0#dm+yy@OufEZa`O$ zqZ 
z)ZNFkhp5XBa(;QXkNW&X_6_*hk6iX3Eo))4?&hT~w*$oia9s|q-#{+6(gIviXv!s& zeiyz5w_^jo0$Gg)V{I!Mh}1eUNBtc|+S@46GU~%g&K?HM-Ei+7@O~C~ei*JkfRqmw zwBMHf7P)Rli=HIc&IJb??If=`Jr?FuqIXlGXW;k=s9u%5o>Jbz*|BVYf&FfD<^rtw z$Dj?>bu9IJiKE&BO^07^1HoQ&Y#F~-bA2O{xeb}W5xRH6fj!{akEA}5{Z;lw@|?`w z_Y$JrrEPLs&@9)@C4Y3=IuZR7eN0?a2|!~jokSloP3@-@uuv4IB_fYuZBZ+p#A;8 zr+sakqD7fT4P8Obhd_Ri5`T!Y{51P<_K#WH{0C_56Qp?qc|8rKN08G&baV*%pX_VI zRknP#_01^r4shH+&NriVyFqs$&mrsA5#uJxatkOg z0mFP+Fy-4E>CR;Yv8%A--iJQu_jNFPA^Qwk`1R}`vTstlM`)YgKyA@d{U~==(Pni6 zUp(>1y>!G=kWW1`L3IQ2IzhV2;oEV3Z-V!4p)P+Hbl=Us4a%QVEABxbZ-LwUK>wPe z-M9$7>H<1@sBZLKGeLMEQaC~#7{liT+`0l?d5GATQQ{{^f0&xoj<)(HG<_ai@6JbS z?>JJuh!sB?Rf;{@ul{snu$xQB9$A*CJ~ec@-9B7Ksr|H?$`$Tsl! z2pW4M-z&)PLga8Y=eMHEPlED(Jfl7Z#^0u{+)a!l;J*RRT|vAfU|2@I)n~%7aWm?Q z-ldzsa|CYfFOaOFmyu)IP+tP&?X)ji@pq&}YDfIu=${_Fsh3Q0c7)(Kau(&i8i`&< zt*|~FBjdyO_L>!<(x( zzY=UW^7|Te=zX+)4-$3_Eyj(M;39H2&kMFmE32b-py`(6~MbzX4(C>p>ouBCrFM5-c&whj9UteV7+(S2zx8^&Nv!TS+2gK*p zNnEun#;QJN`;M@NhpmG3XBJ}(HRdut7o+<};GgY{8flMw3rKD6t^k)U%RbuQbLkc5 z^FILnQ#R_J^~3xJp-*~(`z&bQox$0YU+t&0pYOy~f3C-X@SK9%@nmP;kq*ynW1JHh zM^C9Ys2X?)=nnGHFZ4)(`6O5`Bivs85FxMQy9eD`3qqeKdKOgc$ay&2nuZ+ph-?n- z?#QPjI64wbF1J9|eaLMBn8u;G9pL{EIH}FDwIjyzfie0It*yJj`DNsI0Vs}>_xaS3 z{akN{r^mpgC){Dq4l>fb2t7Sc_;KXC6C}&vRxccBFav!P(u z^S?Xo%49yHk?Sh>=jgvfk-Fj2oG5K$)SS+Yb=MW`sXRIc3hS|+zlXtbCn0;lq;&W2 zy_)*8m3u2We1MWo&UKD~r|LQYR>p>>wkk+`ESTYcw9dQs+8 zAlQOjFQk_{4Bf+&dkVPTP0iUt>+*TcYr`fTfBOrQ8CE}z|+f!6u6fwY^p8G5m9 z$_=J(OA=y|J8>jm7KfmZ9A>JxaILJm^1eYQf9)flv7=`B))Ri(12x;ZR-<#`B8LPULY5I;hw3Hga5Ew8wqH=xl+ymvlxw zvhSz^pZ@B4)9TAK6m5He-se{MHXezX$Lyj7>Qdw$^`P=1&q{p3L1#cLf$gL3->?_$ zi&p6=?NuQ>+SElpenjq2Zo zIclI-0D60sACz#FJi zSmWs$50#p!gM*H~qWr;0&r|6ePk-jnRIJ*H=$;((Nbzb@zr3qg zM2AAvM#Txwww`d>Ao#`CMj z88Lpn@Fh$k+s4b#6#74p1HX=1_&<-!O(kmLz%QEvO*QwI&Cw=$O&nDCRWXC^oP=|| z&NfCGb7^dD%)1TeebP*M%vZ&IY-Ugj8UBZ(WvN_3?sesk2n!1%8C#VSl-Gk%h$S{UVw z5YxvIrA$`|Rp^2Dq~TS(eJ=DSD9IV$QEtxiLT)DcO3zqw#q}lc2LgF72I&P~5`t7C zs>|rP&VvDKz9ZB4 z=4TnB9#m_Zp&`#GkvJ2Noe9^Ls0(G+T11WZO~cxU$1WCavw^HT_F=utHyFAfgElLP zJHa2h8XiE>^&=HEraP;cuC!|-5wcP@(rSQ60$D;B*NKW{kGEKMC1<&&?}5HO^2+z+ zHY21r-0RPEE7mtBv2HjJoZ<_bN=iD`kg#0WH^7xn{e*mr;2_G>uE1fMz+{>@4@5<9 z5n`!AC;XL4I}o@_h3k`QP~cPVv@6h##8qh}Jc9LB-@n_3m|_Z8fRIi`@HxLW|#SP;E^GH}dGqDz$GKAHtP(MhoFXSGchQ3y1Sq1=q$ZsEba{_*O$DI~X14 z2d!4D8=u1(vUWv-xV8@IE2M3fEp*fcd6)QSU9f#hwZt=J(wF?l7AX5*Yq9}5j^kKE z9A%~6x8d)BXRES-NL9}e-*0>_XothAax7t=A8)ACqG=wb)%I>ArJc$e|9TL1K?>S9 z=r`0Ge0s5^9$8*_SgS(5apkoV9nM<*Tr3P8#ai@1Y)RhCo7!(jj$7f>a(LK_80$If zjOEB>SfA`JtR4D+p*4h}j&&m65%8`LYx#QJv?ul~*7!S7rVVh^y(bofd?|N(!cBd{ zv;-;JnWzogE8DXBu@$MOyu-kv72un(x4kdZV__wfe6#Xw&`k&17XF7(g1K<@aS-n0 zY#{VIL*G$K3(iJj8?gloD=kHrfpRFRSAlFkrST2+$FT&_zH)N5fUw1IRzH^*Wd*-1 zX@Ia-6`H6>ob^L}vHfX_R$WPM zuOZDPr0~u1JHU83vbYs$?ohG}J<*bI8`^mjD0W~y>${+jf^Zl*R!7b6OnsO^S>A)K z^w&X^=g(o?{bR8HCH6zNz{O|LzRSqzzU)u2XuT2Che_ zooLG&;rZ9I?_m4;9jwA`0@V$)GFy@Jb}aheKprQcIE6Yi9374kPxLNITgz-)_jc65 zH=*xW6}7;3C~MJmA>4n6@_rQi`FD`lEpT=(c$cAZli}WKw00IbT@3HrLcbS0KL)l< z)PbdF#SPfPzZ;6XD8sc_qQ4WJyaCM_ML9N*?q0O%4bZ=jbkixjZ~9N|XzX=cDvu*^wu(@gm zfA6L?-Hx^$D&)GEG`4idr~w~G4xhmG{+npsF><^St~>!ZcS36l^0uWJP1&_Hor|_# zO*!Vk%lBu0edT1ZK zh8&-O_m_}j3-Z~HOxz*jE^722q1}I*{X3sOfZCHtH^X~g{RAd@Z$rN{TQVjjCA|(@ooBYF~&326Lqj1 zRGvi>u7lTm(3iEyQTxf0(D;1zzq97~*nCYsFK>~55z3ds8?85eJN_bS$YlD38OW>$ z?cM;+*HXG!V0ApU1)07Fu6+qTJ_!H2=H2qogX(!u{|1tI2+oW^(tSuh82)z#pLX2V zU-{=)qd6_tMbzC#!E>On?pqJW-IP;5!f%4FYd$>hm`}z&^T*_O4HBAO=>LAs?3>)v zWI};$1bmwU7pH?$%l0jxT0pGFK)Ih1y$Ao655vbdz=_M~1r`*Dv~6oa?dlCeM~zx5 
zs(BvvyS>nnr_iVy(AI5GbC&=u$?u`wekc2WHa5RB?+3QI`LO)`;M~FFYRH%34_4xv70)3rt=b*^wJoG(EnNVPS(JDY=vSjdo4C?z)7lWWkew+_Z;*6(v3{hH zpDoN_c=8PVJBdus1KoM#xtBKh0i^!ztX+OhKDIKhvN-?WS>OD3XmKt?JJ+Fe(}=eV z&g><{eCRHNJLb@jujBt=d@6+WrnNEe#iVFmpzVfMIRe)H(Ptd(&3*D3P_)O6O|`4^ z@Vq}Y?TxgO*HQPjlk*0+vX8drM%tXt`P6)A-mY?O<&OM?>}T0m&`SM^w$U%lqGfZB ziyiP|0lG7daf5YHO>bR{>3k2pkOU&NYrtFRW< z(sg#E7dh&CZ2PhU`L5u*i?Bnq*nRVv`KbKs`NYbR{43dCQx{Jn`G?`|Lh!lI!4S%{ zAI|lJqpPSHBMM&Wu}~rZ@DZ2@0wo^1EDqqAWh7W9Q4%$w13Ec}^!=&HBanAz&?ip| zpU~Q)Zx0qdqrPGrK(`5;`;qny*|+dq>p=Z@l-A|zj7i=DmtPBS^&r|;v@YS7F&!Q` zh96HJy^7p@{u^6JO#4uGCg}^u-1kCn!I_|PY(I?O^YKVs2R}y=Iu71z=N#`Xu$A$- zYz1}U?!va&QIzf99_r>H+PXL5x%I=WKL1x*n%}22ya)O2hj*iiv5FSrD$duV54H;Z z2+`kU61?y!-lufmqVIkVwvVI1*AuSlUE7zdEnwOL7p5^9)Jx+WxHyiKJ}v4msrKp7 z*%s}&iq>4utb^pGkC46kUXWZxzxe@tqdtXq<8Q!Y?Vz_&cX!if?jg-viG7rNo50zF zlC33Q?R*_AO@W&&z}uZ1`XaSCd>zeAqlT@gHXSD4kHCQr;O+yy<+Q3xKtBqMAzx=N z^j_$TtZrunaDtS3$bTdGY(naL`RLbmJIL;%=HA8CLPmto5%T~c@8N$1xK_{#x?4zR z>gam%8cL}t++8dDL z9Jn!pdgycWKtAosyDfER9_i-7%eAz$_uw`8A!u!ZhZhomBJqatKO8Q~u}&}c9v!)I z=atTA#9nf}k{Wyv{`tJ-9$_oV{SqYU&OKYuw#%WQzvo57d63@oVa6nt{CYHQdNI14 z1kW#lJFDR37$h|xX;0(pnDGemxdEKFGt%fpZ}f4-8zbo#jvvrzg6M@=VRg=MUsx&Tr3ufxqc~(oCns1Ne5KB*V}O z-;Cf4h(~Yow5Qa6Z#Y`CkJ4WOzLlJBhLgJsjUP*_x6>E9PmaEXj$m)2kAE%vV1KE+ zedWr^^vXN3x_k>GiM?6B{M~5XdU&%HUVeredM$eYfxJiMU6s+a^D8Sa)qeojP$3tk`S9$)p2EMni{C;I?<#+Q* z`NPz(+mM29L1;yc#nhVRNcP_B7sR~`&2@|_KF2Z6A=^tgrz9(piXKrr;rq34$M)@7 z>e@AE_b5=h7u==j^cC>#en!aa(d0wWeJ_3CJZO#2KA#V)dsqGqQ1{I?dXYnY}+yu zJvxSl=|j5|+y~&4?e#A3Uyk102EJQ~*9V@TgS2;%(!Btu!^wBiTYoCoOMO6o5#(Z!2^MIm{t$8~8pFPMJ#&O0u3hJCj=Q6!rfB+Vq`hd|lo- z|9iZ9euG|YKJ?~L+b6-X!BC#c?>5Tr6TZ7{+Gn>Tjr~pc0-thrX3aV6@I}nY(;Y}g zqiMIGvAzM|T3Ur`;NmuTdN;h&k4=Ae^}_zWGo{kY&KX0;QX?tdVYKnB;OUVsVpM-U z`0u95&WgtP-agpzsyN$oH)a)V|GO02aM!3^ z@NggBgM9TdcKl|koRfB5-_eo2mFjiJBDbD!V=L|CQKUK%lszfiTCnv2(;iUy1h58V z{o$MQo3_Hv2Bmpl%kCkzs3pP4nDIB<`QFy#>!@)k+#JoXPYAvb#op2KsK3sRI0iBg z>9+!>W4tcpt%vBj@a<;w&i;QTXdU%Ur2Sd~-H}k$Z#ew6EOT&6L{T?VKGjcjG3PHn zXK7t+3GX{O8*^wr9mGnJD-VuRV*WEE;h3%t9_t(GjKBSBOWHc;W*v1;rKD}pu>l3H zn1_z_pZbDhsn*IrIprs2;DeWOW&XB7%HH(??*;OZg0pqbNBV@|ETUAUDbRvp0newH8^wbUeV5+xfUcH^GPcqoSt$7V(~;=OJi9?9qU{vbwU}E2p)?fN-5!3 z{V=6^U^>^Qm41}PRj2SsOYcWB&-m1Gt`=r+P~I3z93GKQlz|}-2Rh3WtFOjRv?Div z)erHhv7rs>MdE(S$y`jE(tCZnL#n|Ee?5=usNd2EK1y4O2X5n>dL+d1Q(RG|&;idA zs>tuu^o@szD6}qqBXyumB@bR!uTCT^wKNkgbw#RTtRsr&exg+Azc%gZuA~`dG|!T% zRBU^clfSkFFI93=7{~aL-V%zj3_Bf;Glfed$sx%lIFt~Xhn#q+BgHpfq)4rS>+j{~ zrRX%gZ%Q#rGvs2780&;wy)RE{kJ@Xgc=+qLj33C%;guYLPHdtJTEWjaS3Vwru{wtc zPa(#xK8zm};!f?|>X>IfH(ioz!WA4y6jRRW`poqHYwo5pTIQ&CqNy(btIhY-madvh zrDR?$s$YKkvYh4FFF%)GEye#iCHd9z?Y}FxUw3(>x*RRcFOy5JzU0*us?UF!D4S?C zaiECsE4j~Qt}e7Io^^ej%P;F<;)fo>%w03O`1=nliO;*khx^CeNQCL(v`jJM=8^$c2shm0S zDbdozY-5a*oTG@;u2Z|_A5W8EzwGJeO|F zP=yr3w?Vp=%eN!=PZl5mcm7#h8RCmSxO~l zXz)?~m`B`Cl#OL*a3Sc$d`^m?&RS~>^%G?bD#lM~%DkgwQVd=Pl8{m0iabrJZ$N0a zccdQlj1dUTH!wtb=Lf|nwRGFziV zL)1boFulBp*5?M$>Zg{O%L+XD@eC1ihOZpill=Hr|vO_iyT;o_3-Nl;k z5mvbOv2HtomB+A+Zpl4uRkt&?+>M(MaRnsh0Zu*7Fb zb-QD|H+^r`Z6~pUyoZ(Zn^-G<6YKR?vr<2Y74pHXL5@HNeWPzzR`t6R+dW5;HDPc@ z?Thh)V;oV~jwok6i0lp2Om~ox_o0)PNkp|r3RAi*$W1HN*~JR}V_3XAfSpFzjV{Gj zU?%9?)nhzd>;x`-v1Wp?8*;L}h?;B5s|T4H+KkxloZ(I-+U1JLRrWxwosc%5az$BI zql+3*9j4A|ak&+APht;on6q728~85oEy#H`x@XOu%*yv9^lv&jO~WE$Fj^`HV~sV& zOl>LYK;n%e<}}J-9Ug#o_(tQwg-&Z%F%UiWjkUH2VbxypPOS0u*K=!LTkJL7#%lZ~ zECqarvXgh zzWKT%c(1?)L%YtoV3-AudZ4dL&DK0xHQ#2x0PFjG*dDB+l$(jUs3_k6>Xep&UBKr% zZ3lo{EwP}2%$C6qT){`J(1J^U^~!Xwdu3FNB1kG785 z*bFD%A)wP9SN|oot`~XFqNH0WMIY{8j&;g0^ko#<=$l$Q6K@iwnhcH6(C~fbKC>i? 
zY^dK7s1sPh{27*vSE94M(1)RDfbTb6N=}_gp>B@^vEx`Vg*-** z=+++0XMk1M{Akmt)zB2QdLHSvg5yeXAA*xBpfZGbqv7B6gwG_e?u5h+Jz4|%{?*Ck zGYkwp$ayBzw7788@&7VUhre-$Lt z7i@3EmS!cqn*}y^neiRVT0!4RD|n2&)_|}tI3^WVMEi(01iC}e`0hn7(i*PJhQkvn z&0=^p8P4fhwu1a$z}D$1N<5TO`J}Xl7QY*@RuO9|v9#~7C$)7_50!3R!Ao1tHt5&; z(C4k-(hAGHwU&ZU4~7S@;(G>ru=9yCpK^DC2UlU!`etmNW}@G|*IQd>Ek$RbmF_gP zggEPHn{G!3SAs%o`Pz@2xH15Uy?eY-xwXKdm3#@kpEg z)ksb|!4pVf44k?Io@zs`O=LgHvXb<(XnRCwFYo*L9bMR7*=Gj-d^#Quj_1&NEywL& z=fb@W@b54!^-r-C?U@hEyI?W;4sx6W{eEyq-wHhg^~z~qY{Z8S6$XQd?WK zv+oH;J$pJ+XSNZG=g|Z5^oJoY z?cBBIw11pNNo}h-LECX}EAUx+E3|GOppDx?yS4(|nFRv<7mng{@hxndhgO!=9jiN3 z*^_U_2k9B8_*VAGlx;HQ*g_gdhK^U=^=~5|_tUVS@J$5n5!Q!$gZSt(?&!iDBxZo^ zYBc0L%HZ3yhrt&u_tjHJ$M$*lZ?<3Z?^ZBvK>y}|Zvi!B1G2jTZ?N<8t(6PwURQT< zWlnw$wd5VN6w}GwexeI!GpS8;s3$AIGo16Sw0pY+e5?F3`KK#C zsC*AA*Xenu{2tKnBlgaMH;!EFO|`t$s(&VVZRDyibe4j5Y{8v#pynQHwl3}gX$zyL zgIX((yCAtK$YBNJosm$S4qhooU)P!(pTd$_&yRCSqaVBbD(Lz5EIs?+{K5Q%d0o{3{Jv`Sp z$T6DQyr10G@Oufg^ind#81UtMJEGNh)BfpiqW&q9rEqW!2;5!k@3V3E=lvW0v0sC4 zdKJz`hX?RG1ZiGTwA0}M>=TJ2SNBcPyK)jJ?1y|dQ2R!}sY%Gz8q$}%7QpQjaN)M> zX>@TjV}|wU+G0L4;P_Cm`5dpc=rQoQ|HE1&JQMu-uez6wTKF^`j^D-q@-x(zlSP|% z0ln7dq84_9?md*nmftaRKQK=R-CX#%5=rj_#V9^g(6nB}h~0Y5;f{Sq2g;+L?J=b1 z=wTmis$LsA$>k!n*7n63T!$PUr@wckyBG;i=QD${X-lq!|HIt99}9LpzYmaF|AviF zoys%R5PWAI?ujI+LnTu%3MSF&`jTy~f5P1^0;1=VK$~(}zS{J4v~Y7#kTC?&AIp zl=}iaKbU9jzecVym24Fb@0XgFgGCW zuQ0BA6k3PTH|yF*(@+worg3?t9I+}TF! z@h8~A>y5t$uG`CHltVjq`_B=?z5@+#Peu1Hol8BjwV2EGPUPY)R>KMjx_8NpkF#-&K^FYLEU^^3NVxuRT4w#v$RiBePYUxnI(FD6c{q?zgJ%;$BLu=YXD2 zJ1ET!)aG{~=S$F`Z)dCVO!)-7{47#hj&ICp!k1Hq3*qit8R36F8;{q_&q%oojy+b? zLi2Rr&O5>VQTA^8IMO^9+>`ltEN%Tz!m)ZX8m=BiCd=s^W`WH5Z(CKW zO-S=vbm7753x%J;LNvQ0oL>V^e>2;h|1&IEr?3Yp=SHlIaWY z(~(dA6R2m$sEfPM`b`D*j&lEf=$`LFs8DYm!TE%`137;;8-h1vyWD-sW>LmH_>}g~ zpU3Y94D4!EGVv>wt6L47SDZ^w7-3R>I_lq_ZpoP+8F7a!-j>7z*C1pM1Y%DX7_Rrry; zkMlc_|51L|!S!3SnuET3iQN%uP&uVIt%DC zu(Jhfh4Vo^uljY4$ohhdAFg6-`{mcM+cjgv- zJ%PrYYmc}Vcc6-R!JEXVz#4N8fi328EJb;wlG`S@qqX|soTYC#amVKo$lkWhH#k_o zq8$!yBwW?>O3IN>&`%srVS&-~K~b)HFG`eXcrE^P#H_R!ow;z{*>tX9`1~DpF|Z_U zN%72O8|!Qi)0W%`w1F~8A4ns&)6=WQ7iEuhQ3FjKxx~HG(<$#nOw&bb z??f)4jd7mBB3;TkuBv}S2dX7%O6!GbJWmo)J74+;T*gbhlY+9WE5aJ04vGnLT$R76 zT&6aC;IVd>+7tPdY0mCU%7H656|@ujKXyQPO6k9b^~ocRUrHX9Day0*Zta=5G#)F_jeqA<(u|HF)oDr^apzQuwIS7MYtb}zRhzbq zQRZq`JUgf7$do4GDN}jo-P&CC%n#v}A0-Dy0xMDI+oBJcP;bG(w8@zPaFDbHe7GD;H9 z9!k0KBc)Gy#FXekC|}@J2O2*b3$G4~G6lWHDV!Qwon9R3mfUl;Jy7`{bvr`DXBu_D)o6w-B|^2#$y8+F0eO5YIZx|r{G8^~uc->^^jZKT?> z$2x7SWhRR8Z)v2L_!sm`Y}K&?dv!>8?mZ>xZ(vp1^ipvpxd-dGQ&?a24f3m52cFN> zNYJ+9+kyL90Er}7_Xph)McIx^+gw7Y%lysNl9Xwx?**pv*HvCQ=MF>tSPR|A>hd1e z>36e^zP`XXm$hYgf9Xj)?IE=oRnj3vX(tIpJ}F!jS^|X51UEv9lKg9NrK^P8f92sS zb2sceX0xj8E({l79d!a*gF~!=uK}Ufk=nnEC%1O+uNV2c-tG~(hCazTSL4+Lx$Bi$ ztR-rcMmd?A{6DoGm-OOH-mCA;$x-W|HSp{xmJOF+DR3N{7~gukADqu%$Nhe+YmS1^ zy;fdb8B2$4*hRd6Rmm68u~O)m~>n0EHoAoo^NuFBDTyxW%5>AvW(cJ1wp@T6yfF|I;iB9(QpEvxX_ z(D?@ZWz<wM$uDlAkMvo61kb@|uxuJ2CRF8u+PS5IQwbw1_y&Ft=D6Ro8Ai!$n( zR#dJ=ht`E;z4pE7;iDJaOZfbcPv~>hZOhgY%ZeS;){UUk_DVb};N*^?4zHwbaj%6- zu|?{Tcg??pRm_#x!@LgbIW4ZWj?nu+$=WX34rraCm!CF}e&nHdJ0d5&OEbRunA(m< z4H21~j}}G#S$F)kUYv)uk$W6zm!gHXwu$TcZsKfhwt#kUK4))2zRmF!co^h6ko;ci zvh~Avkh;&6HPv(LW=piwx5DZ#oN7=@cpa?`*JX6}6Agc6jMWpM#6AIm`08+&9AO5waC7`!;U(<(f*~>R$Mp>95vl$1O9Xs4?^s&@5yewBeReLUCEfuq^iRrW<&)HPdrN?@)c3J=42FO{gPof9~4;Sd)~!+S#7_ z_WFG&jYwmp620wOYy&T(X1M>37JswR!!5}2c51-gSfQPujXnW?eed}StlHcy=nJ&m z+U#y1)EyMUvcD}BE^Fww?FIV~>YHLaFt!k*wZ@w0n5ZK$#_-YCIs67}3$0t)A?tOj z{iH2c9dQm}owgTD3*ep>mwGO(ho=wIW4w{i+p*HS0g3L%((WAi{0uGNPv}eZfxMY= zOa_O3aN0NPXQ-uI4`L1h=`c`gBdA}6HCrDwKME)1=X2a}>D_pNJD 
zYsS!4Tu)w0(f8TZ563LZ_a<7ocNJEJp8$(?b$WPBK}U~a$@#}{@jcY%d-&MbjVml| z=Tgp1Aeuvaw5X7DGxF*{>*d~sj(F<1I~-I4C_^6*s0-@vAo5m&)mwW-+nLb5bFk98 zmi|KT1?%oCT4%=x55lL1@d5d3H13PoxZXrBya_7qLv|Z>h(E+~_d|tmfj(HH3N7nK zDR(1-nNXNUNhW}%4Ph(D+0lXH0&Ug$K}D-jdxQRkyeAM}?@Vix){fQ>E$8Y%bpu-O z{$$fBfqUPqg8Clx>^b~JTd-@&Pq4rJ3fg#(_E65~-*p*0{Yv%?TI}n|Tgi4O$H%EP z%Rw^|Zp}bX#zU_^5_4x2eOkoku7o?#=c~|-tBK*8skIH9j?Rzco;6(08~sykX%=E{ zd6KqyF{PdYz3E7C9b9-2`Mopik{`|g173X#ULL15xa-G6P_BmK_rcwdl6D%HeHZ47+#m_7NrXMvtA-NB|+U>oA?p=9nx)4ixsfiHu? zNqDdaD)Zp_IsCf2+yVTfK7wWPpJj{aFFsGZ`~*_ni#+r$6ra8`uZ3f~DD&DvyRM?$ z`B!Ypedo45eygwy-2tC&1;ISRI-{pELDCKlxsWmqM(d|?Zwx#fN_pK)${hjSXGi}J z`$YW`-B02`k?U;kFMzHi823qe6dn96wC_{JxKE4LwIJIGm$mC|1H!$~cW)s3t1o3U z@{eMD{9aN#MDE+ERiC3)-A2uChg|fG8CTSe)o|oNICudqR~=fY-sqXosnFRO#9ItL zM;R?B*M*eu0QoMX%(Fnh3i-Ipf$uY43(DU>43pvJii!!|Ja2KLoOO+P{@ zb`Um=er8nuG}^P0x-kpH^EtETu7FRs@V%OTs2~5AQn~~1VLc&obU!@aM&A3;bG>6f zoQ=%)7yJu^=g@Y%hSF|^H%p*;E_9b7zt%{xKk{~0wVkBa+SYNFW1Qt+dOhDMq|n=G z1={so_OASJJ{~T91DWYBc0Kz~xi?TZY{ze>zWO%*P1KUNpq1xB{SMOUCA*E>ZecgO zH_)y=%}!7o^PS++hS^a>d(zxhAm2g$`iIV-RD;1blyvUl5%!~cu&gKUB0|>nt z3D>V6uN%ngwT!Ee^FIE+%GThq^bK0@gK+*Xo(KL8{@;s5{4r2mhc;#T`;qG{*yi6! z9#@0#>1-bR41AZ`_?_%db}d|8@J}y8ZPUjP`*QTq{lWBT8bL{CQ45BF(_J8ymR8ba--uP0Z%m+vOe;Yi>BagMU1-*fqx%ACrd=Kr4kAG|M~$UjfJ_zXO|n)~z7 z?w0v`=oj@ey^mJ(KNu1HA$`$D(cRZ%t@CmC0nRA+=Q!6|+#LxXMuvOPMV}-WgH2?5 zr`hJL;_Hre?l(9W-E$u`eYf46Wj0rn>0hov_ur0x$3JHGdFDKi@6vZt+0=13UnqL-+2_X6L_|KU!&B*^vE9 z)+c`e33ddZBOX0`;z_FqHE0Vw)C+w(dbXH4;(naIxnKnN^!%R9??!6aUbuHV-?`jb zLEK}sKEI1x53pa?kMfi3SMtOBeR#e0%imV)eD`HWwcYU5eH5vFnfBpx_^bR){@08@ zrs20TrgA5{mkh3)L>tGU3%>s*##gq0VJYtJuMHt1e2=m38f58M*71EDj`71^)VE{|qkf)8`{R-A zQSSt21}0Ls^cr!;y3O$NFqquK=~AStf0n)&&RF=Y>;577cDJKFc?uoT*K7?v!Fw5B z{Au>*_&@)MTKWC#(tLg8L}hs8|Kz`)zbF56emq}HOSdoob4sd**9(+ddv{SJ-o<~+ zCaH~%e_{^DcS@X#tleApJT&VvczS^H=?Ca@l~1caVOcWY&Y%VRBxHF9{N0cZM9U6Q zqaLCUUkm0Z@viBb-wf{_#s}yk{JqY@>+7quDOXdvz0{Fs(H@_d%)J&}@G)|91Zj&C z&&rO~^cf!uKPS>w>zS>mg)xFog~aikn%?m;ky?BYJ<>K}>!ZIGt}Z}lE~QPmo!as- zMw35fuH@0ex5s^%wo)%%M|xN-Wm*;IPlFQJ$b6#o(jfU<2p@kI<)IiDt}Uq+)$kuYFFkF6vKu z4mEi@sP`cm-+rRMi*lqlGK5BEcKB<~wR#v^~{WYSzwnO-W9&x-_$zlJ$S{Dje~cv3cP zXsEwYbAwJpnvup|XLF;65|0{VdmeA8v>Z`JW1jv<_ac|{E7wCS#1i#1X0*MQuZe#_ zIbx+SwL!io-doFquW6Rt`ftu>#ul^_)s!OmBfZEa+7!_$?bB&;rOSPJU=hY~JPJ}ePS4-t|SLU5428D#hJ4#B8Pj%Rs z)d*6`s85kc$f3NGa!w&Z-`^CHVx0OPG5l?qYbtF*Tb=e)xu_{F<%Ui+JJPB?-~iJoy&A6^|+OXJk5+PLMFXMwbqzfvgg zG)7MjhTf*yZ>*@-XHI)6wKAl{lES3e*pXs3b}!{`3B;M?o#LAI%;{6i)BnGE8r2Bm z{7f7_J(4rssm|j}@y>iN_;=NDi4YF#A(y2yz>b>faGo6>E zJyWb#a_^T>)>0DH)T+;439_a%O&n#CJ&gd)j53X48jl^% zCnbe4R-BdBWt{4>K=w-TRZ}*7<21}GJs+lg%V(X2X{xW&dHPSOtN+V;<(c~=&rEmb^oeqHzKy4; zj#1{2&eGkKV+ud>f0|)CbFA9CF;h^@^^|vYya-FBOLx+B=wFJH?w}RK$%mq;Bj!Zq@2zSPo)lAB8m{vOV6v6zP5y^B#D~5N=L#Z zJX&o=Nrmg`w>;F zu$GcYJcv9}P9Z_dk)-Y1e_G>-)JalNE>k925mJjZYHp;_8q~FZZBII~O52U~QSEbG zFVluqF9h?Bx|HZv(+hmAfNJ3y8t2NEYer>{sl?+*|Ftv%bK;{Mj}@MHbGepgLyHx? 
znOI3^mlL*y+Ofwvh;_4Z%H|W)l%V;z?(PbA*yPHoXd9||R*NpKyzj@dn8p%!C@dr< zvjXZn`}9lM!;1L5td5?<_o@h!=BVyF2H@O>ul zTj^E#P(CUDENk}D$i>oyzekdg_|z`ZX}uS9J#;U;k>rFvL6$659K)9|YIa}>X~b{j zky9t!ZN0js{)?w{F9hcK{YY3oH97oh!HH&9E_DTddLt-_^jx!O$~9 zqOQgpSFL+`{Dm)#Z&y{~N;v9?>!PKMZJ#`Xla@{kR&5S_`@L^=T|vF|y`_7x@jHqR zzJc2DNIt1@UuA#goB8bg%Y;oY>WOwy+BigS9l9L-kl2-Mv@7bIUIMoKU1%f1GSV`r z$=U#z%09r~D6Q0_QlVB%qlKFT=av`fd_(D4xVQ-|ycj*a1v|$Ld8P75<=M)-$~C;B z_&#iYCcqOZsPpbDqD8v?C6R0R(wJj-7l$WxSjkqK}N8KJI~ZEjF#q@{i@0RK8Ssyz+y5LjD$Pt5y}7t`D5MMn(T( z`Sm8#-bUM2E#ty-(43<_xmSd?a=v?1e}G6MK5KgPUj;yD+1BbkFdzOcF6{i=wcuQ& ztF`lX(4CKreNX*ovX%LNQg^-cA*c$T(T@h_*8ow&{0 z7B*^69dWvorVDqXEmX?J(R;}>;U{Gc(zns11@5cV|CV1YT^lFrQ9rVBwFy2`&f5fW&omym@;}IxrTeQ5^D&LY> z0_E#CM888*dYDp=hly}(0bk#KK85yW3EHQ0cVLHfB?zwq$u(enF6)Wa+g5QBUBrrrPpo-Ru}i9dr*BZU17NX{n4G zZK&_@_3in-Cs%9E`PAMu*hy^$-$|^zPJr!ZH1H_@AIp~Jf14kvTvz!=>cZOA1#CRxixXx@vHxeJI=Z5K|ikh(z|x2C64-QO=(UH zy&#srLA_^YqJ^u9{%tGTcPkc=z7hKh_;?YYTj@y-<$sAK>sBPZS!rynF8@m&lKNh5IX@%z%_>{19z6bR5N0>+X!vcE& zcsGFXLL_=G64t}aH)tOLqdo>-%BJKe@`tb=)f#p{KqB{Huecd&)JKucdE9x3d$$(+3wzntlzI^emm}u|AawTUf^C`7f$xER&44L*L z-E4jbAomsYQ~EjWhGV{Ce?FzrZ{SI=UQe0-p8sUAY5zC^`g*MC->8q&2(;9vkXZ!} z^)zr_y&XmWvXeG@9aw#9|8k@_3wgYieAi}g%%7;Vs%*u(0S3&-G}SmRcVEfP_xP%NXq)_sNTLh{KU0f)`ry`1V?ag>*XGSzIh-=h;?70r*#qq4)GvQ6Eze(6eo{BIzF!?EX@9AC!EUnpnhe3b%x0Jf$-b(HcGMs-kd=H#hM?Kxh6;H^8->s*g-{e$_w$~W?h^Q-bP_^yns?5upN@~OJd*WcHy()`h8l?r~H z`FHZ;poq17rT=@4faWu_v2gBtJ1aZPbf1DXy4mFBhnw$f)}?YE^=x(wi+;s{X$Ee$D=^{>}B@t^fV{LG|yew9a?a&p*$Y+&4DpKkd^} zc_>}KjvQkQZVTi1%yFhOMg76+E^qPaQSjBLMa!fwtQ6Z)TfRn%q4W>qMR78J1fQ%A zSKgc-$=2uJsr(E-qhD11sjg#vkNTr^e_r{k$_MLyT3MU>EcSeM2|i|OmrvE!2J1&U z9CgR|QBFFhQ5$^@^DLhJ3^jFnUi7&kT7bamyjFYk^l8Sh7ZqcGU3ivtVb`3)iB}GZ!6P9IbY3fE?*-bx;5ggo zK@kSlNEhfVL$n2Y;&(>&x8fD^5Sp-vQTOW^L)=eY{vhA}JlFpaf1iKJ4&-;>8MhTg zzG>mB_^Vt1Hpi9?oQQmkU6at-a@6V$3@jC!7B{IHI=BeH>L-4|(<-19h|@6D&NBk~b@=mvQBFI^rzCt=0CIzU#&D-fo`% zZf1mV9nblf(z~8RAJqqKFqU;9(w4N$*PP=h??!$p?{p=n0;&8pG$E8mqEgZ_MBVV& z+PN0nQ=f``XNb?9KJ&)Rg{3a_`b9W|1-^7$OWCVHl<=t?<{9!gG`{6;iPCspnX|P$ z^vX8h9bKFibi5ROfib0F9Mhza;SGiPgEtKW5 z?Py(K7G2_y;eJc$NaLlBh*OQg^wO+NU3(QWHC22(3nXzCbt-5WV(R!eUMhQ_dnt@5 zeQ-MH1jW#dhR7^&#ERBBr7+hhS@5t-m%?pZ%W_D^7gf>z|h&eS*#d4TF z5C_u4r*suG&BKvEDQ_+s&}}>q<280PjNLfO2#q>Yo?BONt)EmDaGh|9Ow9C2>d@Zld>D3*tJiUBMwI+CGYiEgGJ*w(5oejd%#W6fIQiM@^ zg7)cBROg(|)snM0&MsC-DdPLvxTcmde|cwTTb9W2R9r8&jMd0rUf$IyPj~)m(7b$_ z{}s8N4nc=;Tt9c`tNT=oB%X`Y(?<&5jg)TbNY){ds) zX(Gi9WVJCGqOHAK=9ki@`;Ff@-QAQ+x}UCnj(Dl^*QRNBReN31ZHO=Kq}b`dHuR;C zl+@4utZ^JKm0o;-@pMREibC8s_mV=KH@tpn37nc&NyB?(9<|}8rmGEU7`yh`s|cwr zMJ9E#t@sHCv;Ut*0MOqr;~Ok`XK|GZI7T*oYwrr-4tYO%jrC#>pPIn75No^&DB_rIe18r4d%*t|z$e z6mkeW5vy!>B2B^)|FvPUCLkrxl&Gr}B8ycXXNrR-B^7bhQjwo=gcQ715l4&ei~9vbmWzM)aun83gnq(7q1p6A)g>u%m0)`gx0PhTVhu?UFk`s3hdqm7hL-uKv-IB(Xz5WALpU>QdEcKml9PLqSZDrtWiV4ktWKQXqmS9 z+-nahhmKfc5ooy@tL@TuPZH@X;}j!ON-gCSYiWjuj#w7?r@fc;BkD_P5h9*(YZ296 z87tYdNln@MiP{_d?96&-KUN=Idz8o36w98Ztm&37tiMBJEuR)ZTFbisO6(^gl{BLt zDEu{T?IEfZ8gzYYsCF9N`0t8U&d_WiYqld;b?yMdvW^Cw$VZKhnyS4*=v{a?c&^`q zYnWPkC=aa z$#2i1H449(upc*HYlN+pwM}aaHB;$?&wzFd#&DBla2O8q_QduhO|6^FA#}p`*2nHi+Qh|sU~OB}marfRdz6r}=e7@OX3z?0 zHT%v{Dam z9_vNQ-~W)ac*BdvTvM$HE~yQ*hgzy#nATvf!uLbhe7pW+teWOwcd-}l{XN$0&DdMx zF0S39N#7W`AReVAZNEttO4pWD{FWp<0JQSaBHA4*d@p`S?n%WsN?!h%qWUl&%jFyD zwVBq6dMMnR!gnV47GOj75LSo_@sJwKJBe>4tb0K-^(XR99TKy;DIR&Kq}y=MP!Sl< zaIJZJ75vj)F7(s2e`8A1GRC0}Ny|80u-hF{)C%#%8??2_pNoyn)yQ{oertXqmWaQF zy!#gNvwf7JG^6~skd7?0)zda2JOIM(KrTg_8oiWt*ph^2hMbQ6!O-%E&>V@m_eyu# zxB=7w?X9gFdJRm+!e$#5u$}WO^ZT(cY?nU`PnE2kvOd`E)`7|Kiv3{ttf+tSj#FhO 
zhqb}iZz1;liQd3iZMmzjr}`PPuErNFmAk)asn!eL3`e%pknRkA$78Lc#otBjbFh&2 zazB=@%fE-cvvxmPJmhdnyEr)!Er__?g(P;=Fosf!U*qZ7U>e(Rt&J>^y{R4+`nPD| zQ6BP7dh%~z(MR;5WtoOm)C4}Uhnb!s3$d>GIo=CToNU9j+3Ul{_v^cVg*u=Qj{ART7ul5*q@pKDSMMjPq=fC z?u4R-%mM9e@a)6pYXImT;w|^Ti-omz@ZtBN&gp~H1u7l*?F`NKocG}4J~-iB=6)pJ zv37Qb6W*(*KK3lMzCFK=OWgIv{xcm>`}K9PKkH8&8AaXjz0bb&c^nw0g4LbF)?=IW zng83~nE=^Uop=7-_gak*5?bhP?P_&vL8x0BAR&PS5{MmPY=g`eNWeDB5Mvf^I5EaH znRvr^i6@DH*d&g}&fs!lr;JVFIEj;r6DrA6ja@a9itU7PrD7*kk*$(YW#{*MpHH3p z)_yH@3uWfrSKW6z_niOvf8SZ|z2A4g%6csK&gXmB?*zM%W%S`#2RnLn=nb)fI?e&( zQh12IIL11)!M;_F)KX*4h;b$QZd{k;l(drC?L>Nx&S~*#HPnc7dWdLOYHbxQHK|g(vQTPgM z-QR$XkS6G_l>&ssu`#^TrvwlTQS=Gi{b@84g@*tU-v z>X&dkw)o?9ZZ4(gc|G-dCFqTVwUevYQL>hEE6L$B_VgAXg(JO<*3@;b;M`_veIYeI z9WRSB$nOkV;9+8XeTXt$Vk}Z?w2iX2SHtzV5utuC>f%n&oyE_{71|bVK^red8#j@9 z8O&aQAHa7x=KSU%_HLvtt>=uR*k-id)+vhv)YT|Hw!Tdqv$WIM+D6Z%hss|NU*a=b zeHSmna#-!iqB^V&+EeI#(HVNl(dSJ%h|uZ@*eym zeg%KW_uwV+K`amd2Qf{qf@!5-doi^UnYM#!1B|IXLtMF+RBxpxeFzKLFIS$!HuYT9UR!yrm|q<& z_Ea7zt}b>Hsp227qyHvf+aE{b&M{6$mPSI@L7v8BS_#VS=)+ryMe-t1VP3?(_h9jU zVjJu$h!;}*Y_TEW@jvDZ&TMTXqOuQjTy?Iy9tL+XcI;wn3v!mrU6i>TsV|~7W3Y4% zHU50{LFNssYnRu)R(ua{mgg&PE0*DXb~m}HV{732Wd63*$MSQ)HiS>{XR$!O2k(N# z#k0kO#TSe9wQu23@JMxep-r_*%-PgKNA45I&9-8{x|kg1fJr~B@NISOd?9UC`^UNU zne=?{8Hw#Dia)E)t!}9NTlJCJm#P;Ohv^&l7q2dEDE<-MdL1^~ABC|Abax?~Z7W_= ze3IVjQD$tvL=1*sD_$)Ay87CI56t<{zz-|`R{T%wxktgK?$zoj{ioLH#~I_ZvY1!k6Z9^z-ADgPdJh{bBLW>V37354?Bq8-tJ6{(I#G zyj7luiI}ft_M)!|PszA*8LI1ycML{M+6v08U{kWI(8y7cI#OKCTvfflv+~`ds9sTw z6!#C@G&nlvFRQ;>eYU!}`uo+>YX7%-Q}HTV%@-;^z_;P{;`_zv)!U0_E4Nm^TRV5) z^8@dnb71hi6W%@N`<0(7{*D=({oHD}D}F>>tkpsNQJfR0mkYqK7B(HTMraTA+>T^^ z9bWGr?*ztaIgV!Dd%l3?oLT$~cK*LTFf_Pw;F;=A)V^LD9=Nae&EkEPr?G6m4O@I8 zq+DEmy7t$#f3L2uJvs2;;NBCK489i+ioyjL-Q$ z1DodjVBi4$NH-9f#_uOAtDaQ*$AM1`e6YH!x_IESIgbr|r#etvRoQ?a)#KFDXp7FK z8<)hQv)i~AakgZOiFvX8kX!cV+ADk4;mj}Q7WVSiX$A4Z{)zd+ncOX}E1oG1Rqv|a zQvJi?b9iq(QLL_B%gpSH9DRN5E7cQ<-z&aV+gyP`^NHGT{$FXC77OVd|>5=wSK8L5)!+1tLj{ngMNbqS!*vE)= zbWQaI;*h?x_^aZdk;%Jh&tGTu;{8Rs6K+K2UScrH&s@1LYohTR8nT*~IPQ5zB73HB zdhECJ`*C{B^XPkC!+q)A;yHS{$>JYq!=D1pCyGDk9%~bJ_&nH zZegT-C9|&^n15XdyEo!%@m25{W62mGwzQ~eGp;fljpvkF@EO;Jq-87+CUjyqMJ)CJ z(L0?1ihJswA3x13>~{QBe~SH|Li!KD<+-$b;|)5eiIST8YM=$^Vdd;BdR#?gFuWW!#a6?420} z@98O|b2|GIbuW$uy3p<3P_;6buPdVpypePKFUvi8&d>>a(5RPM8?u#AbBh$YgkHAV z*QOor&_B>6q4zy9e|4kvzU@tM)3!cU_kykpFQKb^nnY0esD%n5~63ucmoqMr% zgz{O3EJphJi7EeO967hfi&7blM`O=(mZwgM&vk4ElOpjidRMW9{1a z)4E-n-j12W(dW`zS-qudMB|*Z^**R`PI)%nDNS>V#_~E{RX*N$#q{u*PeRa`Pq~Dd z+>WOZxzt|r7Fj~sn7;kd#x&jcnrq#>-|p(hGzmlF(R5upoA#%_F6V1*=_yWYPUV!X zXWJbyzv*AqE~k8~bAC};yAs=7qb4`EwRY)p$uqH4MrUc=N}LIF?eZ^UXswfHjCI!3oS!I~7muZqThEqLCmwPtpH}-k zJ#gf-InVSIuT!6ru1QyzKIiYz_)T@a`S{GdQHNwN{55=D9U7FLak`hEGUhTfqO_HG zwl_$*cx<{)=Y7^{?NSXbQ#(T6`-By(h_tBHCNt&U=~-DL;%L48kKChfp*W(jRN>_vwLSxi$YioT|s!k{)MardhoiU_y<(yLP>0DlyQ>9wxdW+Me zA)kb@eQu38Nqm7iYy{lvzL;}fd|R%jcr29} zQqO7HZ>2V2{o=h7iN{mUzC%jaw9nC_Wo)q@+{gbo(wxSl&5Jslkcq;dEiAQ) zXldFpn9esyS}XY$u6ykp*(;YC^-R)kmP7eEbt2xF+6wFm^o9_d+vebTy%n^Q_AR@X zIKCGaXLo#MMCI+!5s!rSSnKkV@P*7G3WI$4)8f|m(OSm(o?O1>vb795y$!7WcCuEo z!E(~ZUZm8BAbO(IBZf{!ia_Uo--U1lE>KyXR%R#j(?Wa{J1A|r{L?acCHp7Wt<3y( z`pUZ9R%l=7k>xh)P$O1TByCehTJFbJQ`H`IN?nq(_(j{&$3oWRO?^8F9Den9JF?u) zdIF2p4cP9C^3$qu99zQEux%YB_ZYJxH?hfR=Oy}hukmjl*K19$eOPLXz9-G?I`D<{ zj_AX8LM!4;;MxT`qbF#=e->7@6Cl!NZ!ui@Uk_3(bIeiA4lU}$pxAxuZaG@lg-3!` zboy(UmsX&*$)ts{ZE9`2ZME7!X<4R4{saiO)N$>`x@0d_jAERm2HLg`Az!6#>(!=D zd`XMiF1bzUp)rD7wehpA>bP-OtY!EGN3NmMYMva1 z=a;tZLv{P6y>;!|3@+_()__qPckM4vsk5uC_Xe(txoG0X*_=$ZTb%!SR__Qc(zWxq 
zj}oa8(VH!pioVPm83#qq{C=o@H_O;Qxz5c5_G_oZ)Ee?xN)B4$oQn0(dUDi@KrbLu zYXhkr&ss|J3%<@i8++M=)jFu5+DV3&-cr`aShIIf#@fN#+p9IvmuR=YisSQXj~Bs_ z8nKi%aS=$)q_k6zt}#LMIavxmqY3D}uoNaI`HJxQd}nzhJ&|7yUPfL@qVs5?tZI~? zPU(ZQobt5F536#&OCG(QqpI<`^t*A4(h~FlCEG67)!XYLEDm?kkDNdrw$)*_*Me^x zrWUca1iciu{!U};E3rxBJxaT>w`jLSYm7EH#C0pkSI;1QF{8~na)w=vtxY{~-=0>F zAjh*MSoZCu#8Y8rJ{oum?bgtTCdy=F!ItUW=yCG^RN zA_H~)B7(>gcv(fsj#Ktn1(&8iT%WRqU^4>7^14(v;$QMndb%56)#w9vV?%i%k{$%3 z?PC|)`$2srea#ujcZ9E1zX=b9!yGrx#hKU&uIj*FjDd~=TAmHn>*AQ?IHmVXj1TsG zYN;~N>uM7g;ZITCA#&f072O^9X+DH);!W&5PV9%PV8|E=SCQgYa@_-?dtuCP>py^v zr7^00i|Zc7{_b|RKTPe~^Orc49sQR?6hycP;hU0?Lco(wFEE++Kif<-_FvAeNp#gH7${ zKpcLr2~T+)+}OX_2DHSsA6Nm781w9ZY`wM<(ds+$%audLw<~{-wdRNMQMr^ap8pFz z6Z4C=;IS}>b^JPxeU!3ai>6&kPFIoYO6)3stf&=#j4kq)@d^8Ryazsw#pj~p8P0dY zm&$gevvgajaxe;qes*f6H4tC;Wa#aBb!7~@`m>A8#T8(EH8Ci@M1+NN_$vBAB56E@ z73yi&cJ4%`uR!aJE4LH*UtXD8Tv^O3ZY5sGb;Z3zNSlLi=aS+BXn?(T`$g@KxrF1e zcHz!R94Ev2(Q~1FGQ5aZu>_X>q_Q8+$4?Z$QQU&%?x*-WAMc5q_}cg)5bUR=i0qA6 zL4Tev_df!AmtckZNxt6x1$w_bh_-PH9xZnkTZkWcEv;>FdvKH;=sX=GPV^|Y8OJxV zMeCJ6dm^=Sl=kqg%G-;V^Y#7fi(B!e`B`H7ys5IX_$HFR03G|g%6Rc%TFzhKtMsbk zLi~E}qa{3tuhx0R3iRNO*v7w;s4)M69`8n*7T1x8qH#$hUs}h!NnI07@GA0Xz3RYH zWM>4VM__P$adUA9AAt3hKSJLBTD-aX#=?kBzkqc;>|)3MF{FDU5`T&oR3*yO$B0vN zA6`CJSH4ZQBgNN|{SLI=*}~LIN|SJ$Y);hw&L5x^Y~Nz zcJZ%p@pWRoJW8aQcN5>|S7||?W<)!Ww*1Y?pAom_OX$X}#l6LS#9OHpkD>`1$W0%Q z@{*9GV}T&%jB(5(GM9hR59&3oXTSnv^hJE0{t$_Npty`EIoB4yRQwm($Rk8b87Zzq z&RgkwALDmFJ@Xx~x|UdI&sYCfaW&f?LiQgb7rjiBPBWr}EUnw050s_}O$@}gq;W9b z0GeNfo#!gAEQX5P@MHTUqUL-BAE-0o^%=a*-bLT{XaWqy z9i*T69VGfN{mXaweUH9n8&>8YB$C7qYUrI#qX$B{tfSdh%0^o`i_zdP&(8Mad(`t;R%kdq`&zp{(^U+hY!$OKf>6v6v>wF9F7J}yK9xW9@G=yxQZUm zcyhZb^LlLV51?@y7}aiorKPm?yXpPzq2=#nwsRxnVGX(3_ow@scKILKv9bolnCP5z zJ7r!5o_)+7HZwwR0F`$Llkl{h8eEJH++1h+YEr~((q3%pb}@aF388B_w6C0trk_QN z-G|N_o5;9UdW-AlXEd;vGk)x<&rprC=N+%y%{+VUY9mJH;Lf?%gK{5|J_uvRZ;Lo- zMk+cLURT$#nSM5SD`QI>#xdKpY{<(Tr0NK`iq@(Bt8sRW0H+VQPmM&Dd#!Swv$t(| zO4&OBb<6Kb*fMM z{pr@>r9^!65{~=Q_H|C@+Mk`-dUr0vGmU2hm(L|)>qwEb=6obMHDVJ}JP(fCo6kYvv6 zOmACW=ab2VDl8=34<38VrI%EZ=k#(-*Cae*jW-P$b(B3lyeZ!@ZgUPMnlZNKolXS6*_*6fy@Wt-E5T&$ySyF$my zXFF|Md+lITtH^b0ltSbj&*W2H!Iu^%GF$$I^?H`K=tQk0ZG8VM-V&yqI<4a>^G|uT zPZ|6L$Bno>6Z{3r;7*Izq|+f!>vtk=FSXx0rEd2VZxh>>myU`r-m!PbXZ=HR>O@{b zd&TEZ*33Svz%AXHE#tk?KCc@iLIoojmM*YT8R2S zRV-SH%*C?6@4IW=sI8RN@>-tx_F8*OF_tZ=8}rP2mYMY};mGI9TM1vR^BK{oQ(6ti z7v248y_TJRn|l$QX~m@n!AW&&TF?8PSJ^H1KRL&|75Fm8&1L75(faIV8|x!~@*_IG zk?QyPwWD4Mf7;FZJzv*Dq_7tH5LgS@)229;ooZh$(K7utF0Dt(=M%nkmHg-T<1(e~ zgWsTChE>(J%6_a5{O12g-t_y8X}v;>W+Qvr#-F( zLM_Io$oK);wQ8d;bGG5o!?NBSJyN;TibP!x&k>QyRm5|$Pq7E|3+(#3h;0kjSL@No z)uho5Wm=@`+p!d`wEK1JT|s|i6awRp7~v#p ze)QOyb5OJ8MoV}jL!3ytd$G5=0!wvUs!=V*>GxzxEVh{O`kw#wH8TcO#LMaIoiaT} z6Go-$wlOiZLS0a2XabA1^O4@ka5}--NHJUB)t=gz4Elp?rzX4TwePCyqVYeBWD@vN z?$g86F10q$T4jd(^eox}dt)FtK-nv31$)7_8#H=Ijj_Ly{f%%uMq0UD$d>)&LhxxZ zHXZxZc8HpfMUNUWmmF^oVcoe1nO#7+6O=!Sba%k(MkIa?s0Xo8Jd9Q4#hhINgBKxP z`$wZLt>UbDJQMsy9`; G!&R4pHV9mYv$x>!-DX5?{mj$nOH#QqpbX_!-F9FKfRE z%fNA%Hb%it{3;fLd4lw7;MDJE&(z{bq3x~~eph30KZ%9pJJ?zarft|sz7e@@##-_! 
z_FsVygI+1dI5~{WFT}?EQrNo~JK3{2yNk4XRQc`Y=`BTvFp3jlPV0j#?b~W!3aK5f?SviSxZ-(m?=;T4JIG@>rGg$F;Dzgzv zYpU`Q{R8xw8bzwR@Zfj?YuraG{~I)Z$9o$RzZ|3^aP>6S?Y{#C{d%t9>@`>wKLv9) z(38&tXcw%=U=Nr+*^+c|CGw6)+ayPc*#_QRN-Tf2D{T|xVL25Oh){(}N z&L#JH#<9)0o3qe&&(_>rSCH~Y(213_3ZqsS ze?eU-FJ)Pkk2h~8jA@2GIz(^colgo3ofP zaBb!6iPXaT%x1rtR7&sbW_>1!W%SjXDAlpd`RW96batgjbwoDX0fR%F(d%zp z#fUcEHS0S$yGuj?@vd?S2zH=N#!1re(#Sij;O*S{J*KnjrSx;B)v*szCui6*H|BP$ zrK7QTBlF;F3>4mP8u4x%=_|!acw0r9VHov!f~iGUUOLODoa#un)kg2ly<2jg@61_0 zZv6s`_cjKi5qR4OTBEM%zZOsbQjd^Y?U9t;>ow74;`yci-#)ogzq}7tJN?pxPqCC} z=Bn40+AE6g>BJaMp5s}E9^>Kt2B_oaJhVhOj?)-YWes`f&W(-hy~dj4y_u9lW-Z}MPjfA=9lnesZ{_zj9`jvA zdh43fOkJy#vTuEe%&M|H0GCQRW)-o@7E9DW$L)P(*$ux~idEc|iGO#gK^Ilxlnp;jM zI_*gFx70k+yqHgv8g{fvBbw74`F@sSV;=FA+PArxfo9DPZ0W6+b(v{=NrsKs+!J3| zt2g5FiiBC_LZh@LF?BnH`TEvUraICMU$ozZAXw5{Al?j$OH}Dddu&mDCxo4jXVfy_ zuXRahw8z!fSqt9yCb$Uw`ORFv*6g>k!+WMRm+nW}m)v+w_cTh`H_}E9(B?C~{cc-` zZw~tn{P@y)c*8`kQVFej=Oc2`j+b<8Z|ks#){{Xbe%UcR0sOXiyx$+MTPDb?HsfkT zxzxtqxm;$dNjY`-d~e(0Q_tpNy)gz7zfZ15h28=yutX5GHs$J}ttO+3a$9?(TyEwv zADjM`c9mKkj9}%m9=ii=cKuTN2ohfm#trOim!P$Kw5aKn(F=Rs(r71aJJ1qFJbLr^ zW%(7bxxH>jWXl=mZ1U~tmRqmLduP`65eW66(fZ!6P1BGB#m(}$?sIGpqAB-^pf**aCS;74u5_ZQ*V$JSO_hH zHLZ1q;A?puv69vvbP}Xm-D?x(HB;BVQ@On!mpzeHK%~#k2>8Z8p=ZEati>j1wTt)- z(Kg0uA*axSZRNJ!-gT!}H0{!bWF52YceUtSM%x{xpILymZGutlu}49>3hd$GV~y-% zW*cG5lpG<9d7!P`0BH0Qv{mSlXRL(<{EgP-uFYkPdB=jk@E)?=YCrGjFD6^;{JQqp z&&ih&OpMgwPhTL@%5|FCw3AA=W4*GjWaTV8eYF14&U#V(h+i~zj59`uK2!F{gJ84Q z(I;pgbqajbZzJWgr&~R2gUS0%KPde!^yLwYvK>M9R@7Uw-1=v<1Y1s6*~$Bnao8(^ zj-}4*inz?PQqIWDm$3=US!PD>qZuD=Z=PF*B}80Y?i#Cx4D3%Gy>S zqA}#W1iRSX^n&(2>#*n*v9ndLETeQU&uykg(J6(RqhFM>&+wl$zK*TL_A7(KU^o>l ztJsgYI~y2-w_@|@96>KfCESW{CdQQ*)#OMG3-4*|o*gZmS&h;&k28Ag#A16JEoGQL zEuW2Nvy%V%n`xQc+Ct>A8FMF&y4A!{%J@Oy&21lS&m3`^^qd}nsnfw|459OouV2zW z6^v(qZeLbRvF(VMAQIm{E6jspM7Ow~2T8>S?_P7l{p9d;$#gl6fo*wJKx)q7*HMp|g4?0H(t&@vj zOGzso259>jfU+u2C@07rI$Z5$0)NROyzh|x3> zy3{|!e&p|bC@js}$+6wBW?X6CTskG^?gKg^*QqqE5pI_r_9i(?2 zI+fxaD&lRN4Azri)tQtdjK`fx#C=iLqr5fqsG4((yEt{y??c!Uogb*7w#(eBS(~(! zx6`rNTC0W5SDe*E#7Xx==nT<5&bC?Bx?{;+XrTyvhTy!=9!^yL9BHDZ$7nHI9Cgdr zR%q$bn)Mk=xZ`fj$F$2Ko#H G1OEY!kKwa%m&5UL=i$=?6*Jz~CX33gO`(ka@>R6WKk!5LQjrLuZwaZofjD!=Y}Z~FD? 
z*Y|(VcF#TcoSvPI6tpE|BYT-RcF){Asn)+&7QO%8&-EqJvLyE|?<^a!`He;9s8qxq zsI86r`w#UWJK3_nZ*Tve-G^GXoI81{@9@c%C42Vw_nq0@GPiwB``i|}?alP!|1RG6 z?zdlm`-AiUarwiabbhe($N%-=(g(i&7vD2|$^Av4=R}UQ&6eU;+=^Rq`#5g-Mv3B9 zl!?1rUhhtX0V)R{Eb&Xl=M;#{pz_%Re78%(kBSFrj=JDV+TCMM+yz`Vr~+h$uRh3p zkd^2w$r<;KVhkyl0FS{~qR1S)_DUHh75w!T;ZS*XyHeul|>;v^KQXf;^?(*gm35=11s#q+Y)%% z1Sd=R+#nOdH=Ml`Y^!C3OaXbK09%u`P{HoT@MWSb47HkW7<#vWV9hI6D;% zH%J@kHcN*za%Q~c@y!vsiY%LcL!+f9Dc&ogRlQ7-PFX9P*w@NZSu8!WP-el~2{M~+ z?g3+krJ-|E3g+XlbIW$r0bM2ht(8$wtqD4{f@!U6mxtv6Xg3LJJ|a66@giwQ7R$L> zZi3H^IGl;oy+wS1ZVUrgtK18H=17-xgQAVU_k(E(fA`2q(5^)WmvBuha+$LBGIwlu zV(2^_fimTAFnrDc(wBmA(?PWkJP*hY-k0$*G}|G^RSXyAx<}j(9I*k`DEJH@pT;1e~&DX4f2>A;myPfcF_2A)O0SL0Wt>`{O&(?@a&;&B3dHy#w7;F%8%*TCBzsMU(Jtwvj$ zz;hPq+X@|jP2L9mcw}_Cj8K0e-+19EMnkz$9UqO|D&siB8kfg>BTn|YT5MjYtOU&} z-WP#wHW(iS$r&_w9dvs~{!DJjtMVrxT*1|gWG48$JhBtlTOU;AvE3l_VpkNV$qqKJ z9ej&HvJHKBP`2>ekMunu*VvzyFUs%9U(0Lg&qL@&4_9`>`NpiV*ojG3!fHg|n^hwV z!}(iowLWflZjR3d-xMgaRpq}A**gHX^Ky>Y6<*intMV@UOX%-nv}PYN+6@hBw6)wZ zBVp$RF49*GmB%2hjo9}lj?1lP`0A^Tx*+&PU}SM@K>Q z6q0uiPS1tP*X2pR>!HnB>#t^e8KE^vLc3lSu{buFJ4>bFa2vsVs-d;wAsSlofEb0svJ zPM;R@yDt8A@Tm@sNhXfgc_BaK3|K`dRayyPb8v3n9 zPc|w~*YbA@pYP+$2DLmJl$IT;hk^DwdB>(e4vhp>s(r3f3p1AArF}y0raoHQb2hV% zW+=E2s;vUoW<|JxqxBZ)CiM!|gMTix8;M=>^28NF4>Wuwb;PL98CMx7gBf6`;cng$ zI2WJs5pOts@k5Bj!5^>w}a&?^;PH#)r#i_we()~WZB$U&ItCwOX!gON9> zcEz(CPO5GcYGGWp^;KBVIp~?ewhUAY)iSK+(_*FMQuQF3l@1juLAm}x+)Hrn7;Hwk zDjnNA>oW%2O$@x4m~&50y^Im~pflAwm(Uet+fo^eXGNT&7jSeQVKSsWfPT(^A+I&<@ExcBXz<3QS`xhO^&5{{&`2>>jl@O zRL?z&C#eVvvwN9drEf>=YDVJ;>Z{J@y^WEt1AGh6n1$G}5jsz48YCv@Sli(#7IR&h z_Nc_3b#GzTo3;dwjyRR7Z;wGa5l(kIhX*vg72{H`mfCjDuBZ zgvpP5*P%s*f^pGAi0af^GqdqY;F=E3i5!~<(J=KrLM?O#;D%>4!Lj!ZaeG;Qngo=HnL{?wI?`5noIY2F(-K~y z);G`s-=a-Lw7YYwKB0v8u`xUmjp^=3gl=f7A!&(-&+uqmD#e{5*ayKPEOo99a+oFxpATxdj56aeEknI4f5p%<Yt`E=D1Xw)yVB%jv{IT?A5X60;{t`~)CUw&*Acd&?RxO$$y{vo`7J}Sj!#mV8IYuC9Po5eRwMo68_ zsY%*wuacXrGK3LziPp%S0Vymu$zsj!b&yl&YeuSB^G<@UMRPUEb=AB=Gk{Wh#X$E= z*wyItDw~r-iNWP|p!<5d+8xMPsY+d$G|N=nvh}G9*gx|F^l&rNa zcfjn>f$;@dj!Bz6HA1u8O$N{?n@w&q30h>LG%CmaK{Im_5x(GD(n^IBNoS$>;ubS$ z>paY=U_H1RK{sBea6C@Na?~uNWfGr9Np((cT`p~$SjJ*{{Wso+!fEH)jLdS)BCmy7 zO(2;7l4iL_=1P~$mmbZIm?v{Ir@SPOR;DE-$jotVrI2s4BrajSlzFTa(di?GKaEduSEM; z0|yO9CuzQ8tLDMB%Vf^20ADva=YwuC-`8n2P_~AJEBZBjU#%IZ<$65KR=AX?>{M!{ z#I0gzD%txaZvss|*^%IV26OV-E}^0Yk4(Mb5)uK5TJe4EJOLXc4slN1dXt=% zi?SSiv!V0w92syP%piW13Cy$K} zy{urg9|dQpApd<(>rHu0UXag#tY1#R&uel;j)BqKw@Y4^AIM2rMK8Lcc&S>ZP=?D? zPmMk_qfuj_(`4zCrD~Iw!I^o;Uo$9o$QIf&bsV*dFIQL6nP&;@b2lh%fTvTd01ngo zm*lJ9d={EsKn@?K$7`VEH|6{Ac#^y;uh4!QSkiXZSUrQnN*e1OC~ij-`Wz3})796b)!KI!F-C*=%1 zcnwM}gTH?d*6AEISTD+ra+)eYHya&#fY(l_e~x?X<9>(vc9{CW8ybR119y>QF3 znPn+k*Jt#%1NlA2)vwEMYKHvd+{sAF5qiH<&dO7A2nu#$5vsGSXlUcTSS>^=zkQM( ztwbN5;OIrrJPY-n1Mm0X=lf9g1-QId@BNni5v^`TioQgv!>#_8MpbjBVqpE%_94$E%E{wy-{m^_4p)Mx0Jla-xi@`3J!UT=Y57c#bz_cMIHL~oyj z=dW?!PeOrS@a%?D-;#fmKc&q%eES=mF+R4Dk_z}W3VoSMi!0>-NRDBnE^^Ifq^u z*UzZVd>ZtZxb9&%RIB_=!rdM`h@G0p{Rr}4>b~~Mhw=wV;a|zmXM5_;r@woq%q~dA|(p-$0jt47wl7-+=O`XvK5L<5jI` zFf}X2ZX7_aYOQS_HCOxaT*JoukOYIVkWl{JqMzHE8%c+SmoPTad4M z&GydqmRjA@7tp=0$#;;RAIe|CmmkRAAxoF&!40n6jeMP=-N(V# zqcgf7f2;}lTf{flXt_`8GA={2PjUZiP;)xkdXVGe&~62k-v#w1KzHM#=Zsxp+3T$AHu$vEdFXNQYF`qME+{D0xz3>uKJOk^!E%LJr92ms9iV)g)T7y zzl@Y!Q91k$TKr{2bzVojOZ+u;EW2pCQmb)Gb=;_gTTR%QCA@Z_E33d^Jhm?Su~DlF zOzq2heBV|^;%X?`rnQp!{OO>gHU-_cf#@O#Pr%{Bw0{z$C!o_6aGV0;o6zuWB=Q6Z zpXc0O`msyf+)t}h;DPZ)P3EzwyJ%o<#&4O%H?y(7t?HAG(X8|Z@V~_7D$jx1<-E^V z?>~305Y*HrwDNc_{5TAjqgW3kgQubBIq*H9w7h{FzKGmi2HSPDE2bw0(B#G3zuCqd z>S=W(Et$FBc<46@-k8|K2=!8`L29aqnzaVR%u{FcsZxE_D)m*Jym3#XKb`m*Mq-Y! 
zA4K;172jEvNHZc|1*x%jC&6Vb%OhC0Rg8*@_&XL#l&F6YR9Tz$$01M6Hb=Qi^M31S zWw_=~oBa72K8@jXn^rwF=m_WKjkdWO6PUqBX{teu1Req1UiLne$~MHGLSFE!i?6O z<%DED>mwLHN;HBo6-nxav&WFVe#W=m?7isOE_i-gE8+HWv=>kF0CIYq^T*-y4v?(W zcuW|hDp9_fIp741fz;F5OyslcXL`Y8xZBI?4EppS->u>JHI6H7til7wIGsb9dGieD+o7~F zbJT8nHy`ZA*Ejvz0m2!`$V82l7`^xMjK4{Zmo;ild>3u@qmO&_wHy2XFzuh^Xa|(s z&vz?8cL8pn=B%-5#y`Fn3Iw*x^tT#ZjZkF`dOAa~)qw6SeQbw2)A?lhbcw$9LgA&z zK^NDLRXgHmM2kc1a5a|U(ZebFTf@h5TH(8${Um4(@_vxf=^}bzD$I<%4dcp6XmJm-zia1eX<5ZCn}PaT|_Z>tVlhi? zhN)uR%xfF(%Q-s^em{?0F@BTrA|B(r1xU52O*J#{@+f|0?6a{chPS2~@jdMMVtU~_ z?>YbD9CFO8c^miGuCa)8dtjy&fL$@Z0=iSIg*Gy^~bRq5=4B0s_qo%dekRaq?%{yn5JoM-DDjV zN^M?$JW`o|wZpZQ3BgP|seCt8P*b?4iJF>P?y)+O)v2$SjA|O3JhAz}b4w~6Ycy(Y ze1_Tj57!xGq1D~-S{v8K6Fe*L#xk{9up3>SIa9uK}+zX?XDg9OO zETb}ws=3j8!znm=ny_({L=?Gp?K(zKlajRcAdX}{MQQL=x3J3GwQB-SR7a9;1#a-Q zkFc&T{uBexqLe7ksYP! zRVV#he%4t&?H+oqi<2S$Zmi}n5v{7axABxCr@)%?!B(>~pIeu(`fzL9D5S01$;EJ^ z>ezHmnLnhe$&Y$-@?=ybdOYhDIb~q{UT5nYotC-Tankrt+*96 zx40Fz;#S;>TX8FH#cgPF$9P08cu(5ZVkh=Y7|D2USt7%uaWmpRu8ni~Hl}ZOUt^qJ z6hRMi1xqyAn2s+e2CTryJeP>&q-D<3-6&#|-rS7drDO7b^c1mVw0e-xxOqB`=SA^0 zH?|KPD@cC|YhN;JDZHr0BJ4?a|C|^tT>BIwKNhh|L>ZfI6tzoN!$_XI?d&yc$M)Kf z{`*4Gu5Wk|MmJAnA&fAG{|&*jIsE0C-Qr0)&J<*-mc-7+>nRdB_~K7tbE2qUQYF!k zo`RULYa@QFHYu}rp0-$OulF?HyHqJ6(OQ*juk>S2gO&_wNo1|GI`$b&>1XzOc7-MU ztrt-yCBd`RB&t10iWiZuI-?N+vH$FG3CS?0TkW5eHJ?+0~Wc zza519PibyS9A8qOgjudxBSC-t=y`habxAiSjW^H1cO#wi?AldIQ3mU=o~{iC&+co| z%8yc|QSjW*l*&M;*+I39$<`>d7Q~63Tvw)P78TMW|Igl)2UmGr_jA9uYk88MR?sdXfk12mArNBUgg{8F0!b{w*nru* zun25JYcMv(24dnRINl9*?buG-PBJ9!IO%xmI9}pRQa8<{Z8}ZUxJ|co`cFKQ>F@l$ z`@Z+R7sz7LPN#!3+Md4m-E+@5_uTv4`}>_!DdkeY42D{dA&+OXj&IxsiF3-fIf`8b zK{l;fBBOZH9;fXb?<6BV@|ukNq)r->E|+Khh16P{t%Pk>Q2nW>46SgW+N_YtG7a+P zd*>qJ`ES_P3klh5)FJYOG>c#@`jnf-`%2G7mL5j}z6|9_ZpBDesfe-HK`WnocJK0f z669UPWVCcL|8BWG$@DY%9NsC1c*Rnl+becxoJ^5=axzjUERvCV&!qWW$ai12YK695 z$=VAMNTX-Zxjf-6B8Iw%=c2XvsW?Q%8?BT{uy(-CcE;nP&S4>jEmJWRb~-aRLj^CC$m7YSyA82C%!lz7D=0In1U z5mRuri*cwYLLkc1;c&(|tvfS>ym`gV;#-Wi7NYGn@`yYnpON$OguEapV8=dasuMaK zfh&6PTSAV>EyU+4+M$xXo#bh80ool+9@Px8ZsNGQUApo5bo6*KYfmg-7UBs~25pWc z&mx+(9&#*RIqYLG2FWtoOcdY~@~oU8W1)*YkRI4^qO?J_wfNb99?XJPAA^Jg_^pN! zFJKvv3219Mu4^GmZ!x4Cg?BH5WOImTUksb_-KnqQ1e?PqPaoD{bV`T*6Oq?@bHDuop@A!gLnQbP^{6zi? 
zQamc3g``91`EJO1KV*9vcRGeEH{h}>@cO~r}BOICwUjT zI!N62K77}oe=aBG4!oxFmE)a(U*Q?MtC1k*^oi)nQMniI*^75*_VIqmcpGeSH}0kt zN*2RU6iK)qmN^C~`sEaJLGOly4e+r5Mt%b0VLJNqiu_RChg8N~l7E(uA?rWNtFY;# z@*MN(ABA`St$YzbPa`fg;tzvya_Kq zhu-gn?8|XQJH|;rep6iX5M-Z-{x!k}SI8d7wi;t#5xn9qc|cyq9S6~e!*WUf2tIii zJg13OPln_{c+=-_|1(61)(`_1@y(OM>Qyn$YPaCJ^TdR&WS;*SkZ=PmaU3H+89$d` zJwAlbv4|tHpo@*H^P=e2PW0$Lv^*QXZ^sDg!GEXW)gieD{eKqTGlU*h zu>#6?#@b!76F%KR3~Pe*Y{F^WQui5f3cTQH`21}cy94BCtz)FygAs5UcKE7%Am4%4 z?Lmu2p_Rv>%>llzm62vF`d&m_=|Oz%Kzobv^B(A_AAKB#pS~*J#CUi|E~5V(WJXSe zzYNK1u*wX0^(d~43W>M#Ka03*x^A@iIGI61kZLtFH^>SR_hCf;6}JHc@h@Ns)`ahK=d33m`Pyax7LMf~Iu+~Y0zxr9x#={B=X#Jr6= z?}Kg>y{Xyp6_wUPM0geCRrJ_oyjvr7A=hJTVbfQjwQs_Yw?Y3!rr5j%slEWIUxQ^H zf@Vg;(&cz}9qW)qh~xHSg~EtjH)HHR0(ovAx~~^s8_`eY5q<~q6`4kpZ(7Ve^D){R zhK1Vb|3j?r)5=`*6Diwh=xqwL*oNPAWwjPk&BaxZLdqfZ=oR@0-h2_dP)zmh_+Nsn zSH+%qwvd_Y@w1&6J<9ypp|O0TgJ8*;2l|%wisTeD`3SxW|K7==aW}8y#eop%0o-hSt-PFs){h$#o7p_n0^3KjmBS%ANF6tya>>3KIR$ z$6RP(8rBUu7VG$wrfWLQLtjNLDVkPmSJZMHpm;}RWoutX(zZpAv<7jehFr;P;(?Qu z{6?Tf4$x_-ohYWyAct)!ptyD=6y5C*=fEd|AMi2gq5!LV`bykIjAQ`Mls-u zomMRQNa{fm^@WxrpexvH`n^{2sm9-9IG*b033>3z4zd}>~L=61aDU5S}IjE_eUUEar<UhYct?Tn& zGbJq~pS=niZDHMuFaac$p#?^`iH^^miySp5>>oFm>ds6Ll1#aK8^Z1bRvf1Ajz z*ay46CqI#(`2>;XBSgaoVBaC=ZYiqE3XidfG<_*76x}JN)8)vNWa=gImd`#i;eZ7+LFc(as4nv$m0O)y-LbGbA~HdGmF7 z1D}74hfQL?YRFm2RoPhLe08-n1=@^o&me~*w1d$u2zitd zv>5BExmeR&fF_lHqby;qI8q0xE3JLAxr?Fi-h#I7hJ1<>S4{f~v^W3>HzQWmR;oJAK~1pwU1WP2RA~B&5->Fd2w2U>sE3S z4j>+FgWjHn(tLkW6*7GClttgUW^ zb#H=3S|C#mxl)bLqC>>AJA34F-$0+enjyyljLN-ecRioimT;xni&t)D<({?VrZkf8 zc{8s03}(;sSd%#BPIz1!SGSX)^S#i{GU##~EHww#o`x@7Gwy_a?t+{ru+k{NY;+mx zjWL)P_P`=7d}!-5%VYXc##K{~g=I%-dda@r^E4ODOWA0jV6jpOy-h=R(DY@p{`($V~#%seaw+1ChqKa+ME|9VAf(Ku~{n3%ka8h=%x?4{5te{7=Hh{ zDRo|Q#v;ylId7Wjm?@tkuYV?)%S~9*d<9YDhwznJ?!CJH=Q1H08FR{@EA_8FXzv#I zZyO|=f_C(|wF~b%3y*Av^vZ`D#>({x^rjVBdQVoEPNOxE&R`|g>3rM#Dg0PeNP7^n7a&SLh}H2?Tw!3@w@gjoCG%}a zT_LMYm$@jlXe4av%nI{e`4FGGsm)pFX)Sz8YiH?N{{Z~=GJLv?dsO8%J}1|x5nSJe zF*6Z;UPaE;B6!>`a%dN^uH;_E>p{qW8~pA#aXL6a|GwtmAX zOTF40fjnPBTyHTkGfcixC-V&)WW_KYfz4dkZiB@}FndNd#>8ygEp97oy8DUQ7-Jpy ze=RgwN$%zb@_LlbKMPXchgnS{!9M)dD0B~6=wOx<-I=OjE2Zd3FUIT)jDn{y{?8$% z55VICCSm?VLgqEhVsmi64vg~2%+W9lGSDWDOmy2k) zvmA%STQOIhWCno4+%M2o%P!1=r^sWAlWCF9`8|PNjpq1svjL4k?{0!OJ`Z_H&3vrt z?}YqkF$UJ*$`-t5BgUWBa?_u45YLP88Re=ui!+Aqiu$YAxXisc*Zv`9mQg8H8(KAL zkozV>+wK8a<4{>rgcUTwW@AdEpot5WfhT6y{fNn@u z1-(^pk3G*a=#t3#sP9C=`0~*O?M`iwCjuk5Y8(Y^j^#683G<<3`F6G4UGO;D2yg|J z;2N)xwK2=#WttP9lkWR+X^AGZAKAWd4 zF8eN3LnUN64I!R9xHYL$Czf%>|6+1Ob;rM*tFK1x%}(b$I|^6SL%t~6ha&^R9Ani0OC`lfYvAPO)XTKq@s!EtUenFo_0$xR?7(FV)^XV*m_7>x*Q37m< z+xyRM9N&p`z^h0Tyef~zug{Q=M`kkkDZeYz#qs4ubTZC0jCZPQ(oQF?UL(xir49P{ zb&_Gt-?fUepX2KIxF0FTRiRG2SvOADQ1HQ|ek!MShm0TtnUW{}~*0IwI)*E`!GW-gmS!Yu76zrxW*4RFTU~c?Ly2==u_Ouv8M~UeAX?n=ZoTDaI-9) zC(QlUfVb1?qOG%h6_!KXWy~TjJ7P0gg^7}OcSc=Ab!v|=Z1*ICd{?sGb&z{*VGjuy zu=`6ck~rqu8FQr$dQ}QSz8#5x%}E-x`Bof{GZA4Xq@ak0``3MJW3x90JgtN*vL$M9 z=phenklZo$W6a&1x(KBV+WpBW{|;LV1#E9pm6b!(Z^YYUP3{SXxL;+wz3dRPCkMR@ zC_&$DWWeqjhkXcixAkCJWeK-lc*NUP61yi>!mO)N?%q1yuAi~`H@@A!fLBXCWcPkT z9)d61G6ak-|4uUhP$;7^NGcnUA8~9fTf$#WMrFz6Ib!CkL5~e0X%$;iS+tTq=451B za(p?LLcW|bfwU}8#+QY}-D!!~z67j~iB}~hEdCtt$a)pUPX&GZzJWB( zLfG;^0={fja)sR!leVuE@M`oVeaG>308@LtQSR4A?VfGGwlk!0$E5cZoNa-7^=*tMrpX z^hFp^92;RIE$^D~jgu5t$?-fhWVt==h;}SPC1A&V7HbnI=BJn(5GQSCFSYaPW_`_r z2Zw$9BxCIp_M%6MrDeQY=IML5nfvttkA)Pn3?kPHqx{S!x1xf%OABno4S4>UxYs%Od_Pwnd`^A@_5h@y7%gsdf!rz>%L&goorZ6FyOPgnDrwK7j?@cVe_ zltE?<-YZ*VKICh)*yt$!S4dVxp=YObcB31=!_;ua!gLi=jFT{lmY1=r=u3%%bC%)NSZO0N3!Nf%r}3!-si3#vUnPv z&CwsTxnSK)lG&^v6ZLqI=8C5^{sxQeM^XXoz0xw 
[base85-encoded GIT binary patch data for the new NIfTI test fixture omitted]

literal 0
HcmV?d00001

diff --git a/tests/features/test_nifti.py b/tests/features/test_nifti.py
new file mode 100644
index 00000000000..077a7519431
--- /dev/null
+++ b/tests/features/test_nifti.py
@@ -0,0 +1,91 @@
+## taken from:
https://github.com/yarikoptic/nitest-balls1/blob/2cd07d86e2cc2d3c612d5d4d659daccd7a58f126/NIFTI/T1.nii.gz + +from pathlib import Path + +import pytest + +from datasets import Dataset, Features, Nifti +from src.datasets.features.nifti import encode_nibabel_image + +from ..utils import require_nibabel + + +@require_nibabel +@pytest.mark.parametrize("nifti_file", ["test_nifti.nii", "test_nifti.nii.gz"]) +@pytest.mark.parametrize( + "build_example", + [ + lambda nifti_path: nifti_path, + lambda nifti_path: Path(nifti_path), + lambda nifti_path: open(nifti_path, "rb").read(), + lambda nifti_path: {"path": nifti_path}, + lambda nifti_path: {"path": nifti_path, "bytes": None}, + lambda nifti_path: {"path": nifti_path, "bytes": open(nifti_path, "rb").read()}, + lambda nifti_path: {"path": None, "bytes": open(nifti_path, "rb").read()}, + lambda nifti_path: {"bytes": open(nifti_path, "rb").read()}, + ], +) +def test_nifti_feature_encode_example(shared_datadir, nifti_file, build_example): + import nibabel + + nifti_path = str(shared_datadir / nifti_file) + nifti = Nifti() + encoded_example = nifti.encode_example(build_example(nifti_path)) + assert isinstance(encoded_example, dict) + assert encoded_example.keys() == {"bytes", "path"} + assert encoded_example["bytes"] is not None or encoded_example["path"] is not None + decoded_example = nifti.decode_example(encoded_example) + assert isinstance(decoded_example, nibabel.nifti1.Nifti1Image) + + +@require_nibabel +@pytest.mark.parametrize("nifti_file", ["test_nifti.nii", "test_nifti.nii.gz"]) +def test_dataset_with_nifti_feature(shared_datadir, nifti_file): + import nibabel + + nifti_path = str(shared_datadir / nifti_file) + data = {"nifti": [nifti_path]} + features = Features({"nifti": Nifti()}) + dset = Dataset.from_dict(data, features=features) + item = dset[0] + assert item.keys() == {"nifti"} + assert isinstance(item["nifti"], nibabel.nifti1.Nifti1Image) + batch = dset[:1] + assert len(batch) == 1 + assert batch.keys() == {"nifti"} + assert isinstance(batch["nifti"], list) and all( + isinstance(item, nibabel.nifti1.Nifti1Image) for item in batch["nifti"] + ) + column = dset["nifti"] + assert len(column) == 1 + assert all(isinstance(item, nibabel.nifti1.Nifti1Image) for item in column) + + # from bytes + with open(nifti_path, "rb") as f: + data = {"nifti": [f.read()]} + dset = Dataset.from_dict(data, features=features) + item = dset[0] + assert item.keys() == {"nifti"} + assert isinstance(item["nifti"], nibabel.nifti1.Nifti1Image) + + +@require_nibabel +def test_encode_nibabel_image(shared_datadir): + import nibabel + + nifti_path = str(shared_datadir / "test_nifti.nii") + img = nibabel.load(nifti_path) + encoded_example = encode_nibabel_image(img) + nifti = Nifti() + assert isinstance(encoded_example, dict) + assert encoded_example.keys() == {"bytes", "path"} + assert encoded_example["path"] is not None and encoded_example["bytes"] is None + decoded_example = nifti.decode_example(encoded_example) + assert isinstance(decoded_example, nibabel.nifti1.Nifti1Image) + + # test bytes only + img.file_map = None + encoded_example_bytes = encode_nibabel_image(img) + assert isinstance(encoded_example_bytes, dict) + assert encoded_example_bytes["bytes"] is not None and encoded_example_bytes["path"] is None + # this cannot be converted back from bytes (yet) diff --git a/tests/utils.py b/tests/utils.py index 166bd4789c2..b796641a290 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -209,6 +209,18 @@ def require_pdfplumber(test_case): return test_case +def 
require_nibabel(test_case): + """ + Decorator marking a test that requires nibabel. + + These tests are skipped when nibabel isn't installed. + + """ + if not config.NIBABEL_AVAILABLE: + test_case = unittest.skip("test requires nibabel")(test_case) + return test_case + + def require_transformers(test_case): """ Decorator marking a test that requires transformers. From a7600ac361a75fc1f993a617ba46039237ee0e5f Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Fri, 24 Oct 2025 16:04:35 +0200 Subject: [PATCH 21/52] Fix random seed on shuffle and interleave_datasets (#7823) * WIP: shuffle working, interleave_ds not yet * remove debug statements * add test * update test * use recursive overwriting of generator seeds * update test description * remove debugging strings * return instances of baseexiterable instead of modifying inplace * add test to make sure multiple iterations over data are deterministic --- src/datasets/iterable_dataset.py | 47 +++++++++++++++++++++ tests/test_iterable_dataset.py | 71 ++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 7f53bc7372a..9ac842d2c22 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -169,6 +169,19 @@ def _convert_to_arrow( yield new_key, pa.Table.from_pylist(cast_to_python_objects(examples, only_1d_for_numpy=True)) +def shift_ex_examples_rngs(ex_iterable: "_BaseExamplesIterable", value: int) -> "_BaseExamplesIterable": + """We need to go through the ex_iterables recursively, create a new seed and return a new iterable, then set it to the containing ex_iterable.""" + + def set_seed_recursively(ex_iterable): + if hasattr(ex_iterable, "shift_rngs"): + ex_iterable = ex_iterable.shift_rngs(value) + if hasattr(ex_iterable, "ex_iterable"): + ex_iterable.ex_iterable = set_seed_recursively(ex_iterable.ex_iterable) + return ex_iterable + + return set_seed_recursively(ex_iterable) + + class _BaseExamplesIterable: """Base class for the examples iterable used by an IterableDataset""" @@ -283,6 +296,14 @@ def __init__( super().__init__(generate_examples_fn, kwargs) self.generator = deepcopy(generator) + def shift_rngs(self, value: int) -> "_BaseExamplesIterable": + new_seed = self.generator.bit_generator.state["state"]["state"] + value + return ShuffledDataSourcesExamplesIterable( + self.generate_examples_fn, + self.kwargs, + np.random.default_rng(seed=new_seed), + ) + def _init_state_dict(self) -> dict: self._state_dict = {"shard_idx": 0, "shard_example_idx": 0, "type": self.__class__.__name__} return self._state_dict @@ -390,6 +411,14 @@ def __init__( super().__init__(generate_tables_fn, kwargs) self.generator = deepcopy(generator) + def shift_rngs(self, value: int) -> "_BaseExamplesIterable": + new_seed = self.generator.bit_generator.state["state"]["state"] + value + return ShuffledDataSourcesArrowExamplesIterable( + self.generate_examples_fn, + self.kwargs, + np.random.default_rng(seed=new_seed), + ) + def _init_state_dict(self) -> dict: self._state_dict = {"shard_idx": 0, "shard_example_idx": 0, "type": self.__class__.__name__} return self._state_dict @@ -1031,6 +1060,15 @@ def __init__( self.generator = deepcopy(generator) self.probabilities = probabilities + def shift_rngs(self, value: int) -> "_BaseExamplesIterable": + new_seed = self.generator.bit_generator.state["state"]["state"] + value + return RandomlyCyclingMultiSourcesExamplesIterable( + 
ex_iterables=self.ex_iterables, + generator=np.random.default_rng(seed=new_seed), + probabilities=self.probabilities, + stopping_strategy=self.stopping_strategy, + ) + @property def is_typed(self): return self.ex_iterables[0].is_typed @@ -1628,6 +1666,14 @@ def __init__(self, ex_iterable: _BaseExamplesIterable, buffer_size: int, generat self.buffer_size = buffer_size self.generator = generator + def shift_rngs(self, value: int) -> "_BaseExamplesIterable": + new_seed = self.generator.bit_generator.state["state"]["state"] + value + return BufferShuffledExamplesIterable( + ex_iterable=self.ex_iterable, + buffer_size=self.buffer_size, + generator=np.random.default_rng(seed=new_seed), + ) + @property def is_typed(self): return self.ex_iterable.is_typed @@ -2372,6 +2418,7 @@ def _iter_pytorch(self): ex_iterable = ex_iterable.shard_data_sources( num_shards=worker_info.num_workers, index=worker_info.id, contiguous=False ) + ex_iterable = shift_ex_examples_rngs(ex_iterable=ex_iterable, value=worker_info.id) self._state_dict = { "examples_iterable": ex_iterable._init_state_dict(), "epoch": self.epoch, diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py index 1bca866bdf8..583f5dab51a 100644 --- a/tests/test_iterable_dataset.py +++ b/tests/test_iterable_dataset.py @@ -1553,6 +1553,77 @@ def test_iterable_dataset_from_hub_torch_dataloader_parallel(num_workers, tmp_pa assert len(result) == 10 +@require_torch +def test_iterable_dataset_shuffle_with_multiple_workers_different_rng(): + # GH 7567 + from torch.utils.data import DataLoader, get_worker_info + + def gen(shard): + worker_info = get_worker_info() + for i in range(100): + yield {"value": i, "worker_id": worker_info.id} + + num_workers = 20 + ds = IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers))}) + ds = ds.shuffle(buffer_size=100, seed=1234) + dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers) + + result = list(dataloader) + for single_chunk in [result[x : x + num_workers] for x in range(0, len(result), num_workers)]: + values = [item["value"] for item in single_chunk] + # This will fail with the chance 1/100 ** 20! 
+ assert len(set(values)) != 1, "Make sure not all values are identical" + + +@require_torch +def test_iterable_dataset_interleave_dataset_with_multiple_workers(): + # GH 7567 + from torch.utils.data import DataLoader + + def gen(shard, value): + for i in range(100): + yield {"value": value} + + num_workers = 20 + ds = [ + IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i}) + for i in range(10) + ] + ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234) + dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers) + + result = list(dataloader) + for single_chunk in [result[x : x + num_workers] for x in range(0, len(result), num_workers)]: + values = [item["value"] for item in single_chunk] + assert len(set(values)) != 1, "Make sure not all values are identical" + + +@require_torch +def test_iterable_dataset_interleave_dataset_deterministic_across_iterations(): + # GH 7567 + from torch.utils.data import DataLoader + + def gen(shard, value): + for i in range(50): + yield {"value": value, "id": i} + + num_workers = 10 + ds = [ + IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i}) + for i in range(5) + ] + ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234) + dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers) + + # First iteration + first_result = list(dataloader) + + # Second iteration + second_result = list(dataloader) + + assert first_result == second_result, "Results should be identical across iterations when using same seed" + + @pytest.mark.parametrize("batch_size", [4, 5]) @pytest.mark.parametrize("drop_last_batch", [False, True]) def test_iterable_dataset_iter_batch(batch_size, drop_last_batch): From 6d985d9789d8418d27fa3c5e21dcda21d1ec091a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 24 Oct 2025 16:46:28 +0200 Subject: [PATCH 22/52] fix ci compressionfs (#7830) * fix ci compressionfs * again * style --- tests/test_filesystem.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/tests/test_filesystem.py b/tests/test_filesystem.py index aef0dfc2a89..63f627b72cc 100644 --- a/tests/test_filesystem.py +++ b/tests/test_filesystem.py @@ -1,9 +1,7 @@ -import importlib import os import fsspec import pytest -from fsspec import register_implementation from fsspec.core import url_to_fs from fsspec.registry import _registry as _fsspec_registry @@ -44,7 +42,6 @@ def test_compression_filesystems(compression_fs_class, gz_file, bz2_file, lz4_fi reason += require_zstandard.kwargs["reason"] pytest.skip(reason) fs = fsspec.filesystem(compression_fs_class.protocol, fo=input_path) - assert isinstance(fs, compression_fs_class) expected_filename = os.path.basename(input_path) expected_filename = expected_filename[: expected_filename.rindex(".")] assert fs.glob("*") == [expected_filename] @@ -61,21 +58,3 @@ def test_fs_isfile(protocol, zip_jsonl_path, jsonl_gz_path): fs, *_ = url_to_fs(path) assert fs.isfile(member_file_path) assert not fs.isfile("non_existing_" + member_file_path) - - -def test_fs_overwrites(): - protocol = "bz2" - - # Import module - import datasets.filesystems - - # Overwrite protocol and reload - register_implementation(protocol, None, clobber=True) - with pytest.warns(UserWarning) as warning_info: - importlib.reload(datasets.filesystems) - - assert len(warning_info) == 1 - assert ( - str(warning_info[0].message) - == f"A filesystem protocol was 
already set for {protocol} and will be overwritten." - ) From f7c8e46eca8b56183ad638d10cbb972754452817 Mon Sep 17 00:00:00 2001 From: Samuel Huang Date: Mon, 27 Oct 2025 13:08:24 -0400 Subject: [PATCH 23/52] fix: better args passthrough for `_batch_setitems()` (#7817) * update signature for _batch_setitems * arguments passthrough --- src/datasets/utils/_dill.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/datasets/utils/_dill.py b/src/datasets/utils/_dill.py index fad95f7edf5..f3a4baba681 100644 --- a/src/datasets/utils/_dill.py +++ b/src/datasets/utils/_dill.py @@ -69,9 +69,7 @@ def save(self, obj, save_persistent_id=True): obj = getattr(obj, "_torchdynamo_orig_callable", obj) dill.Pickler.save(self, obj, save_persistent_id=save_persistent_id) - def _batch_setitems(self, items): - if self._legacy_no_dict_keys_sorting: - return super()._batch_setitems(items) + def _batch_setitems(self, items, *args, **kwargs): # Ignore the order of keys in a dict try: # Faster, but fails for unorderable elements @@ -80,7 +78,7 @@ def _batch_setitems(self, items): from datasets.fingerprint import Hasher items = sorted(items, key=lambda x: Hasher.hash(x[0])) - dill.Pickler._batch_setitems(self, items) + return super()._batch_setitems(items, *args, **kwargs) def memoize(self, obj): # Don't memoize strings since two identical strings can have different Python ids From 627ed2ee5ef68d858f392550efa9330d1f53c4b6 Mon Sep 17 00:00:00 2001 From: Arthur Testard <110672812+art-test-stack@users.noreply.github.com> Date: Tue, 28 Oct 2025 16:57:32 +0100 Subject: [PATCH 24/52] Fix: Properly render [!TIP] block in stream.shuffle documentation (#7833) Adressing issue 7832 --- docs/source/stream.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/stream.mdx b/docs/source/stream.mdx index aa72faff50b..3abfbdbc3cc 100644 --- a/docs/source/stream.mdx +++ b/docs/source/stream.mdx @@ -114,6 +114,7 @@ The `buffer_size` argument controls the size of the buffer to randomly sample ex ``` > [!TIP] +> > [`IterableDataset.shuffle`] will also shuffle the order of the shards if the dataset is sharded into multiple files. ## Reshuffle From 9e5b0e6f1e2f16a304860a4dbb3bd59c1cc5b366 Mon Sep 17 00:00:00 2001 From: Arjun Jagdale <142811259+ArjunJagdale@users.noreply.github.com> Date: Tue, 28 Oct 2025 21:40:42 +0530 Subject: [PATCH 25/52] resolves the ValueError: Unable to avoid copy while creating an array (#7831) * Fix argument passing in stratified shuffle split NumPy 2.0 changed the behavior of the `copy=False` parameter to be stricter. When `train_test_split` converted Arrow arrays to NumPy format for stratification, it triggered this error for non-contiguous arrays. Using `np.asarray()` allows copying when necessary, which is the recommended migration path per NumPy 2.0 documentation. 
* make style --------- Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> --- src/datasets/arrow_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 43301d23041..dbdff64953b 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -4868,7 +4868,7 @@ def train_test_split( try: train_indices, test_indices = next( stratified_shuffle_split_generate_indices( - self.with_format("numpy")[stratify_by_column], n_train, n_test, rng=generator + np.asarray(self.with_format("numpy")[stratify_by_column]), n_train, n_test, rng=generator ) ) except Exception as error: From 8b1bd4ec1cc9e9ce022f749abb6485ef984ae7c0 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 31 Oct 2025 18:27:15 +0100 Subject: [PATCH 26/52] Python 3.14 (#7836) * add 3.14 * update ci * go home tf * torchcodec * numba * fix ci * no lz4 in python 3.14 * fix tests * again * again * again --- .github/conda/meta.yaml | 4 +-- .github/workflows/ci.yml | 16 +++++----- setup.py | 15 +++++---- tests/features/test_audio.py | 3 ++ tests/test_extract.py | 3 +- tests/test_fingerprint.py | 5 +-- tests/test_iterable_dataset.py | 39 +++++++++++++----------- tests/test_py_utils.py | 3 +- tests/test_streaming_download_manager.py | 25 +++++++++++---- tests/utils.py | 14 +++++++++ 10 files changed, 83 insertions(+), 44 deletions(-) diff --git a/.github/conda/meta.yaml b/.github/conda/meta.yaml index 59a16cda78b..7263b10035a 100644 --- a/.github/conda/meta.yaml +++ b/.github/conda/meta.yaml @@ -25,7 +25,7 @@ requirements: - dataclasses - multiprocess - fsspec - - huggingface_hub >=0.24.0,<1.0.0 + - huggingface_hub >=0.25.0,<2.0.0 - packaging run: - python @@ -41,7 +41,7 @@ requirements: - dataclasses - multiprocess - fsspec - - huggingface_hub >=0.24.0,<1.0.0 + - huggingface_hub >=0.25.0,<2.0.0 - packaging test: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 128266b5e48..e40bc458d6f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -82,7 +82,7 @@ jobs: run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ - test_py312: + test_py314: needs: check_code_quality strategy: matrix: @@ -100,10 +100,10 @@ jobs: run: | sudo apt update sudo apt install -y ffmpeg - - name: Set up Python 3.12 + - name: Set up Python 3.14 uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.14" - name: Setup conda env (windows) if: ${{ matrix.os == 'windows-latest' }} uses: conda-incubator/setup-miniconda@v2 @@ -111,7 +111,7 @@ jobs: auto-update-conda: true miniconda-version: "latest" activate-environment: test - python-version: "3.12" + python-version: "3.14" - name: Setup FFmpeg (windows) if: ${{ matrix.os == 'windows-latest' }} run: conda install "ffmpeg=7.0.1" -c conda-forge @@ -127,7 +127,7 @@ jobs: run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ - test_py312_future: + test_py314_future: needs: check_code_quality strategy: matrix: @@ -145,10 +145,10 @@ jobs: run: | sudo apt update sudo apt install -y ffmpeg - - name: Set up Python 3.12 + - name: Set up Python 3.14 uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.14" - name: Setup conda env (windows) if: ${{ matrix.os == 'windows-latest' }} uses: conda-incubator/setup-miniconda@v2 @@ -156,7 +156,7 @@ jobs: auto-update-conda: true miniconda-version: "latest" 
activate-environment: test - python-version: "3.12" + python-version: "3.14" - name: Setup FFmpeg (windows) if: ${{ matrix.os == 'windows-latest' }} run: conda install "ffmpeg=7.0.1" -c conda-forge diff --git a/setup.py b/setup.py index 06eee6717c8..497ccdee360 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,7 @@ # for fast hashing "xxhash", # for better multiprocessing - "multiprocess<0.70.17", # to align with dill<0.3.9 (see above) + "multiprocess<0.70.19", # to align with dill<0.3.9 (see above) # to save datasets locally or on any filesystem # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143 "fsspec[http]>=2023.1.0,<=2025.9.0", @@ -153,12 +153,12 @@ TESTS_REQUIRE = [ # fix pip install issues for windows - "numba>=0.56.4", # to get recent versions of llvmlite for windows ci + "numba>=0.56.4; python_version < '3.14'", # to get recent versions of llvmlite for windows ci, not available on 3.14 # test dependencies "absl-py", "decorator", "joblib<1.3.0", # joblibspark doesn't support recent joblib versions - "joblibspark", + "joblibspark; python_version < '3.14'", # python 3.14 gives AttributeError: module 'ast' has no attribute 'Num' "pytest", "pytest-datadir", "pytest-xdist", @@ -169,7 +169,7 @@ "h5py", "jax>=0.3.14; sys_platform != 'win32'", "jaxlib>=0.3.14; sys_platform != 'win32'", - "lz4", + "lz4; python_version < '3.14'", # python 3.14 gives ImportError: cannot import name '_compression' from partially initialized module 'lz4.frame "moto[server]", "pyspark>=3.4", # https://issues.apache.org/jira/browse/SPARK-40991 fixed in 3.4.0 "py7zr", @@ -177,7 +177,7 @@ "sqlalchemy", "protobuf<4.0.0", # 4.0.0 breaks compatibility with tensorflow<2.12 "tensorflow>=2.6.0; python_version<'3.10' and sys_platform != 'win32'", # numpy-2 is not supported for Python < 3.10 - "tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32'", # Pins numpy < 2 + "tensorflow>=2.16.0; python_version>='3.10' and sys_platform != 'win32' and python_version < '3.14'", # Pins numpy < 2 "tiktoken", "torch>=2.8.0", "torchdata", @@ -185,7 +185,7 @@ "zstandard", "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced - "torchcodec>=0.7.0", # minium version to get windows support + "torchcodec>=0.7.0; python_version < '3.14'", # minium version to get windows support, torchcodec doesn't have wheels for 3.14 yet "nibabel>=5.3.1", ] @@ -262,6 +262,9 @@ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], keywords="datasets machine learning datasets", diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index aa5b2fcda94..aae59ea53ee 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -713,6 +713,7 @@ def test_dataset_with_audio_feature_loaded_from_cache(): assert isinstance(ds, Dataset) +@require_torchcodec def test_dataset_with_audio_feature_undecoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} @@ -730,6 +731,7 @@ def test_dataset_with_audio_feature_undecoded(shared_datadir): assert column[0] == {"path": audio_path, "bytes": None} +@require_torchcodec def 
test_formatted_dataset_with_audio_feature_undecoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} @@ -761,6 +763,7 @@ def test_formatted_dataset_with_audio_feature_undecoded(shared_datadir): assert column[0] == {"path": audio_path, "bytes": None} +@require_torchcodec def test_dataset_with_audio_feature_map_undecoded(shared_datadir): audio_path = str(shared_datadir / "test_audio_44100.wav") data = {"audio": [audio_path]} diff --git a/tests/test_extract.py b/tests/test_extract.py index 186d65fd0ba..489e5efa586 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,5 +1,4 @@ import os -import zipfile import pytest @@ -199,5 +198,5 @@ def test_is_zipfile_false_positive(tmpdir): ) with not_a_zip_file.open("wb") as f: f.write(data) - assert zipfile.is_zipfile(str(not_a_zip_file)) # is a false positive for `zipfile` + # zipfile.is_zipfile(str(not_a_zip_file)) could be a false positive for `zipfile` assert not ZipExtractor.is_extractable(not_a_zip_file) # but we're right diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index 0b7a45458bd..e3ca7464b16 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -26,6 +26,7 @@ require_spacy, require_tiktoken, require_torch, + require_torch_compile, require_transformers, ) @@ -347,7 +348,7 @@ def test_hash_spacy_model(self): self.assertNotEqual(hash1, hash2) @require_not_windows - @require_torch + @require_torch_compile def test_hash_torch_compiled_function(self): import torch @@ -360,7 +361,7 @@ def f(x): self.assertEqual(hash1, hash2) @require_not_windows - @require_torch + @require_torch_compile def test_hash_torch_compiled_module(self): m = TorchModule() next(iter(m.parameters())).data.fill_(1.0) diff --git a/tests/test_iterable_dataset.py b/tests/test_iterable_dataset.py index 583f5dab51a..bdfa60fdc01 100644 --- a/tests/test_iterable_dataset.py +++ b/tests/test_iterable_dataset.py @@ -1553,18 +1553,21 @@ def test_iterable_dataset_from_hub_torch_dataloader_parallel(num_workers, tmp_pa assert len(result) == 10 +def gen_with_worker_info(shard): + from torch.utils.data import get_worker_info + + worker_info = get_worker_info() + for i in range(100): + yield {"value": i, "worker_id": worker_info.id} + + @require_torch def test_iterable_dataset_shuffle_with_multiple_workers_different_rng(): # GH 7567 - from torch.utils.data import DataLoader, get_worker_info - - def gen(shard): - worker_info = get_worker_info() - for i in range(100): - yield {"value": i, "worker_id": worker_info.id} + from torch.utils.data import DataLoader num_workers = 20 - ds = IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers))}) + ds = IterableDataset.from_generator(gen_with_worker_info, gen_kwargs={"shard": list(range(num_workers))}) ds = ds.shuffle(buffer_size=100, seed=1234) dataloader = DataLoader(ds, batch_size=None, num_workers=num_workers) @@ -1575,18 +1578,19 @@ def gen(shard): assert len(set(values)) != 1, "Make sure not all values are identical" +def gen_with_value(shard, value): + for i in range(100): + yield {"value": value} + + @require_torch def test_iterable_dataset_interleave_dataset_with_multiple_workers(): # GH 7567 from torch.utils.data import DataLoader - def gen(shard, value): - for i in range(100): - yield {"value": value} - num_workers = 20 ds = [ - IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i}) + IterableDataset.from_generator(gen_with_value, gen_kwargs={"shard": 
list(range(num_workers)), "value": i}) for i in range(10) ] ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234) @@ -1598,18 +1602,19 @@ def gen(shard, value): assert len(set(values)) != 1, "Make sure not all values are identical" +def gen_with_id(shard, value): + for i in range(50): + yield {"value": value, "id": i} + + @require_torch def test_iterable_dataset_interleave_dataset_deterministic_across_iterations(): # GH 7567 from torch.utils.data import DataLoader - def gen(shard, value): - for i in range(50): - yield {"value": value, "id": i} - num_workers = 10 ds = [ - IterableDataset.from_generator(gen, gen_kwargs={"shard": list(range(num_workers)), "value": i}) + IterableDataset.from_generator(gen_with_id, gen_kwargs={"shard": list(range(num_workers)), "value": i}) for i in range(5) ] ds = interleave_datasets(ds, probabilities=[1 / len(ds)] * len(ds), seed=1234) diff --git a/tests/test_py_utils.py b/tests/test_py_utils.py index d3e7795bf9d..aad95f74a59 100644 --- a/tests/test_py_utils.py +++ b/tests/test_py_utils.py @@ -1,4 +1,5 @@ import os +import pickle import time from dataclasses import dataclass from multiprocessing import Pool @@ -81,7 +82,7 @@ def test_map_nested(self): {k: v.tolist() for k, v in map_nested(int, sn1, map_numpy=True, num_proc=num_proc).items()}, {k: v.tolist() for k, v in expected_map_nested_sn1_int.items()}, ) - with self.assertRaises(AttributeError): # can't pickle a local lambda + with self.assertRaises((AttributeError, pickle.PicklingError)): # can't pickle a local lambda map_nested(lambda x: x + 1, sn1, num_proc=num_proc) def test_zip_dict(self): diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py index d569637fdad..1fc53502ba6 100644 --- a/tests/test_streaming_download_manager.py +++ b/tests/test_streaming_download_manager.py @@ -1,5 +1,6 @@ import json import os +from pathlib import Path import pytest @@ -26,10 +27,16 @@ Bulbasaur, grass""" -@pytest.mark.parametrize("urlpath", [r"C:\\foo\bar.txt", "/foo/bar.txt", "https://f.oo/bar.txt"]) -def test_streaming_dl_manager_download_dummy_path(urlpath): +def test_streaming_dl_manager_download_dummy_path(): + path = str(Path(__file__).resolve()) dl_manager = StreamingDownloadManager() - assert dl_manager.download(urlpath) == urlpath + assert dl_manager.download(path) == path + + +def test_streaming_dl_manager_download_dummy_url(): + url = "https://f.oo/bar.txt" + dl_manager = StreamingDownloadManager() + assert dl_manager.download(url) == url @pytest.mark.parametrize( @@ -54,10 +61,16 @@ def test_streaming_dl_manager_download(text_path): assert f.read() == expected_file.read() -@pytest.mark.parametrize("urlpath", [r"C:\\foo\bar.txt", "/foo/bar.txt", "https://f.oo/bar.txt"]) -def test_streaming_dl_manager_download_and_extract_no_extraction(urlpath): +def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_path(): + path = str(Path(__file__).resolve()) + dl_manager = StreamingDownloadManager() + assert dl_manager.download_and_extract(path) == path + + +def test_streaming_dl_manager_download_and_extract_no_extraction_dummy_url(): + url = "https://f.oo/bar.txt" dl_manager = StreamingDownloadManager() - assert dl_manager.download_and_extract(urlpath) == urlpath + assert dl_manager.download_and_extract(url) == url def test_streaming_dl_manager_extract(text_gz_path, text_path): diff --git a/tests/utils.py b/tests/utils.py index b796641a290..1980cf3e257 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -125,6 +125,20 @@ def 
require_torch(test_case): return test_case +def require_torch_compile(test_case): + """ + Decorator marking a test that requires PyTorch. + + These tests are skipped when PyTorch isn't installed. + + """ + if not config.TORCH_AVAILABLE: + test_case = unittest.skip("test requires PyTorch")(test_case) + if config.PY_VERSION >= version.parse("3.14"): + test_case = unittest.skip("test requires torch compile which isn't available in python 3.14")(test_case) + return test_case + + def require_polars(test_case): """ Decorator marking a test that requires Polars. From 0e7c6ca73d2dd0c1aa3a610ddeb8cec76e3fe492 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Mon, 3 Nov 2025 15:24:11 +0100 Subject: [PATCH 27/52] Add num channels to audio (#7840) * WIP: add audio, tests failing * WIP: add mono argument, tests failing * change from mono to num_channels in documentation, audio tests passing * update docs and move test for audio * update audio * update docstring for audio * Apply suggestions from code review --------- Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> --- src/datasets/features/audio.py | 34 ++++++++++++++++++++++++---------- tests/features/test_audio.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py index c9b894f6605..d9513f289f5 100644 --- a/src/datasets/features/audio.py +++ b/src/datasets/features/audio.py @@ -49,9 +49,13 @@ class Audio: Args: sampling_rate (`int`, *optional*): Target sampling rate. If `None`, the native sampling rate is used. - mono (`bool`, defaults to `True`): - Whether to convert the audio signal to mono by averaging samples across - channels. + num_channels (`int`, *optional*): + The desired number of channels of the samples. By default, the number of channels of the source is used. + Audio decoding will return samples with shape (num_channels, num_samples) + Currently `None` (number of channels of the source, default), `1` (mono) or `2` (stereo) channels are supported. + The `num_channels` argument is passed to `torchcodec.decoders.AudioDecoder`. + + decode (`bool`, defaults to `True`): Whether to decode the audio data. If `False`, returns the underlying dictionary in the format `{"path": audio_path, "bytes": audio_bytes}`. 
@@ -63,7 +67,7 @@ class Audio: ```py >>> from datasets import load_dataset, Audio >>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train") - >>> ds = ds.cast_column("audio", Audio(sampling_rate=44100)) + >>> ds = ds.cast_column("audio", Audio(sampling_rate=44100, num_channels=2)) >>> ds[0]["audio"] >>> audio = ds[0]["audio"] @@ -78,6 +82,7 @@ class Audio: sampling_rate: Optional[int] = None decode: bool = True + num_channels: Optional[int] = None stream_index: Optional[int] = None id: Optional[str] = field(default=None, repr=False) # Automatically constructed @@ -126,7 +131,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder buffer = BytesIO() AudioEncoder( torch.from_numpy(value["array"].astype(np.float32)), sample_rate=value["sampling_rate"] - ).to_file_like(buffer, format="wav") + ).to_file_like(buffer, format="wav", num_channels=self.num_channels) return {"bytes": buffer.getvalue(), "path": None} elif value.get("path") is not None and os.path.isfile(value["path"]): # we set "bytes": None to not duplicate the data if they're already available locally @@ -143,7 +148,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder buffer = BytesIO() AudioEncoder(torch.from_numpy(bytes_value), sample_rate=value["sampling_rate"]).to_file_like( - buffer, format="wav" + buffer, format="wav", num_channels=self.num_channels ) return {"bytes": buffer.getvalue(), "path": None} else: @@ -188,7 +193,9 @@ def decode_example( raise ValueError(f"An audio sample should have one of 'path' or 'bytes' but both are None in {value}.") if bytes is None and is_local_path(path): - audio = AudioDecoder(path, stream_index=self.stream_index, sample_rate=self.sampling_rate) + audio = AudioDecoder( + path, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=self.num_channels + ) elif bytes is None: token_per_repo_id = token_per_repo_id or {} @@ -201,10 +208,14 @@ def decode_example( download_config = DownloadConfig(token=token) f = xopen(path, "rb", download_config=download_config) - audio = AudioDecoder(f, stream_index=self.stream_index, sample_rate=self.sampling_rate) + audio = AudioDecoder( + f, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=self.num_channels + ) else: - audio = AudioDecoder(bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate) + audio = AudioDecoder( + bytes, stream_index=self.stream_index, sample_rate=self.sampling_rate, num_channels=self.num_channels + ) audio._hf_encoded = {"path": path, "bytes": bytes} audio.metadata.path = path return audio @@ -312,5 +323,8 @@ def encode_torchcodec_audio(audio: "AudioDecoder") -> dict: samples = audio.get_all_samples() buffer = BytesIO() - AudioEncoder(samples.data.cpu(), sample_rate=samples.sample_rate).to_file_like(buffer, format="wav") + num_channels = samples.data.shape[0] + AudioEncoder(samples.data.cpu(), sample_rate=samples.sample_rate).to_file_like( + buffer, format="wav", num_channels=num_channels + ) return {"bytes": buffer.getvalue(), "path": None} diff --git a/tests/features/test_audio.py b/tests/features/test_audio.py index aae59ea53ee..a6dbca799fe 100644 --- a/tests/features/test_audio.py +++ b/tests/features/test_audio.py @@ -789,3 +789,31 @@ def test_audio_embed_storage(shared_datadir): embedded_storage = Audio().embed_storage(storage) embedded_example = embedded_storage.to_pylist()[0] assert embedded_example == {"bytes": open(audio_path, "rb").read(), "path": "test_audio_44100.wav"} + + 
+@require_torchcodec +def test_audio_decode_example_opus_convert_to_stereo(shared_datadir): + # GH 7837 + from torchcodec.decoders import AudioDecoder + + audio_path = str(shared_datadir / "test_audio_48000.opus") # mono file + audio = Audio(num_channels=2) + decoded_example = audio.decode_example(audio.encode_example(audio_path)) + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 48000 + assert samples.data.shape == (2, 48000) + + +@require_torchcodec +def test_audio_decode_example_opus_convert_to_mono(shared_datadir): + # GH 7837 + from torchcodec.decoders import AudioDecoder + + audio_path = str(shared_datadir / "test_audio_44100.wav") # stereo file + audio = Audio(num_channels=1) + decoded_example = audio.decode_example(audio.encode_example(audio_path)) + assert isinstance(decoded_example, AudioDecoder) + samples = decoded_example.get_all_samples() + assert samples.sample_rate == 44100 + assert samples.data.shape == (1, 202311) From 03c16eceb7837bcd726444c4bee0705b53279862 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Mon, 3 Nov 2025 15:34:12 +0100 Subject: [PATCH 28/52] fix column with transform (#7843) --- src/datasets/arrow_dataset.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index dbdff64953b..250442a0ed8 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -660,7 +660,11 @@ def __init__(self, source: Union["Dataset", "Column"], column_name: str): def __iter__(self) -> Iterator[Any]: if isinstance(self.source, Dataset): - source = self.source._fast_select_column(self.column_name) + if self.source._format_type == "custom": + # the formatting transform may require all columns + source = self.source + else: + source = self.source._fast_select_column(self.column_name) else: source = self.source for example in source: @@ -670,7 +674,12 @@ def __getitem__(self, key: Union[int, str, list[int]]) -> Any: if isinstance(key, str): return Column(self, key) elif isinstance(self.source, Dataset): - return self.source._fast_select_column(self.column_name)[key][self.column_name] + if self.source._format_type == "custom": + # the formatting transform may require all columns + source = self.source + else: + source = self.source._fast_select_column(self.column_name) + return source[key][self.column_name] elif isinstance(key, int): return self.source[key][self.column_name] else: From fc7f97c442b3985e47fcc1b142f41df040963213 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Mon, 3 Nov 2025 15:51:32 +0100 Subject: [PATCH 29/52] support fsspec 2025.10.0 (#7844) fsspec 2025.10.0 --- setup.py | 2 +- src/datasets/data_files.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 497ccdee360..ecec465d44b 100644 --- a/setup.py +++ b/setup.py @@ -127,7 +127,7 @@ "multiprocess<0.70.19", # to align with dill<0.3.9 (see above) # to save datasets locally or on any filesystem # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143 - "fsspec[http]>=2023.1.0,<=2025.9.0", + "fsspec[http]>=2023.1.0,<=2025.10.0", # To get datasets from the Datasets Hub on huggingface.co "huggingface-hub>=0.25.0,<2.0", # Utilities from PyPA to e.g., compare versions diff --git 
a/src/datasets/data_files.py b/src/datasets/data_files.py index 9710bc84a8e..96d4daea52d 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -349,14 +349,18 @@ def resolve_pattern( pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config) fs, fs_pattern = url_to_fs(pattern, **storage_options) files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)} - protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0] + protocol = ( + pattern.split("://")[0] + if "://" in pattern + else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]) + ) protocol_prefix = protocol + "://" if protocol != "file" else "" glob_kwargs = {} if protocol == "hf": # 10 times faster glob with detail=True (ignores costly info like lastCommit) glob_kwargs["expand_info"] = False matched_paths = [ - filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath + filepath if "://" in filepath else protocol_prefix + filepath for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items() if (info["type"] == "file" or (info.get("islink") and os.path.isfile(os.path.realpath(filepath)))) and (xbasename(filepath) not in files_to_ignore) From 232cb10501b48c6c6ce7131d8b8177d322e7ad97 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Tue, 4 Nov 2025 11:36:37 +0100 Subject: [PATCH 30/52] Release: 4.4.0 (#7845) release: 4.4.0 --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ecec465d44b..fb8130fd8c2 100644 --- a/setup.py +++ b/setup.py @@ -232,7 +232,7 @@ setup( name="datasets", - version="4.3.1.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.4.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index afa6bf0c9c0..38249bbd52b 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "4.3.1.dev0" +__version__ = "4.4.0" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From 5cb2925f649fecc4e1ebe488b5a1de2effcf84c0 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Tue, 4 Nov 2025 11:44:36 +0100 Subject: [PATCH 31/52] set dev version (#7846) --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index fb8130fd8c2..f53d4cd85f5 100644 --- a/setup.py +++ b/setup.py @@ -232,7 +232,7 @@ setup( name="datasets", - version="4.4.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.4.1.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index 38249bbd52b..b35961d27bb 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "4.4.0" +__version__ = "4.4.1.dev0" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From f2f58b35f6bb230f1c3415aa83587f6d9bf7edb5 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:52:22 +0100 Subject: [PATCH 32/52] Better streaming retries (504 and 429) (#7847) better streaming retries --- src/datasets/config.py | 2 ++ src/datasets/utils/file_utils.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/datasets/config.py b/src/datasets/config.py index 3d3f12b008d..b6412682727 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -248,6 +248,8 @@ # Streaming STREAMING_READ_MAX_RETRIES = 20 STREAMING_READ_RETRY_INTERVAL = 5 +STREAMING_READ_SERVER_UNAVAILABLE_RETRY_INTERVAL = 20 +STREAMING_READ_RATE_LIMIT_RETRY_INTERVAL = 60 STREAMING_OPEN_MAX_RETRIES = 20 STREAMING_OPEN_RETRY_INTERVAL = 5 diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 37d79640d3c..b57a3784547 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -68,6 +68,8 @@ class _AiohttpClientError(Exception): requests.exceptions.Timeout, httpx.RequestError, ) +SERVER_UNAVAILABLE_CODE = 504 +RATE_LIMIT_CODE = 429 def is_remote_url(url_or_filename: str) -> bool: @@ -827,6 +829,22 @@ def read_with_retries(*args, **kwargs): f"Got disconnected from remote data host. Retrying in {config.STREAMING_READ_RETRY_INTERVAL}sec [{retry}/{max_retries}]" ) time.sleep(config.STREAMING_READ_RETRY_INTERVAL) + except huggingface_hub.errors.HfHubHTTPError as err: + if err.response is not None and err.response.status_code == SERVER_UNAVAILABLE_CODE: + disconnect_err = err + logger.warning( + f"Got disconnected from remote data host. Retrying in {config.STREAMING_READ_SERVER_UNAVAILABLE_RETRY_INTERVAL}sec [{retry}/{max_retries}]" + ) + time.sleep(config.STREAMING_READ_SERVER_UNAVAILABLE_RETRY_INTERVAL) + elif err.response is not None and err.response.status_code == RATE_LIMIT_CODE: + disconnect_err = err + logger.warning(str(err)) + logger.warning( + f"Got disconnected from remote data host. 
Retrying in {config.STREAMING_READ_RATE_LIMIT_RETRY_INTERVAL}sec [{retry}/{max_retries}]" + ) + time.sleep(config.STREAMING_READ_RATE_LIMIT_RETRY_INTERVAL) + else: + raise else: raise ConnectionError("Server Disconnected") from disconnect_err return out From d32a1f7042a43dff2b5e2541c22f2dcbd3c75c72 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Wed, 5 Nov 2025 15:04:03 +0100 Subject: [PATCH 33/52] DOC: remove mode parameter in docstring of pdf and video feature (#7848) remove mode parameter in docstring of pdf and video feature --- src/datasets/features/pdf.py | 2 -- src/datasets/features/video.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/datasets/features/pdf.py b/src/datasets/features/pdf.py index 414c497356c..756530554d4 100644 --- a/src/datasets/features/pdf.py +++ b/src/datasets/features/pdf.py @@ -44,8 +44,6 @@ class Pdf: - A `pdfplumber.pdf.PDF`: pdfplumber pdf object. Args: - mode (`str`, *optional*): - The mode to convert the pdf to. If `None`, the native mode of the pdf is used. decode (`bool`, defaults to `True`): Whether to decode the pdf data. If `False`, returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`. diff --git a/src/datasets/features/video.py b/src/datasets/features/video.py index adbfaaa30f3..8d7f3e3be51 100644 --- a/src/datasets/features/video.py +++ b/src/datasets/features/video.py @@ -45,8 +45,6 @@ class Video: Output: The Video features output data as `torchcodec.decoders.VideoDecoder` objects. Args: - mode (`str`, *optional*): - The mode to convert the video to. If `None`, the native mode of the video is used. decode (`bool`, defaults to `True`): Whether to decode the video data. If `False`, returns the underlying dictionary in the format `{"path": video_path, "bytes": video_bytes}`. From 6a6983a9d1a8f93307fb11157001c74390c79060 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:00:45 +0100 Subject: [PATCH 34/52] release: 4.4.1 (#7849) --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f53d4cd85f5..360ac8742f7 100644 --- a/setup.py +++ b/setup.py @@ -232,7 +232,7 @@ setup( name="datasets", - version="4.4.1.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.4.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index b35961d27bb..268d13ad6dc 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "4.4.1.dev0" +__version__ = "4.4.1" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From 91f96a0d9b2b95693343e7b20bdc9b4a0b2eb559 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:02:32 +0100 Subject: [PATCH 35/52] dev version (#7850) --- setup.py | 2 +- src/datasets/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 360ac8742f7..2f626763113 100644 --- a/setup.py +++ b/setup.py @@ -232,7 +232,7 @@ setup( name="datasets", - version="4.4.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.4.2.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="HuggingFace community-driven open-source library of datasets", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py index 268d13ad6dc..6b2dc7d8600 100644 --- a/src/datasets/__init__.py +++ b/src/datasets/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "4.4.1" +__version__ = "4.4.2.dev0" from .arrow_dataset import Column, Dataset from .arrow_reader import ReadInstruction From 3356d748ee4b34706db95e4074e240f8fd1a287f Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Thu, 6 Nov 2025 17:20:36 +0100 Subject: [PATCH 36/52] Fix embed storage nifti (#7853) * WIP: allow uploading of nifti * remove debug statements and fix test * remove debug statements * remove debug statements --- src/datasets/features/nifti.py | 57 ++++++++++++++++++++++++++++------ tests/features/test_nifti.py | 41 +++++++++++++++++++++++- 2 files changed, 88 insertions(+), 10 deletions(-) diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py index bac91e2af4b..f63b8cf6aa1 100644 --- a/src/datasets/features/nifti.py +++ b/src/datasets/features/nifti.py @@ -10,7 +10,7 @@ from ..download.download_config import DownloadConfig from ..table import array_cast from ..utils.file_utils import is_local_path, xopen -from ..utils.py_utils import string_to_dict +from ..utils.py_utils import no_op_if_value_is_null, string_to_dict if TYPE_CHECKING: @@ -125,9 +125,6 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif Returns: `nibabel.Nifti1Image` objects """ - if not self.decode: - raise NotImplementedError("Decoding is disabled for this feature. 
Please use Nifti(decode=True) instead.") - if config.NIBABEL_AVAILABLE: import nibabel as nib else: @@ -141,6 +138,9 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif if path is None: raise ValueError(f"A nifti should have one of 'path' or 'bytes' but both are None in {value}.") else: + # gzipped files have the structure: 'gzip://T1.nii::' + if path.startswith("gzip://") and is_local_path(path.split("::")[-1]): + path = path.split("::")[-1] if is_local_path(path): nifti = nib.load(path) else: @@ -150,11 +150,10 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL ) - try: - repo_id = string_to_dict(source_url, pattern)["repo_id"] - token = token_per_repo_id.get(repo_id) - except ValueError: - token = None + source_url_fields = string_to_dict(source_url, pattern) + token = ( + token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None + ) download_config = DownloadConfig(token=token) with xopen(path, "rb", download_config=download_config) as f: nifti = nib.load(f) @@ -172,6 +171,46 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif return nifti + def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray: + """Embed NifTI files into the Arrow array. + + Args: + storage (`pa.StructArray`): + PyArrow array to embed. + + Returns: + `pa.StructArray`: Array in the NifTI arrow storage type, that is + `pa.struct({"bytes": pa.binary(), "path": pa.string()})`. + """ + if token_per_repo_id is None: + token_per_repo_id = {} + + @no_op_if_value_is_null + def path_to_bytes(path): + source_url = path.split("::")[-1] + pattern = ( + config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL + ) + source_url_fields = string_to_dict(source_url, pattern) + token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None + download_config = DownloadConfig(token=token) + with xopen(path, "rb", download_config=download_config) as f: + return f.read() + + bytes_array = pa.array( + [ + (path_to_bytes(x["path"]) if x["bytes"] is None else x["bytes"]) if x is not None else None + for x in storage.to_pylist() + ], + type=pa.binary(), + ) + path_array = pa.array( + [os.path.basename(path) if path is not None else None for path in storage.field("path").to_pylist()], + type=pa.string(), + ) + storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null()) + return array_cast(storage, self.pa_type) + def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]: """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.""" from .features import Value diff --git a/tests/features/test_nifti.py b/tests/features/test_nifti.py index 077a7519431..b5f0be42f3e 100644 --- a/tests/features/test_nifti.py +++ b/tests/features/test_nifti.py @@ -2,9 +2,10 @@ from pathlib import Path +import pyarrow as pa import pytest -from datasets import Dataset, Features, Nifti +from datasets import Dataset, Features, Nifti, load_dataset from src.datasets.features.nifti import encode_nibabel_image from ..utils import require_nibabel @@ -89,3 +90,41 @@ def test_encode_nibabel_image(shared_datadir): assert isinstance(encoded_example_bytes, dict) assert encoded_example_bytes["bytes"] is not None and encoded_example_bytes["path"] is 
None # this cannot be converted back from bytes (yet) + + +@require_nibabel +def test_embed_storage(shared_datadir): + from io import BytesIO + + import nibabel as nib + + nifti_path = str(shared_datadir / "test_nifti.nii") + img = nib.load(nifti_path) + nifti = Nifti() + + bytes_array = pa.array([None], type=pa.binary()) + path_array = pa.array([nifti_path], type=pa.string()) + storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"]) + + embedded_storage = nifti.embed_storage(storage) + + embedded_bytes = embedded_storage[0]["bytes"].as_py() + + bio = BytesIO(embedded_bytes) + fh = nib.FileHolder(fileobj=bio) + nifti_img = nib.Nifti1Image.from_file_map({"header": fh, "image": fh}) + + assert embedded_bytes is not None + assert nifti_img.header == img.header + assert (nifti_img.affine == img.affine).all() + assert (nifti_img.get_fdata() == img.get_fdata()).all() + + +@require_nibabel +def test_load_zipped_file_locally(shared_datadir): + import nibabel as nib + + nifti_path = str(shared_datadir / "test_nifti.nii.gz") + + ds = load_dataset("niftifolder", data_files=nifti_path) + assert isinstance(ds["train"][0]["nifti"], nib.nifti1.Nifti1Image) From cf647abeffe20ead2e45ba23e9cba72058e12ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Mon, 10 Nov 2025 08:01:12 -0700 Subject: [PATCH 37/52] ArXiv -> HF Papers (#7855) Change arxiv to hg papers --- CONTRIBUTING.md | 2 +- README.md | 2 +- docs/source/dataset_card.mdx | 2 +- templates/README_guide.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f1f022b6fd7..3ae44bd4efc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -120,7 +120,7 @@ If you are a **dataset author**... you know what to do, it is your dataset after If you are a **user of a dataset**, the main source of information should be the dataset paper if it is available: we recommend pulling information from there into the relevant paragraphs of the template. We also eagerly welcome discussions on the [Considerations for Using the Data](https://github.com/huggingface/datasets/blob/main/templates/README_guide.md#considerations-for-using-the-data) based on existing scholarship or personal experience that would benefit the whole community. -Finally, if you want more information on the how and why of dataset cards, we strongly recommend reading the foundational works [Datasheets for Datasets](https://arxiv.org/abs/1803.09010) and [Data Statements for NLP](https://www.aclweb.org/anthology/Q18-1041/). +Finally, if you want more information on the how and why of dataset cards, we strongly recommend reading the foundational works [Datasheets for Datasets](https://huggingface.co/papers/1803.09010) and [Data Statements for NLP](https://www.aclweb.org/anthology/Q18-1041/). Thank you for your contribution! 
diff --git a/README.md b/README.md index d4162b9e761..0b70a39d098 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ If you're a dataset owner and wish to update any part of it (description, citati ## BibTeX -If you want to cite our 🤗 Datasets library, you can use our [paper](https://arxiv.org/abs/2109.02846): +If you want to cite our 🤗 Datasets library, you can use our [paper](https://huggingface.co/papers/2109.02846): ```bibtex @inproceedings{lhoest-etal-2021-datasets, diff --git a/docs/source/dataset_card.mdx b/docs/source/dataset_card.mdx index f1067697fb2..3cd77e1fc00 100644 --- a/docs/source/dataset_card.mdx +++ b/docs/source/dataset_card.mdx @@ -1,7 +1,7 @@ # Create a dataset card Each dataset should have a dataset card to promote responsible usage and inform users of any potential biases within the dataset. -This idea was inspired by the Model Cards proposed by [Mitchell, 2018](https://arxiv.org/abs/1810.03993). +This idea was inspired by the Model Cards proposed by [Mitchell, 2018](https://huggingface.co/papers/1810.03993). Dataset cards help users understand a dataset's contents, the context for using the dataset, how it was created, and any other considerations a user should be aware of. Creating a dataset card is easy and can be done in just a few steps: diff --git a/templates/README_guide.md b/templates/README_guide.md index 8be42708543..d8e7173c84f 100644 --- a/templates/README_guide.md +++ b/templates/README_guide.md @@ -163,7 +163,7 @@ Also describe in this section if the proposed dataset contains a low-resource or Provide descriptions of specific biases that are likely to be reflected in the data, and state whether any steps were taken to reduce their impact. -For Wikipedia text, see for example [Dinan et al 2020 on biases in Wikipedia (esp. Table 1)](https://arxiv.org/abs/2005.00614), or [Blodgett et al 2020](https://www.aclweb.org/anthology/2020.acl-main.485/) for a more general discussion of the topic. +For Wikipedia text, see for example [Dinan et al 2020 on biases in Wikipedia (esp. Table 1)](https://huggingface.co/papers/2005.00614), or [Blodgett et al 2020](https://www.aclweb.org/anthology/2020.acl-main.485/) for a more general discussion of the topic. If analyses have been run quantifying these biases, please add brief summaries and links to the studies here. 
From 17f40a318a1f8c7d33c2a4dd17934f81d14a7f57 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 10 Nov 2025 18:11:05 +0100 Subject: [PATCH 38/52] fix some broken links (#7859) * fix some broken links * some more --------- Co-authored-by: Quentin Lhoest --- docs/source/dataset_card.mdx | 2 +- docs/source/faiss_es.mdx | 4 ++-- docs/source/image_load.mdx | 4 ++-- docs/source/loading.mdx | 2 +- docs/source/object_detection.mdx | 4 ++-- docs/source/quickstart.mdx | 2 +- docs/source/stream.mdx | 4 ++-- docs/source/use_with_jax.mdx | 4 ++-- docs/source/use_with_numpy.mdx | 2 +- src/datasets/arrow_dataset.py | 10 ++++---- src/datasets/arrow_reader.py | 24 +++++++++---------- src/datasets/dataset_dict.py | 2 +- src/datasets/download/download_manager.py | 2 +- .../download/streaming_download_manager.py | 2 +- src/datasets/iterable_dataset.py | 2 +- src/datasets/utils/patching.py | 2 +- tests/test_metadata_util.py | 12 +++++----- 17 files changed, 42 insertions(+), 42 deletions(-) diff --git a/docs/source/dataset_card.mdx b/docs/source/dataset_card.mdx index 3cd77e1fc00..5f8b998cc9e 100644 --- a/docs/source/dataset_card.mdx +++ b/docs/source/dataset_card.mdx @@ -24,4 +24,4 @@ Creating a dataset card is easy and can be done in just a few steps: YAML also allows you to customize the way your dataset is loaded by [defining splits and/or configurations](./repository_structure#define-your-splits-and-subsets-in-yaml) without the need to write any code. -Feel free to take a look at the [SNLI](https://huggingface.co/datasets/snli), [CNN/DailyMail](https://huggingface.co/datasets/cnn_dailymail), and [Allociné](https://huggingface.co/datasets/allocine) dataset cards as examples to help you get started. +Feel free to take a look at the [SNLI](https://huggingface.co/datasets/stanfordnlp/snli), [CNN/DailyMail](https://huggingface.co/datasets/abisee/cnn_dailymail), and [Allociné](https://huggingface.co/datasets/tblard/allocine) dataset cards as examples to help you get started. diff --git a/docs/source/faiss_es.mdx b/docs/source/faiss_es.mdx index 9ee6565df94..635051744de 100644 --- a/docs/source/faiss_es.mdx +++ b/docs/source/faiss_es.mdx @@ -22,7 +22,7 @@ FAISS retrieves documents based on the similarity of their vector representation ```py >>> from datasets import load_dataset ->>> ds = load_dataset('crime_and_punish', split='train[:100]') +>>> ds = load_dataset('community-datasets/crime_and_punish', split='train[:100]') >>> ds_with_embeddings = ds.map(lambda example: {'embeddings': ctx_encoder(**ctx_tokenizer(example["line"], return_tensors="pt"))[0][0].numpy()}) ``` @@ -62,7 +62,7 @@ FAISS retrieves documents based on the similarity of their vector representation 7. Reload it at a later time with [`Dataset.load_faiss_index`]: ```py ->>> ds = load_dataset('crime_and_punish', split='train[:100]') +>>> ds = load_dataset('community-datasets/crime_and_punish', split='train[:100]') >>> ds.load_faiss_index('embeddings', 'my_index.faiss') ``` diff --git a/docs/source/image_load.mdx b/docs/source/image_load.mdx index 676b3f51653..67c7eff9684 100644 --- a/docs/source/image_load.mdx +++ b/docs/source/image_load.mdx @@ -10,7 +10,7 @@ When you load an image dataset and call the image column, the images are decoded ```py >>> from datasets import load_dataset, Image ->>> dataset = load_dataset("beans", split="train") +>>> dataset = load_dataset("AI-Lab-Makerere/beans", split="train") >>> dataset[0]["image"] ``` @@ -33,7 +33,7 @@ You can load a dataset from the image path. 
Use the [`~Dataset.cast_column`] fun If you only want to load the underlying path to the image dataset without decoding the image object, set `decode=False` in the [`Image`] feature: ```py ->>> dataset = load_dataset("beans", split="train").cast_column("image", Image(decode=False)) +>>> dataset = load_dataset("AI-Lab-Makerere/beans", split="train").cast_column("image", Image(decode=False)) >>> dataset[0]["image"] {'bytes': None, 'path': '/root/.cache/huggingface/datasets/downloads/extracted/b0a21163f78769a2cf11f58dfc767fb458fc7cea5c05dccc0144a2c0f0bc1292/train/bean_rust/bean_rust_train.29.jpg'} diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index eb73ab84b5a..74e3a8e383d 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -327,7 +327,7 @@ Select specific rows of the `train` split: ```py >>> train_10_20_ds = datasets.load_dataset("ajibawa-2023/General-Stories-Collection", split="train[10:20]") ===STRINGAPI-READINSTRUCTION-SPLIT=== ->>> train_10_20_ds = datasets.load_dataset("bookcorpu", split=datasets.ReadInstruction("train", from_=10, to=20, unit="abs")) +>>> train_10_20_ds = datasets.load_dataset("rojagtap/bookcorpus", split=datasets.ReadInstruction("train", from_=10, to=20, unit="abs")) ``` Or select a percentage of a split with: diff --git a/docs/source/object_detection.mdx b/docs/source/object_detection.mdx index f612de28fdc..f1360e3fa95 100644 --- a/docs/source/object_detection.mdx +++ b/docs/source/object_detection.mdx @@ -8,14 +8,14 @@ To run these examples, make sure you have up-to-date versions of [albumentations pip install -U albumentations opencv-python ``` -In this example, you'll use the [`cppe-5`](https://huggingface.co/datasets/cppe-5) dataset for identifying medical personal protective equipment (PPE) in the context of the COVID-19 pandemic. +In this example, you'll use the [`cppe-5`](https://huggingface.co/datasets/rishitdagli/cppe-5) dataset for identifying medical personal protective equipment (PPE) in the context of the COVID-19 pandemic. Load the dataset and take a look at an example: ```py >>> from datasets import load_dataset ->>> ds = load_dataset("cppe-5") +>>> ds = load_dataset("rishitdagli/cppe-5") >>> example = ds['train'][0] >>> example {'height': 663, diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 6be8bee907c..a6f2dc25bef 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -288,7 +288,7 @@ pip install -U albumentations opencv-python ## NLP -Text needs to be tokenized into individual tokens by a [tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer). For the quickstart, you'll load the [Microsoft Research Paraphrase Corpus (MRPC)](https://huggingface.co/datasets/glue/viewer/mrpc) training dataset to train a model to determine whether a pair of sentences mean the same thing. +Text needs to be tokenized into individual tokens by a [tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer). For the quickstart, you'll load the [Microsoft Research Paraphrase Corpus (MRPC)](https://huggingface.co/datasets/nyu-mll/glue/viewer/mrpc) training dataset to train a model to determine whether a pair of sentences mean the same thing. **1**. 
Load the MRPC dataset by providing the [`load_dataset`] function with the dataset name, dataset configuration (not all datasets will have a configuration), and dataset split: diff --git a/docs/source/stream.mdx b/docs/source/stream.mdx index 3abfbdbc3cc..b721b0959c4 100644 --- a/docs/source/stream.mdx +++ b/docs/source/stream.mdx @@ -160,11 +160,11 @@ You can split your dataset one of two ways: 🤗 Datasets supports sharding to divide a very large dataset into a predefined number of chunks. Specify the `num_shards` parameter in [`~IterableDataset.shard`] to determine the number of shards to split the dataset into. You'll also need to provide the shard you want to return with the `index` parameter. -For example, the [amazon_polarity](https://huggingface.co/datasets/amazon_polarity) dataset has 4 shards (in this case they are 4 Parquet files): +For example, the [amazon_polarity](https://huggingface.co/datasets/fancyzhx/amazon_polarity) dataset has 4 shards (in this case they are 4 Parquet files): ```py >>> from datasets import load_dataset ->>> dataset = load_dataset("amazon_polarity", split="train", streaming=True) +>>> dataset = load_dataset("fancyzhx/amazon_polarity", split="train", streaming=True) >>> print(dataset) IterableDataset({ features: ['label', 'title', 'content'], diff --git a/docs/source/use_with_jax.mdx b/docs/source/use_with_jax.mdx index a38dc7928ad..cb0a763ab7c 100644 --- a/docs/source/use_with_jax.mdx +++ b/docs/source/use_with_jax.mdx @@ -195,11 +195,11 @@ part. The easiest way to get JAX arrays out of a dataset is to use the `with_format('jax')` method. Lets assume that we want to train a neural network on the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) available -at the HuggingFace Hub at https://huggingface.co/datasets/mnist. +at the HuggingFace Hub at https://huggingface.co/datasets/ylecun/mnist. ```py >>> from datasets import load_dataset ->>> ds = load_dataset("mnist") +>>> ds = load_dataset("ylecun/mnist") >>> ds = ds.with_format("jax") >>> ds["train"][0] {'image': DeviceArray([[ 0, 0, 0, ...], diff --git a/docs/source/use_with_numpy.mdx b/docs/source/use_with_numpy.mdx index bd0cd6877b7..b3ba45864e8 100644 --- a/docs/source/use_with_numpy.mdx +++ b/docs/source/use_with_numpy.mdx @@ -160,7 +160,7 @@ at the HuggingFace Hub at https://huggingface.co/datasets/mnist. 
```py >>> from datasets import load_dataset ->>> ds = load_dataset("mnist") +>>> ds = load_dataset("ylecun/mnist") >>> ds = ds.with_format("numpy") >>> ds["train"][0] {'image': array([[ 0, 0, 0, ...], diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 250442a0ed8..36b744a024a 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1970,7 +1970,7 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data ```py >>> from datasets import load_dataset - >>> ds = load_dataset("boolq", split="validation") + >>> ds = load_dataset("google/boolq", split="validation") >>> ds.features {'answer': Value('bool'), 'passage': Value('string'), @@ -4725,7 +4725,7 @@ def train_test_split( >>> ds = ds.train_test_split(test_size=0.2, seed=42) # stratified split - >>> ds = load_dataset("imdb",split="train") + >>> ds = load_dataset("stanfordnlp/imdb",split="train") Dataset({ features: ['text', 'label'], num_rows: 25000 @@ -6175,7 +6175,7 @@ def add_faiss_index( Example: ```python - >>> ds = datasets.load_dataset('crime_and_punish', split='train') + >>> ds = datasets.load_dataset('community-datasets/crime_and_punish', split='train') >>> ds_with_embeddings = ds.map(lambda example: {'embeddings': embed(example['line']})) >>> ds_with_embeddings.add_faiss_index(column='embeddings') >>> # query @@ -6183,7 +6183,7 @@ def add_faiss_index( >>> # save index >>> ds_with_embeddings.save_faiss_index('embeddings', 'my_index.faiss') - >>> ds = datasets.load_dataset('crime_and_punish', split='train') + >>> ds = datasets.load_dataset('community-datasets/crime_and_punish', split='train') >>> # load index >>> ds.load_faiss_index('embeddings', 'my_index.faiss') >>> # query @@ -6314,7 +6314,7 @@ def add_elasticsearch_index( ```python >>> es_client = elasticsearch.Elasticsearch() - >>> ds = datasets.load_dataset('crime_and_punish', split='train') + >>> ds = datasets.load_dataset('community-datasets/crime_and_punish', split='train') >>> ds.add_elasticsearch_index(column='line', es_client=es_client, es_index_name="my_es_index") >>> scores, retrieved_examples = ds.get_nearest_examples('line', 'my new query', k=10) ``` diff --git a/src/datasets/arrow_reader.py b/src/datasets/arrow_reader.py index 3bbb58a59c3..d9cf2cf0f4b 100644 --- a/src/datasets/arrow_reader.py +++ b/src/datasets/arrow_reader.py @@ -459,34 +459,34 @@ class ReadInstruction: Examples:: # The following lines are equivalent: - ds = datasets.load_dataset('mnist', split='test[:33%]') - ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]')) - ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%')) - ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction( + ds = datasets.load_dataset('ylecun/mnist', split='test[:33%]') + ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction.from_spec('test[:33%]')) + ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction('test', to=33, unit='%')) + ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction( 'test', from_=0, to=33, unit='%')) # The following lines are equivalent: - ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]') - ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec( + ds = datasets.load_dataset('ylecun/mnist', split='test[:33%]+train[1:-1]') + ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction.from_spec( 'test[:33%]+train[1:-1]')) - ds = 
datasets.load_dataset('mnist', split=( + ds = datasets.load_dataset('ylecun/mnist', split=( datasets.ReadInstruction('test', to=33, unit='%') + datasets.ReadInstruction('train', from_=1, to=-1, unit='abs'))) # The following lines are equivalent: - ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)') - ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec( + ds = datasets.load_dataset('ylecun/mnist', split='test[:33%](pct1_dropremainder)') + ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction.from_spec( 'test[:33%](pct1_dropremainder)')) - ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction( + ds = datasets.load_dataset('ylecun/mnist', split=datasets.ReadInstruction( 'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder")) # 10-fold validation: tests = datasets.load_dataset( - 'mnist', + 'ylecun/mnist', [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%') for k in range(0, 100, 10)]) trains = datasets.load_dataset( - 'mnist', + 'ylecun/mnist', [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%') for k in range(0, 100, 10)]) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 63a93429c45..995103d26e0 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -515,7 +515,7 @@ def class_encode_column(self, column: str, include_nulls: bool = False) -> "Data ```py >>> from datasets import load_dataset - >>> ds = load_dataset("boolq") + >>> ds = load_dataset("google/boolq") >>> ds["train"].features {'answer': Value('bool'), 'passage': Value('string'), diff --git a/src/datasets/download/download_manager.py b/src/datasets/download/download_manager.py index b6ee1d28e2b..4e84d9947fe 100644 --- a/src/datasets/download/download_manager.py +++ b/src/datasets/download/download_manager.py @@ -269,7 +269,7 @@ def iter_files(self, paths: Union[str, list[str]]): Example: ```py - >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip') + >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/train.zip') >>> files = dl_manager.iter_files(files) ``` """ diff --git a/src/datasets/download/streaming_download_manager.py b/src/datasets/download/streaming_download_manager.py index ff2dc1a64bd..6f4c6087027 100644 --- a/src/datasets/download/streaming_download_manager.py +++ b/src/datasets/download/streaming_download_manager.py @@ -206,7 +206,7 @@ def iter_files(self, urlpaths: Union[str, list[str]]) -> Iterable[str]: Example: ```py - >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip') + >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/train.zip') >>> files = dl_manager.iter_files(files) ``` """ diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 9ac842d2c22..26c35a60555 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -3218,7 +3218,7 @@ def shard( ```py >>> from datasets import load_dataset - >>> ds = load_dataset("amazon_polarity", split="train", streaming=True) + >>> ds = load_dataset("fancyzhx/amazon_polarity", split="train", streaming=True) >>> ds Dataset({ features: ['label', 'title', 'content'], diff --git a/src/datasets/utils/patching.py b/src/datasets/utils/patching.py index 
f245cabd970..69563f562e4 100644 --- a/src/datasets/utils/patching.py +++ b/src/datasets/utils/patching.py @@ -28,7 +28,7 @@ class patch_submodule: >>> from datasets.load import dataset_module_factory >>> from datasets.streaming import patch_submodule, xjoin >>> - >>> dataset_module = dataset_module_factory("snli") + >>> dataset_module = dataset_module_factory("stanfordnlp/snli") >>> snli_module = importlib.import_module(dataset_module.module_path) >>> patcher = patch_submodule(snli_module, "os.path.join", xjoin) >>> patcher.start() diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py index b6b45e1812f..cf9111fa6d9 100644 --- a/tests/test_metadata_util.py +++ b/tests/test_metadata_util.py @@ -282,7 +282,7 @@ def test_split_order_in_metadata_configs_from_exported_parquet_files_and_dataset "dataset": "AI-Lab-Makerere/beans", "config": "default", "split": "test", - "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet", + "url": "https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet", "filename": "0000.parquet", "size": 17707203, }, @@ -290,7 +290,7 @@ def test_split_order_in_metadata_configs_from_exported_parquet_files_and_dataset "dataset": "AI-Lab-Makerere/beans", "config": "default", "split": "train", - "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet", + "url": "https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet", "filename": "0000.parquet", "size": 143780164, }, @@ -298,7 +298,7 @@ def test_split_order_in_metadata_configs_from_exported_parquet_files_and_dataset "dataset": "AI-Lab-Makerere/beans", "config": "default", "split": "validation", - "url": "https://huggingface.co/datasets/beans/resolve/refs%2Fconvert%2Fparquet/default/validation/0000.parquet", + "url": "https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/refs%2Fconvert%2Fparquet/default/validation/0000.parquet", "filename": "0000.parquet", "size": 18500862, }, @@ -332,15 +332,15 @@ def test_split_order_in_metadata_configs_from_exported_parquet_files_and_dataset }, }, download_checksums={ - "https://huggingface.co/datasets/beans/resolve/main/data/train.zip": { + "https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/train.zip": { "num_bytes": 143812152, "checksum": None, }, - "https://huggingface.co/datasets/beans/resolve/main/data/validation.zip": { + "https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/validation.zip": { "num_bytes": 18504213, "checksum": None, }, - "https://huggingface.co/datasets/beans/resolve/main/data/test.zip": { + "https://huggingface.co/datasets/AI-Lab-Makerere/beans/resolve/main/data/test.zip": { "num_bytes": 17708541, "checksum": None, }, From c97e757836d16d4083ae057b03e22747c2ffe477 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Fri, 21 Nov 2025 13:31:18 +0100 Subject: [PATCH 39/52] Nifti visualization support (#7874) * WIP: nifti vis working, now improve * seems to work fine, tests not there yet * remove uncommented lines --- src/datasets/features/nifti.py | 67 +++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 6 deletions(-) diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py index f63b8cf6aa1..2cfbefdfaad 100644 --- a/src/datasets/features/nifti.py +++ b/src/datasets/features/nifti.py @@ -1,6 +1,7 @@ +import 
base64 import os +import uuid from dataclasses import dataclass, field -from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union @@ -18,6 +19,62 @@ from .features import FeatureType +if config.NIBABEL_AVAILABLE: + import nibabel as nib + + class Nifti1ImageWrapper(nib.nifti1.Nifti1Image): + """ + A wrapper around nibabel's Nifti1Image to customize its representation. + """ + + def __init__(self, nifti_image: nib.nifti1.Nifti1Image): + super().__init__( + dataobj=nifti_image.get_fdata(), + affine=nifti_image.affine, + header=nifti_image.header, + extra=nifti_image.extra, + file_map=nifti_image.file_map, + dtype=nifti_image.get_data_dtype(), + ) + self.nifti_image = nifti_image + + def _repr_html_(self): + bytes_ = self.nifti_image.to_bytes() + b64 = base64.b64encode(bytes_).decode("utf-8") + + self.nifti_data_url = f"data:application/octet-stream;base64,{b64}" + viewer_id = f"papaya-{uuid.uuid4().hex[:8]}" + + html = f""" +
+ + + + """ + return html + @dataclass class Nifti: @@ -106,7 +163,7 @@ def encode_example(self, value: Union[str, bytes, bytearray, dict, "nib.Nifti1Im f"A nifti sample should be a string, bytes, Path, nibabel image, or dict, but got {type(value)}." ) - def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nifti1Image": + def decode_example(self, value: dict, token_per_repo_id=None) -> "Nifti1ImageWrapper": """Decode example NIfTI file into nibabel image object. Args: @@ -165,11 +222,9 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "nib.nifti1.Nif ): # gzip magic number, see https://stackoverflow.com/a/76055284/9534390 or "Magic number" on https://en.wikipedia.org/wiki/Gzip bytes_ = gzip.decompress(bytes_) - bio = BytesIO(bytes_) - fh = nib.FileHolder(fileobj=bio) - nifti = nib.Nifti1Image.from_file_map({"header": fh, "image": fh}) + nifti = nib.Nifti1Image.from_bytes(bytes_) - return nifti + return Nifti1ImageWrapper(nifti) def embed_storage(self, storage: pa.StructArray, token_per_repo_id=None) -> pa.StructArray: """Embed NifTI files into the Arrow array. From 004a5bf4addd9293d6d40f43360c03c8f7e42b28 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:00:19 +0100 Subject: [PATCH 40/52] Replace papaya with niivue (#7878) * try latest papaya * try niivue * update repr_html for nifti to work better with niivue * remove papaya files * remove papaya from setup.py * use ipyniivue * update nifti feature to use ipyniivue * add 3d crosshair for orientation * remove docstring --- setup.py | 2 +- src/datasets/features/nifti.py | 56 ++++++++++++---------------------- 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/setup.py b/setup.py index 2f626763113..30d66fc54db 100644 --- a/setup.py +++ b/setup.py @@ -208,7 +208,7 @@ PDFS_REQUIRE = ["pdfplumber>=0.11.4"] -NIBABEL_REQUIRE = ["nibabel>=5.3.2"] +NIBABEL_REQUIRE = ["nibabel>=5.3.2", "ipyniivue==2.4.2"] EXTRAS_REQUIRE = { "audio": AUDIO_REQUIRE, diff --git a/src/datasets/features/nifti.py b/src/datasets/features/nifti.py index 2cfbefdfaad..3b118d1cc12 100644 --- a/src/datasets/features/nifti.py +++ b/src/datasets/features/nifti.py @@ -1,6 +1,4 @@ -import base64 import os -import uuid from dataclasses import dataclass, field from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union @@ -39,41 +37,27 @@ def __init__(self, nifti_image: nib.nifti1.Nifti1Image): self.nifti_image = nifti_image def _repr_html_(self): + from ipyniivue import NiiVue, ShowRender, SliceType, Volume + from IPython.display import display + bytes_ = self.nifti_image.to_bytes() - b64 = base64.b64encode(bytes_).decode("utf-8") - - self.nifti_data_url = f"data:application/octet-stream;base64,{b64}" - viewer_id = f"papaya-{uuid.uuid4().hex[:8]}" - - html = f""" -
- - - - """ - return html + nv = NiiVue() + nv.set_slice_type(SliceType.MULTIPLANAR) + nv.opts.multiplanar_show_render = ShowRender.ALWAYS + nv.opts.show_3d_crosshair = True + nv.opts.multiplanar_force_render = True + name = None + if hasattr(self.nifti_image, "file_map"): + if ( + "image" in self.nifti_image.file_map + and getattr(self.nifti_image.file_map["image"], "filename", None) is not None + ): + name = self.nifti_image.file_map["image"].filename + if name is None: + name = "volume.nii.gz" + volume = Volume(name=name, data=bytes_) + nv.load_volumes([volume]) + display(nv) @dataclass From b8291fcbb8f9f90175673e852df40b70f7bf57bf Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:26:02 -0500 Subject: [PATCH 41/52] feat(bids): add pybids optional dependency and config check --- setup.py | 3 +++ src/datasets/config.py | 1 + 2 files changed, 4 insertions(+) diff --git a/setup.py b/setup.py index 30d66fc54db..70042fd5a57 100644 --- a/setup.py +++ b/setup.py @@ -210,6 +210,8 @@ NIBABEL_REQUIRE = ["nibabel>=5.3.2", "ipyniivue==2.4.2"] +PYBIDS_REQUIRE = ["pybids>=0.21.0"] + EXTRAS_REQUIRE = { "audio": AUDIO_REQUIRE, "vision": VISION_REQUIRE, @@ -228,6 +230,7 @@ "docs": DOCS_REQUIRE, "pdfs": PDFS_REQUIRE, "nibabel": NIBABEL_REQUIRE, + "bids": PYBIDS_REQUIRE, } setup( diff --git a/src/datasets/config.py b/src/datasets/config.py index b6412682727..2df571e4b8f 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -140,6 +140,7 @@ TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None +PYBIDS_AVAILABLE = importlib.util.find_spec("bids") is not None # Optional compression tools RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None From ea343940bd6ed9dc34ad55116cee68ec3a8e9824 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:26:02 -0500 Subject: [PATCH 42/52] test(bids): add synthetic BIDS dataset fixtures --- tests/packaged_modules/test_bids.py | 100 ++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 tests/packaged_modules/test_bids.py diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py new file mode 100644 index 00000000000..90f72d7500f --- /dev/null +++ b/tests/packaged_modules/test_bids.py @@ -0,0 +1,100 @@ +import json +import pytest +import numpy as np +import datasets.config + +@pytest.fixture +def minimal_bids_dataset(tmp_path): + """Minimal valid BIDS dataset with one subject, one T1w scan.""" + # dataset_description.json (required) + (tmp_path / "dataset_description.json").write_text(json.dumps({ + "Name": "Test BIDS Dataset", + "BIDSVersion": "1.10.1" + })) + + # Create subject/anat folder + anat_dir = tmp_path / "sub-01" / "anat" + anat_dir.mkdir(parents=True) + + # Create dummy NIfTI + if datasets.config.NIBABEL_AVAILABLE: + import nibabel as nib + data = np.zeros((4, 4, 4), dtype=np.float32) + img = nib.Nifti1Image(data, np.eye(4)) + nib.save(img, str(anat_dir / "sub-01_T1w.nii.gz")) + else: + # Fallback if nibabel not available (shouldn't happen in test env ideally) + (anat_dir / "sub-01_T1w.nii.gz").write_bytes(b"DUMMY NIFTI CONTENT") + + # JSON sidecar + (anat_dir / "sub-01_T1w.json").write_text(json.dumps({"RepetitionTime": 2.0})) + + return str(tmp_path) + + +@pytest.fixture +def multi_subject_bids(tmp_path): + """BIDS dataset with multiple subjects and sessions.""" 
+ (tmp_path / "dataset_description.json").write_text(json.dumps({ + "Name": "Multi-Subject Test", + "BIDSVersion": "1.10.1" + })) + + data = np.zeros((4, 4, 4), dtype=np.float32) + + if datasets.config.NIBABEL_AVAILABLE: + import nibabel as nib + else: + nib = None + + for sub in ["01", "02"]: + for ses in ["baseline", "followup"]: + anat_dir = tmp_path / f"sub-{sub}" / f"ses-{ses}" / "anat" + anat_dir.mkdir(parents=True) + + file_path = anat_dir / f"sub-{sub}_ses-{ses}_T1w.nii.gz" + if nib: + img = nib.Nifti1Image(data, np.eye(4)) + nib.save(img, str(file_path)) + else: + file_path.write_bytes(b"DUMMY NIFTI CONTENT") + + (anat_dir / f"sub-{sub}_ses-{ses}_T1w.json").write_text( + return str(tmp_path) + + +def test_bids_module_imports(): + from datasets.packaged_modules.bids import Bids, BidsConfig + assert Bids is not None + assert BidsConfig is not None + + +def test_bids_requires_pybids(monkeypatch): + """Test helpful error when pybids not installed.""" + from datasets.packaged_modules.bids.bids import Bids + import datasets.config + + monkeypatch.setattr(datasets.config, "PYBIDS_AVAILABLE", False) + + builder = Bids() + with pytest.raises(ImportError, match="pybids"): + builder._info() + + +@pytest.mark.skipif( + not datasets.config.PYBIDS_AVAILABLE, + reason="pybids not installed" +) +def test_bids_loads_single_subject(minimal_bids_dataset): + from datasets import load_dataset + + ds = load_dataset("bids", data_dir=minimal_bids_dataset, trust_remote_code=True) + + assert "train" in ds + assert len(ds["train"]) == 1 + + sample = ds["train"][0] + assert sample["subject"] == "01" + assert sample["suffix"] == "T1w" + assert sample["datatype"] == "anat" + assert sample["session"] is None From f4418221becaa76311705ef9b559587ceadeb898 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:26:02 -0500 Subject: [PATCH 43/52] feat(bids): implement basic BIDS loader module --- src/datasets/packaged_modules/__init__.py | 2 + .../packaged_modules/bids/__init__.py | 1 + src/datasets/packaged_modules/bids/bids.py | 84 +++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 src/datasets/packaged_modules/bids/__init__.py create mode 100644 src/datasets/packaged_modules/bids/bids.py diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index 9d076df44b7..c43c5f1b460 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -6,6 +6,7 @@ from .arrow import arrow from .audiofolder import audiofolder +from .bids import bids from .cache import cache from .csv import csv from .hdf5 import hdf5 @@ -48,6 +49,7 @@ def _hash_python_lines(lines: list[str]) -> str: "videofolder": (videofolder.__name__, _hash_python_lines(inspect.getsource(videofolder).splitlines())), "pdffolder": (pdffolder.__name__, _hash_python_lines(inspect.getsource(pdffolder).splitlines())), "niftifolder": (niftifolder.__name__, _hash_python_lines(inspect.getsource(niftifolder).splitlines())), + "bids": (bids.__name__, _hash_python_lines(inspect.getsource(bids).splitlines())), "webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())), "xml": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())), "hdf5": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())), diff --git a/src/datasets/packaged_modules/bids/__init__.py b/src/datasets/packaged_modules/bids/__init__.py new file mode 100644 index 00000000000..1d167b51030 --- /dev/null 
+++ b/src/datasets/packaged_modules/bids/__init__.py @@ -0,0 +1 @@ +from .bids import Bids, BidsConfig diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py new file mode 100644 index 00000000000..6d1bcaf277c --- /dev/null +++ b/src/datasets/packaged_modules/bids/bids.py @@ -0,0 +1,84 @@ +import json +from dataclasses import dataclass +from typing import Optional + +import datasets +from datasets import config + + +logger = datasets.utils.logging.get_logger(__name__) + + +@dataclass +class BidsConfig(datasets.BuilderConfig): + """BuilderConfig for BIDS datasets.""" + data_dir: Optional[str] = None + database_path: Optional[str] = None # For pybids caching + + +class Bids(datasets.GeneratorBasedBuilder): + """BIDS dataset loader using pybids.""" + + BUILDER_CONFIG_CLASS = BidsConfig + + def _info(self): + if not config.PYBIDS_AVAILABLE: + raise ImportError( + "To load BIDS datasets, please install pybids: pip install pybids" + ) + + return datasets.DatasetInfo( + features=datasets.Features({ + "subject": datasets.Value("string"), + "session": datasets.Value("string"), + "datatype": datasets.Value("string"), + "suffix": datasets.Value("string"), + "task": datasets.Value("string"), + "run": datasets.Value("string"), + "path": datasets.Value("string"), + "nifti": datasets.Nifti(), + "metadata": datasets.Value("string"), + }) + ) + + def _split_generators(self, dl_manager): + from bids import BIDSLayout + + if not self.config.data_dir: + raise ValueError("data_dir is required for BIDS datasets") + + layout = BIDSLayout( + self.config.data_dir, + database_path=self.config.database_path, + validate=False, # Don't fail on minor validation issues + ) + + # Get all NIfTI files + nifti_files = layout.get(extension=[".nii", ".nii.gz"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"layout": layout, "files": nifti_files}, + ) + ] + + def _generate_examples(self, layout, files): + for idx, bids_file in enumerate(files): + entities = bids_file.get_entities() + + # Get JSON sidecar metadata + metadata = layout.get_metadata(bids_file.path) + metadata_str = json.dumps(metadata) if metadata else "{}" + + yield idx, { + "subject": entities.get("subject"), + "session": entities.get("session"), + "datatype": entities.get("datatype"), + "suffix": entities.get("suffix"), + "task": entities.get("task"), + "run": str(entities.get("run")) if entities.get("run") else None, + "path": bids_file.path, + "nifti": bids_file.path, + "metadata": metadata_str, + } \ No newline at end of file From d06fcd0489a575d499a9973e2efd9f665b3472e1 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:26:31 -0500 Subject: [PATCH 44/52] fix(test): repair syntax in BIDS test fixture --- tests/packaged_modules/test_bids.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py index 90f72d7500f..17b7112c94f 100644 --- a/tests/packaged_modules/test_bids.py +++ b/tests/packaged_modules/test_bids.py @@ -60,6 +60,9 @@ def multi_subject_bids(tmp_path): file_path.write_bytes(b"DUMMY NIFTI CONTENT") (anat_dir / f"sub-{sub}_ses-{ses}_T1w.json").write_text( + json.dumps({"RepetitionTime": 2.0}) + ) + return str(tmp_path) From 34be5a40bb31160573334027a7e703dbb35d1bc9 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:27:17 -0500 Subject: [PATCH 45/52] fix(test): handle Bids init exception --- tests/packaged_modules/test_bids.py | 3 +-- 1 
file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py index 17b7112c94f..42e60bbf35b 100644 --- a/tests/packaged_modules/test_bids.py +++ b/tests/packaged_modules/test_bids.py @@ -79,9 +79,8 @@ def test_bids_requires_pybids(monkeypatch): monkeypatch.setattr(datasets.config, "PYBIDS_AVAILABLE", False) - builder = Bids() with pytest.raises(ImportError, match="pybids"): - builder._info() + Bids() @pytest.mark.skipif( From 67a6b6b9741325a7fe5d38e653158a15784456d9 Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:27:49 -0500 Subject: [PATCH 46/52] feat(bids): add subject/session/datatype filtering --- src/datasets/packaged_modules/bids/bids.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py index 6d1bcaf277c..56489fb136d 100644 --- a/src/datasets/packaged_modules/bids/bids.py +++ b/src/datasets/packaged_modules/bids/bids.py @@ -14,6 +14,9 @@ class BidsConfig(datasets.BuilderConfig): """BuilderConfig for BIDS datasets.""" data_dir: Optional[str] = None database_path: Optional[str] = None # For pybids caching + subjects: Optional[list[str]] = None # Filter by subject + sessions: Optional[list[str]] = None # Filter by session + datatypes: Optional[list[str]] = None # Filter by datatype class Bids(datasets.GeneratorBasedBuilder): @@ -53,8 +56,17 @@ def _split_generators(self, dl_manager): validate=False, # Don't fail on minor validation issues ) + # Build query kwargs + query = {"extension": [".nii", ".nii.gz"]} + if self.config.subjects: + query["subject"] = self.config.subjects + if self.config.sessions: + query["session"] = self.config.sessions + if self.config.datatypes: + query["datatype"] = self.config.datatypes + # Get all NIfTI files - nifti_files = layout.get(extension=[".nii", ".nii.gz"]) + nifti_files = layout.get(**query) return [ datasets.SplitGenerator( From 2305c2a335c43149181aa820daa61541fd902a2b Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:27:49 -0500 Subject: [PATCH 47/52] test(bids): add multi-subject filtering test --- tests/packaged_modules/test_bids.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py index 42e60bbf35b..ee2a4608672 100644 --- a/tests/packaged_modules/test_bids.py +++ b/tests/packaged_modules/test_bids.py @@ -100,3 +100,21 @@ def test_bids_loads_single_subject(minimal_bids_dataset): assert sample["suffix"] == "T1w" assert sample["datatype"] == "anat" assert sample["session"] is None + + +@pytest.mark.skipif( + not datasets.config.PYBIDS_AVAILABLE, + reason="pybids not installed" +) +def test_bids_multi_subject(multi_subject_bids): + from datasets import load_dataset + + ds = load_dataset("bids", data_dir=multi_subject_bids, trust_remote_code=True) + + assert len(ds["train"]) == 4 # 2 subjects × 2 sessions + + subjects = set(sample["subject"] for sample in ds["train"]) + assert subjects == {"01", "02"} + + sessions = set(sample["session"] for sample in ds["train"]) + assert sessions == {"baseline", "followup"} From 962ee8b5ad6c42daf96ac9f2ef05cdc2cce1407c Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:28:16 -0500 Subject: [PATCH 48/52] feat(bids): add validation and error handling --- src/datasets/packaged_modules/bids/bids.py | 16 ++++++++++++++++ 1 file changed, 16 
insertions(+) diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py index 56489fb136d..d3918c9774f 100644 --- a/src/datasets/packaged_modules/bids/bids.py +++ b/src/datasets/packaged_modules/bids/bids.py @@ -45,11 +45,21 @@ def _info(self): ) def _split_generators(self, dl_manager): + import os from bids import BIDSLayout if not self.config.data_dir: raise ValueError("data_dir is required for BIDS datasets") + if not os.path.isdir(self.config.data_dir): + raise ValueError(f"data_dir does not exist: {self.config.data_dir}") + + desc_file = os.path.join(self.config.data_dir, "dataset_description.json") + if not os.path.exists(desc_file): + raise ValueError( + f"Not a valid BIDS dataset: missing dataset_description.json in {self.config.data_dir}" + ) + layout = BIDSLayout( self.config.data_dir, database_path=self.config.database_path, @@ -68,6 +78,12 @@ def _split_generators(self, dl_manager): # Get all NIfTI files nifti_files = layout.get(**query) + if not nifti_files: + logger.warning( + f"No NIfTI files found in {self.config.data_dir} with filters: {query}. " + "Check that the dataset is valid BIDS and filters match existing data." + ) + return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, From 6425d93b4d365630f5a6ddfaf34412794bb2e6db Mon Sep 17 00:00:00 2001 From: The-Obstacle-Is-The-Way Date: Fri, 28 Nov 2025 19:29:48 -0500 Subject: [PATCH 49/52] docs(bids): add BIDS loading guide --- docs/source/_toctree.yml | 2 ++ docs/source/bids_dataset.mdx | 63 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 docs/source/bids_dataset.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index cc6b7195fe2..58189f1fd29 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -90,6 +90,8 @@ title: Create a document dataset - local: nifti_dataset title: Create a medical imaging dataset + - local: bids_dataset + title: Load a BIDS dataset title: "Vision" - sections: - local: nlp_load diff --git a/docs/source/bids_dataset.mdx b/docs/source/bids_dataset.mdx new file mode 100644 index 00000000000..62ca79ae770 --- /dev/null +++ b/docs/source/bids_dataset.mdx @@ -0,0 +1,63 @@ +# BIDS Dataset + +[BIDS (Brain Imaging Data Structure)](https://bids.neuroimaging.io/) is a standard for organizing and describing neuroimaging and behavioral data. The `datasets` library supports loading BIDS datasets directly, leveraging `pybids` for parsing and `nibabel` for handling NIfTI files. + + + +To use the BIDS loader, you need to install the `bids` extra: + +```bash +pip install datasets[bids] +``` + + + +## Loading a BIDS Dataset + +You can load a BIDS dataset by pointing to its root directory (containing `dataset_description.json`): + +```python +from datasets import load_dataset + +# Load a local BIDS dataset +ds = load_dataset("bids", data_dir="/path/to/bids/dataset") + +# Access the first example +print(ds["train"][0]) +# { +# 'subject': '01', +# 'session': 'baseline', +# 'datatype': 'anat', +# 'suffix': 'T1w', +# 'nifti': , +# ... +# } +``` + +The `nifti` column contains `nibabel` image objects, which can be visualized interactively in Jupyter notebooks. 
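Beyond visualization, you can work with the decoded image directly. The snippet below is a minimal sketch that continues the loading example above (it assumes the dataset yielded at least one scan); `get_fdata()`, `affine`, and `shape` are standard `nibabel` accessors, and `subject`/`path` are fields produced by the loader:

```python
sample = ds["train"][0]

img = sample["nifti"]                      # decoded nibabel image object
data = img.get_fdata()                     # voxel data as a floating-point NumPy array
print(data.shape)                          # spatial dimensions of the scan
print(img.affine)                          # 4x4 voxel-to-world affine matrix

print(sample["subject"], sample["path"])   # BIDS entity and the original file path
```

Since the decoded object subclasses `nibabel`'s `Nifti1Image` (see the `Nifti` feature code earlier in this series), the usual `nibabel` methods apply to it as well.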
+
+## Filtering
+
+You can filter the dataset by BIDS entities like `subject`, `session`, and `datatype` when loading:
+
+```python
+# Load only specific subjects and datatypes
+ds = load_dataset(
+    "bids",
+    data_dir="/path/to/bids/dataset",
+    subjects=["01", "05", "10"],
+    sessions=["pre", "post"],
+    datatypes=["func"],
+)
+```
+
+## Metadata
+
+BIDS datasets often include JSON sidecar files with metadata (e.g., scanner parameters). This metadata is loaded into the `metadata` column as a JSON string.
+
+```python
+import json
+
+metadata = json.loads(ds["train"][0]["metadata"])
+print(metadata["RepetitionTime"])
+```

From b748207c5ed8ec00dcc041da96367b1d15053434 Mon Sep 17 00:00:00 2001
From: The-Obstacle-Is-The-Way
Date: Fri, 28 Nov 2025 19:38:26 -0500
Subject: [PATCH 50/52] fix(bids): lint and format fixes, remove deprecated trust_remote_code

- Remove deprecated `trust_remote_code=True` from tests (not needed for packaged modules)
- Fix ruff linting errors (import sorting, trailing newlines)
- Apply ruff formatter for consistent code style
- Convert set() generators to set comprehensions (C401)
---
 src/datasets/packaged_modules/bids/bids.py | 59 ++++++++++++----------
 tests/packaged_modules/test_bids.py        | 51 +++++++++----------
 2 files changed, 54 insertions(+), 56 deletions(-)

diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py
index d3918c9774f..278828c5e58 100644
--- a/src/datasets/packaged_modules/bids/bids.py
+++ b/src/datasets/packaged_modules/bids/bids.py
@@ -12,6 +12,7 @@
 @dataclass
 class BidsConfig(datasets.BuilderConfig):
     """BuilderConfig for BIDS datasets."""
+
     data_dir: Optional[str] = None
     database_path: Optional[str] = None  # For pybids caching
     subjects: Optional[list[str]] = None  # Filter by subject
@@ -26,26 +27,27 @@ class Bids(datasets.GeneratorBasedBuilder):
 
     def _info(self):
         if not config.PYBIDS_AVAILABLE:
-            raise ImportError(
-                "To load BIDS datasets, please install pybids: pip install pybids"
-            )
+            raise ImportError("To load BIDS datasets, please install pybids: pip install pybids")
 
         return datasets.DatasetInfo(
-            features=datasets.Features({
-                "subject": datasets.Value("string"),
-                "session": datasets.Value("string"),
-                "datatype": datasets.Value("string"),
-                "suffix": datasets.Value("string"),
-                "task": datasets.Value("string"),
-                "run": datasets.Value("string"),
-                "path": datasets.Value("string"),
-                "nifti": datasets.Nifti(),
-                "metadata": datasets.Value("string"),
-            })
+            features=datasets.Features(
+                {
+                    "subject": datasets.Value("string"),
+                    "session": datasets.Value("string"),
+                    "datatype": datasets.Value("string"),
+                    "suffix": datasets.Value("string"),
+                    "task": datasets.Value("string"),
+                    "run": datasets.Value("string"),
+                    "path": datasets.Value("string"),
+                    "nifti": datasets.Nifti(),
+                    "metadata": datasets.Value("string"),
+                }
+            )
         )
 
     def _split_generators(self, dl_manager):
         import os
+
         from bids import BIDSLayout
 
         if not self.config.data_dir:
@@ -56,9 +58,7 @@ def _split_generators(self, dl_manager):
         desc_file = os.path.join(self.config.data_dir, "dataset_description.json")
         if not os.path.exists(desc_file):
-            raise ValueError(
-                f"Not a valid BIDS dataset: missing dataset_description.json in {self.config.data_dir}"
-            )
+            raise ValueError(f"Not a valid BIDS dataset: missing dataset_description.json in {self.config.data_dir}")
 
         layout = BIDSLayout(
             self.config.data_dir,
             database_path=self.config.database_path,
@@ -99,14 +99,17 @@ def _generate_examples(self, layout, files):
             metadata = layout.get_metadata(bids_file.path)
             metadata_str = json.dumps(metadata) if metadata else "{}"
 
-            yield idx, {
-                "subject": entities.get("subject"),
-                "session": entities.get("session"),
-                "datatype": entities.get("datatype"),
-                "suffix": entities.get("suffix"),
-                "task": entities.get("task"),
-                "run": str(entities.get("run")) if entities.get("run") else None,
-                "path": bids_file.path,
-                "nifti": bids_file.path,
-                "metadata": metadata_str,
-            }
\ No newline at end of file
+            yield (
+                idx,
+                {
+                    "subject": entities.get("subject"),
+                    "session": entities.get("session"),
+                    "datatype": entities.get("datatype"),
+                    "suffix": entities.get("suffix"),
+                    "task": entities.get("task"),
+                    "run": str(entities.get("run")) if entities.get("run") else None,
+                    "path": bids_file.path,
+                    "nifti": bids_file.path,
+                    "metadata": metadata_str,
+                },
+            )
diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py
index ee2a4608672..03ac2043e6c 100644
--- a/tests/packaged_modules/test_bids.py
+++ b/tests/packaged_modules/test_bids.py
@@ -1,16 +1,18 @@
 import json
-import pytest
+
 import numpy as np
+import pytest
+
 import datasets.config
 
+
 @pytest.fixture
 def minimal_bids_dataset(tmp_path):
     """Minimal valid BIDS dataset with one subject, one T1w scan."""
     # dataset_description.json (required)
-    (tmp_path / "dataset_description.json").write_text(json.dumps({
-        "Name": "Test BIDS Dataset",
-        "BIDSVersion": "1.10.1"
-    }))
+    (tmp_path / "dataset_description.json").write_text(
+        json.dumps({"Name": "Test BIDS Dataset", "BIDSVersion": "1.10.1"})
+    )
 
     # Create subject/anat folder
     anat_dir = tmp_path / "sub-01" / "anat"
@@ -19,6 +21,7 @@ def minimal_bids_dataset(tmp_path):
     # Create dummy NIfTI
     if datasets.config.NIBABEL_AVAILABLE:
         import nibabel as nib
+
         data = np.zeros((4, 4, 4), dtype=np.float32)
         img = nib.Nifti1Image(data, np.eye(4))
         nib.save(img, str(anat_dir / "sub-01_T1w.nii.gz"))
@@ -35,13 +38,12 @@ def multi_subject_bids(tmp_path):
     """BIDS dataset with multiple subjects and sessions."""
-    (tmp_path / "dataset_description.json").write_text(json.dumps({
-        "Name": "Multi-Subject Test",
-        "BIDSVersion": "1.10.1"
-    }))
+    (tmp_path / "dataset_description.json").write_text(
+        json.dumps({"Name": "Multi-Subject Test", "BIDSVersion": "1.10.1"})
+    )
 
     data = np.zeros((4, 4, 4), dtype=np.float32)
-    
+
     if datasets.config.NIBABEL_AVAILABLE:
         import nibabel as nib
     else:
@@ -51,31 +53,30 @@ def multi_subject_bids(tmp_path):
         for ses in ["baseline", "followup"]:
             anat_dir = tmp_path / f"sub-{sub}" / f"ses-{ses}" / "anat"
             anat_dir.mkdir(parents=True)
-            
+
             file_path = anat_dir / f"sub-{sub}_ses-{ses}_T1w.nii.gz"
             if nib:
                 img = nib.Nifti1Image(data, np.eye(4))
                 nib.save(img, str(file_path))
             else:
                 file_path.write_bytes(b"DUMMY NIFTI CONTENT")
-            
-            (anat_dir / f"sub-{sub}_ses-{ses}_T1w.json").write_text(
-                json.dumps({"RepetitionTime": 2.0})
-            )
+
+            (anat_dir / f"sub-{sub}_ses-{ses}_T1w.json").write_text(json.dumps({"RepetitionTime": 2.0}))
 
     return str(tmp_path)
 
 
 def test_bids_module_imports():
     from datasets.packaged_modules.bids import Bids, BidsConfig
+
     assert Bids is not None
     assert BidsConfig is not None
 
 
 def test_bids_requires_pybids(monkeypatch):
     """Test helpful error when pybids not installed."""
-    from datasets.packaged_modules.bids.bids import Bids
     import datasets.config
+    from datasets.packaged_modules.bids.bids import Bids
 
     monkeypatch.setattr(datasets.config, "PYBIDS_AVAILABLE", False)
 
@@ -83,14 +84,11 @@ def test_bids_requires_pybids(monkeypatch):
         Bids()
 
 
-@pytest.mark.skipif(
-    not datasets.config.PYBIDS_AVAILABLE,
-    reason="pybids not installed"
-)
+@pytest.mark.skipif(not datasets.config.PYBIDS_AVAILABLE, reason="pybids not installed")
 def test_bids_loads_single_subject(minimal_bids_dataset):
     from datasets import load_dataset
 
-    ds = load_dataset("bids", data_dir=minimal_bids_dataset, trust_remote_code=True)
+    ds = load_dataset("bids", data_dir=minimal_bids_dataset)
 
     assert "train" in ds
     assert len(ds["train"]) == 1
@@ -102,19 +100,16 @@ def test_bids_loads_single_subject(minimal_bids_dataset):
     assert sample["session"] is None
 
 
-@pytest.mark.skipif(
-    not datasets.config.PYBIDS_AVAILABLE,
-    reason="pybids not installed"
-)
+@pytest.mark.skipif(not datasets.config.PYBIDS_AVAILABLE, reason="pybids not installed")
 def test_bids_multi_subject(multi_subject_bids):
     from datasets import load_dataset
 
-    ds = load_dataset("bids", data_dir=multi_subject_bids, trust_remote_code=True)
+    ds = load_dataset("bids", data_dir=multi_subject_bids)
 
     assert len(ds["train"]) == 4  # 2 subjects × 2 sessions
 
-    subjects = set(sample["subject"] for sample in ds["train"])
+    subjects = {sample["subject"] for sample in ds["train"]}
     assert subjects == {"01", "02"}
 
-    sessions = set(sample["session"] for sample in ds["train"])
+    sessions = {sample["session"] for sample in ds["train"]}
     assert sessions == {"baseline", "followup"}

From bc5a3fdc00e8de192496561fbaa43f44bf7c16c4 Mon Sep 17 00:00:00 2001
From: The-Obstacle-Is-The-Way
Date: Fri, 28 Nov 2025 20:06:59 -0500
Subject: [PATCH 51/52] fix(bids): apply CodeRabbit feedback

- Update setup.py to include nibabel in BIDS extra
- Update docs to clarify nibabel is included
- Add nibabel availability check in _info()
- Move os import to module level
- Update test skipif to check both pybids and nibabel
---
 docs/source/bids_dataset.mdx               |  2 +-
 setup.py                                   |  2 +-
 src/datasets/packaged_modules/bids/bids.py |  5 +++--
 tests/packaged_modules/test_bids.py        | 11 ++++++++---
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/docs/source/bids_dataset.mdx b/docs/source/bids_dataset.mdx
index 62ca79ae770..89ca31e566e 100644
--- a/docs/source/bids_dataset.mdx
+++ b/docs/source/bids_dataset.mdx
@@ -4,7 +4,7 @@
 <Tip>
 
-To use the BIDS loader, you need to install the `bids` extra:
+To use the BIDS loader, you need to install the `bids` extra (which installs `pybids` and `nibabel`):
 
 ```bash
 pip install datasets[bids]
 ```
diff --git a/setup.py b/setup.py
index 70042fd5a57..42dd5c101b8 100644
--- a/setup.py
+++ b/setup.py
@@ -210,7 +210,7 @@
 
 NIBABEL_REQUIRE = ["nibabel>=5.3.2", "ipyniivue==2.4.2"]
 
-PYBIDS_REQUIRE = ["pybids>=0.21.0"]
+PYBIDS_REQUIRE = ["pybids>=0.21.0"] + NIBABEL_REQUIRE
 
 EXTRAS_REQUIRE = {
     "audio": AUDIO_REQUIRE,
diff --git a/src/datasets/packaged_modules/bids/bids.py b/src/datasets/packaged_modules/bids/bids.py
index 278828c5e58..d165218de4d 100644
--- a/src/datasets/packaged_modules/bids/bids.py
+++ b/src/datasets/packaged_modules/bids/bids.py
@@ -1,4 +1,5 @@
 import json
+import os
 from dataclasses import dataclass
 from typing import Optional
 
@@ -28,6 +29,8 @@ class Bids(datasets.GeneratorBasedBuilder):
     def _info(self):
         if not config.PYBIDS_AVAILABLE:
             raise ImportError("To load BIDS datasets, please install pybids: pip install pybids")
+        if not config.NIBABEL_AVAILABLE:
+            raise ImportError("To load BIDS datasets, please install nibabel: pip install nibabel")
 
         return datasets.DatasetInfo(
             features=datasets.Features(
@@ -46,8 +49,6 @@ def _info(self):
         )
 
     def _split_generators(self, dl_manager):
-        import os
-
         from bids import BIDSLayout
 
         if not self.config.data_dir:
diff --git a/tests/packaged_modules/test_bids.py b/tests/packaged_modules/test_bids.py
index 03ac2043e6c..8ce2be9b72b 100644
--- a/tests/packaged_modules/test_bids.py
+++ b/tests/packaged_modules/test_bids.py
@@ -75,7 +75,6 @@ def test_bids_module_imports():
 
 def test_bids_requires_pybids(monkeypatch):
     """Test helpful error when pybids not installed."""
-    import datasets.config
     from datasets.packaged_modules.bids.bids import Bids
 
     monkeypatch.setattr(datasets.config, "PYBIDS_AVAILABLE", False)
@@ -84,7 +83,10 @@ def test_bids_requires_pybids(monkeypatch):
         Bids()
 
 
-@pytest.mark.skipif(not datasets.config.PYBIDS_AVAILABLE, reason="pybids not installed")
+@pytest.mark.skipif(
+    not datasets.config.PYBIDS_AVAILABLE or not datasets.config.NIBABEL_AVAILABLE,
+    reason="pybids or nibabel not installed",
+)
 def test_bids_loads_single_subject(minimal_bids_dataset):
     from datasets import load_dataset
 
@@ -100,7 +102,10 @@ def test_bids_loads_single_subject(minimal_bids_dataset):
     assert sample["session"] is None
 
 
-@pytest.mark.skipif(not datasets.config.PYBIDS_AVAILABLE, reason="pybids not installed")
+@pytest.mark.skipif(
+    not datasets.config.PYBIDS_AVAILABLE or not datasets.config.NIBABEL_AVAILABLE,
+    reason="pybids or nibabel not installed",
+)
 def test_bids_multi_subject(multi_subject_bids):
     from datasets import load_dataset

From fda30c3e9d34ce06cf8f4db5c9c599f17cb51538 Mon Sep 17 00:00:00 2001
From: The-Obstacle-Is-The-Way
Date: Sat, 29 Nov 2025 09:45:55 -0500
Subject: [PATCH 52/52] chore: trigger CI