
Commit 9d6d161

Support standalone yaml (#6557)
* support standalone yaml
* add test
* fix file name
* move to config.py
1 parent 4a5b7d9 commit 9d6d161

File tree

7 files changed: +72, -17 lines

- src/datasets/arrow_dataset.py
- src/datasets/commands/test.py
- src/datasets/config.py
- src/datasets/dataset_dict.py
- src/datasets/info.py
- src/datasets/load.py
- tests/test_load.py


src/datasets/arrow_dataset.py

Lines changed: 7 additions & 3 deletions
@@ -5390,7 +5390,7 @@ def push_to_hub(
         repo_splits = []  # use a list to keep the order of the splits
         repo_files_to_add = [addition.path_in_repo for addition in additions]
         for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
-            if repo_file.rfilename == "README.md":
+            if repo_file.rfilename == config.REPOCARD_FILENAME:
                 repo_with_dataset_card = True
             elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
                 repo_with_dataset_infos = True
@@ -5421,7 +5421,9 @@ def push_to_hub(
         )
         # get the info from the README to update them
         if repo_with_dataset_card:
-            dataset_card_path = api.hf_hub_download(repo_id, "README.md", repo_type="dataset", revision=revision)
+            dataset_card_path = api.hf_hub_download(
+                repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision
+            )
             dataset_card = DatasetCard.load(Path(dataset_card_path))
             dataset_card_data = dataset_card.data
             metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
@@ -5523,7 +5525,9 @@ def push_to_hub(
         DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
         MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data)
         dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
-        additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()))
+        additions.append(
+            CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+        )
 
         commit_message = commit_message if commit_message is not None else "Upload dataset"
         if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:

src/datasets/commands/test.py

Lines changed: 4 additions & 2 deletions
@@ -162,7 +162,9 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:
             # Let's move it to the original directory of the dataset script, to allow the user to
             # upload them on S3 at the same time afterwards.
             if self._save_infos:
-                dataset_readme_path = os.path.join(builder_cls.get_imported_module_dir(), "README.md")
+                dataset_readme_path = os.path.join(
+                    builder_cls.get_imported_module_dir(), datasets.config.REPOCARD_FILENAME
+                )
                 name = Path(path).name + ".py"
                 combined_path = os.path.join(path, name)
                 if os.path.isfile(path):
@@ -177,7 +179,7 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:
 
                 # Move dataset_info back to the user
                 if dataset_dir is not None:
-                    user_dataset_readme_path = os.path.join(dataset_dir, "README.md")
+                    user_dataset_readme_path = os.path.join(dataset_dir, datasets.config.REPOCARD_FILENAME)
                     copyfile(dataset_readme_path, user_dataset_readme_path)
                     print(f"Dataset card saved at {user_dataset_readme_path}")

src/datasets/config.py

Lines changed: 2 additions & 0 deletions
@@ -230,6 +230,8 @@
 METRIC_INFO_FILENAME = "metric_info.json"
 DATASETDICT_JSON_FILENAME = "dataset_dict.json"
 METADATA_CONFIGS_FIELD = "configs"
+REPOCARD_FILENAME = "README.md"
+REPOYAML_FILENAME = ".huggingface.yaml"
 
 MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"

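As context for the new constant, REPOYAML_FILENAME names a standalone metadata file that can sit next to a dataset's data files and carry the same YAML fields as the README.md front matter (e.g. tags, dataset_info, configs). A hypothetical sketch of producing such a file with PyYAML, using invented values, could look like this:

```python
# Hypothetical example: write a standalone ".huggingface.yaml" carrying card-style metadata.
# The keys mirror the README.md front matter schema; the values below are invented for illustration.
import yaml

standalone_metadata = {
    "tags": ["test"],
    "dataset_info": {
        "features": [
            {"name": "text", "dtype": "string"},
            {"name": "label", "dtype": {"class_label": {"names": {"0": "negative", "1": "positive"}}}},
        ]
    },
}

with open(".huggingface.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(standalone_metadata, f, sort_keys=False)
```
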
src/datasets/dataset_dict.py

Lines changed: 7 additions & 3 deletions
@@ -1729,7 +1729,7 @@ def push_to_hub(
         deletions = []
         repo_files_to_add = [addition.path_in_repo for addition in additions]
         for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
-            if repo_file.rfilename == "README.md":
+            if repo_file.rfilename == config.REPOCARD_FILENAME:
                 repo_with_dataset_card = True
             elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
                 repo_with_dataset_infos = True
@@ -1750,7 +1750,9 @@ def push_to_hub(
 
         # get the info from the README to update them
         if repo_with_dataset_card:
-            dataset_card_path = api.hf_hub_download(repo_id, "README.md", repo_type="dataset", revision=revision)
+            dataset_card_path = api.hf_hub_download(
+                repo_id, config.REPOCARD_FILENAME, repo_type="dataset", revision=revision
+            )
             dataset_card = DatasetCard.load(Path(dataset_card_path))
             dataset_card_data = dataset_card.data
             metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
@@ -1800,7 +1802,9 @@ def push_to_hub(
         DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
         MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(dataset_card_data)
         dataset_card = DatasetCard(f"---\n{dataset_card_data}\n---\n") if dataset_card is None else dataset_card
-        additions.append(CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()))
+        additions.append(
+            CommitOperationAdd(path_in_repo=config.REPOCARD_FILENAME, path_or_fileobj=str(dataset_card).encode())
+        )
 
         commit_message = commit_message if commit_message is not None else "Upload dataset"
         if len(additions) <= config.UPLOADS_MAX_NUMBER_PER_COMMIT:

src/datasets/info.py

Lines changed: 3 additions & 3 deletions
@@ -397,7 +397,7 @@ class DatasetInfosDict(Dict[str, DatasetInfo]):
     def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=False) -> None:
         total_dataset_infos = {}
         dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
-        dataset_readme_path = os.path.join(dataset_infos_dir, "README.md")
+        dataset_readme_path = os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)
         if not overwrite:
             total_dataset_infos = self.from_directory(dataset_infos_dir)
         total_dataset_infos.update(self)
@@ -426,8 +426,8 @@ def write_to_directory(self, dataset_infos_dir, overwrite=False, pretty_print=Fa
     def from_directory(cls, dataset_infos_dir) -> "DatasetInfosDict":
         logger.info(f"Loading Dataset Infos from {dataset_infos_dir}")
         # Load the info from the YAML part of README.md
-        if os.path.exists(os.path.join(dataset_infos_dir, "README.md")):
-            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / "README.md").data
+        if os.path.exists(os.path.join(dataset_infos_dir, config.REPOCARD_FILENAME)):
+            dataset_card_data = DatasetCard.load(Path(dataset_infos_dir) / config.REPOCARD_FILENAME).data
             if "dataset_info" in dataset_card_data:
                 return cls.from_dataset_card_data(dataset_card_data)
         if os.path.exists(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)):

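For reference, the two methods touched here are the ones that write and read dataset infos through the dataset card. A minimal sketch (hypothetical directory and metadata) of reading infos back from the README.md front matter:

```python
# Minimal sketch (hypothetical directory and metadata) of DatasetInfosDict reading the
# "dataset_info" block from the YAML front matter of README.md.
from pathlib import Path

from datasets.info import DatasetInfosDict

readme = """---
dataset_info:
  config_name: default
  features:
    - name: text
      dtype: string
---
# My dataset
"""
infos_dir = Path("tmp_infos_dir")
infos_dir.mkdir(exist_ok=True)
(infos_dir / "README.md").write_text(readme, encoding="utf-8")

infos = DatasetInfosDict.from_directory(str(infos_dir))
print(list(infos))  # ['default']
print(infos["default"].features)  # {'text': Value(dtype='string', ...)}
```
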
src/datasets/load.py

Lines changed: 31 additions & 6 deletions
@@ -32,6 +32,7 @@
 
 import fsspec
 import requests
+import yaml
 from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
 
 from . import config
@@ -928,7 +929,7 @@ def get_module(self) -> DatasetModule:
             )
         # get script and other files
         dataset_infos_path = Path(self.path).parent / config.DATASETDICT_INFOS_FILENAME
-        dataset_readme_path = Path(self.path).parent / "README.md"
+        dataset_readme_path = Path(self.path).parent / config.REPOCARD_FILENAME
         imports = get_imports(self.path)
         local_imports = _download_additional_modules(
             name=self.name,
@@ -940,7 +941,7 @@ def get_module(self) -> DatasetModule:
         if dataset_infos_path.is_file():
             additional_files.append((config.DATASETDICT_INFOS_FILENAME, str(dataset_infos_path)))
         if dataset_readme_path.is_file():
-            additional_files.append(("README.md", dataset_readme_path))
+            additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path))
         # copy the script and the files in an importable directory
         dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
         hash = files_to_hash([self.path] + [loc[1] for loc in local_imports])
@@ -1003,8 +1004,16 @@ def __init__(
         self.download_mode = download_mode
 
     def get_module(self) -> DatasetModule:
-        readme_path = os.path.join(self.path, "README.md")
+        readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
+        standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
         dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
+        if os.path.exists(standalone_yaml_path):
+            with open(standalone_yaml_path, "r", encoding="utf-8") as f:
+                standalone_yaml_data = yaml.safe_load(f.read())
+                if standalone_yaml_data:
+                    _dataset_card_data_dict = dataset_card_data.to_dict()
+                    _dataset_card_data_dict.update(standalone_yaml_data)
+                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
         metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
         dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
         # we need a set of data files to find which dataset builder to use
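
The block added above layers the standalone YAML on top of the README.md card data, so keys defined in .huggingface.yaml win on conflict. A standalone re-statement of that merge rule, with made-up metadata, might look like the sketch below; the remaining load.py hunks then apply the same merge when the metadata comes from the Hub.

```python
# Sketch of the merge rule applied above: start from the README.md card data,
# then overwrite with whatever the standalone yaml defines (standalone yaml wins).
import yaml
from huggingface_hub import DatasetCardData

readme_front_matter = yaml.safe_load("license: mit\ntags: [original]\n")
standalone_yaml_data = yaml.safe_load("tags: [test]\n")

dataset_card_data = DatasetCardData(**readme_front_matter)
if standalone_yaml_data:
    _dataset_card_data_dict = dataset_card_data.to_dict()
    _dataset_card_data_dict.update(standalone_yaml_data)
    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)

print(dataset_card_data.to_dict()["tags"])     # ['test']  <- from .huggingface.yaml
print(dataset_card_data.to_dict()["license"])  # 'mit'     <- kept from README.md
```
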
@@ -1190,12 +1199,28 @@ def get_module(self) -> DatasetModule:
             download_config.download_desc = "Downloading readme"
         try:
             dataset_readme_path = cached_path(
-                hf_hub_url(self.name, "README.md", revision=revision),
+                hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision),
                 download_config=download_config,
             )
             dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
         except FileNotFoundError:
             dataset_card_data = DatasetCardData()
+        download_config = self.download_config.copy()
+        if download_config.download_desc is None:
+            download_config.download_desc = "Downloading standalone yaml"
+        try:
+            standalone_yaml_path = cached_path(
+                hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision),
+                download_config=download_config,
+            )
+            with open(standalone_yaml_path, "r", encoding="utf-8") as f:
+                standalone_yaml_data = yaml.safe_load(f.read())
+                if standalone_yaml_data:
+                    _dataset_card_data_dict = dataset_card_data.to_dict()
+                    _dataset_card_data_dict.update(standalone_yaml_data)
+                    dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
+        except FileNotFoundError:
+            pass
         metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
         dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
         # we need a set of data files to find which dataset builder to use
@@ -1411,7 +1436,7 @@ def download_dataset_infos_file(self) -> str:
         return None
 
     def download_dataset_readme_file(self) -> str:
-        readme_url = hf_hub_url(self.name, "README.md", revision=self.revision)
+        readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
         # Download the dataset infos file if available
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
@@ -1448,7 +1473,7 @@ def get_module(self) -> DatasetModule:
         if dataset_infos_path:
             additional_files.append((config.DATASETDICT_INFOS_FILENAME, dataset_infos_path))
         if dataset_readme_path:
-            additional_files.append(("README.md", dataset_readme_path))
+            additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path))
         # copy the script and the files in an importable directory
         dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
         hash = files_to_hash([local_path] + [loc[1] for loc in local_imports])

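On the Hub side, the factory above fetches the optional .huggingface.yaml via cached_path(hf_hub_url(...)) and silently ignores a missing file. A rough equivalent written against huggingface_hub directly (an illustrative helper, not the code path the library takes) could be:

```python
# Illustrative helper (not datasets' internal code path): fetch a dataset repo's optional
# ".huggingface.yaml" with huggingface_hub and return its parsed content, or {} if absent.
from typing import Optional

import yaml
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError


def fetch_standalone_yaml(repo_id: str, revision: Optional[str] = None) -> dict:
    try:
        path = hf_hub_download(
            repo_id, ".huggingface.yaml", repo_type="dataset", revision=revision
        )
    except EntryNotFoundError:
        return {}
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


# Example (repo id taken from the integration test below):
# fetch_standalone_yaml("datasets-maintainers/dataset-with-standalone-yaml")
```
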
tests/test_load.py

Lines changed: 18 additions & 0 deletions
@@ -1688,3 +1688,21 @@ def test_reload_old_cache_from_2_15(tmp_path: Path):
             cache_dir / "polinaeterna___audiofolder_two_configs_in_metadata" / "v2" / "0.0.0" / str(builder.hash)
         ).as_posix()
     )  # new cache
+
+
+@pytest.mark.integration
+def test_update_dataset_card_data_with_standalone_yaml():
+    # Labels defined in .huggingface.yml because they are too long to be in README.md
+    from datasets.utils.metadata import MetadataConfigs
+
+    with patch(
+        "datasets.utils.metadata.MetadataConfigs.from_dataset_card_data",
+        side_effect=MetadataConfigs.from_dataset_card_data,
+    ) as card_data_read_mock:
+        builder = load_dataset_builder("datasets-maintainers/dataset-with-standalone-yaml")
+    assert card_data_read_mock.call_args.args[0]["license"] is not None  # from README.md
+    assert card_data_read_mock.call_args.args[0]["dataset_info"] is not None  # from standalone yaml
+    assert card_data_read_mock.call_args.args[0]["tags"] == ["test"]  # standalone yaml has precedence
+    assert isinstance(
+        builder.info.features["label"], datasets.ClassLabel
+    )  # correctly loaded from long labels list in standalone yaml

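End to end, the same mechanism applies to local datasets without a script: a .huggingface.yaml placed next to the data files lets long metadata (such as a large ClassLabel names list) live outside README.md. A hypothetical local sketch with invented paths and labels:

```python
# Hypothetical end-to-end sketch for a local, script-less dataset: the standalone yaml next to
# the data files should be merged into the card metadata when the builder is created.
import os

from datasets import load_dataset_builder

dataset_dir = "my_local_dataset"  # invented directory name
os.makedirs(dataset_dir, exist_ok=True)

with open(os.path.join(dataset_dir, "data.csv"), "w", encoding="utf-8") as f:
    f.write("text,label\nhello,0\nworld,1\n")

with open(os.path.join(dataset_dir, ".huggingface.yaml"), "w", encoding="utf-8") as f:
    f.write(
        "dataset_info:\n"
        "  features:\n"
        "    - name: text\n"
        "      dtype: string\n"
        "    - name: label\n"
        "      dtype:\n"
        "        class_label:\n"
        "          names:\n"
        "            '0': negative\n"
        "            '1': positive\n"
    )

builder = load_dataset_builder(dataset_dir)
print(builder.info.features)  # if the yaml is picked up, "label" should be a ClassLabel, as in the test above
```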