3232
3333import fsspec
3434import requests
35+ import yaml
3536from huggingface_hub import DatasetCard , DatasetCardData , HfApi , HfFileSystem
3637
3738from . import config
@@ -928,7 +929,7 @@ def get_module(self) -> DatasetModule:
928929 )
929930 # get script and other files
930931 dataset_infos_path = Path (self .path ).parent / config .DATASETDICT_INFOS_FILENAME
931- dataset_readme_path = Path (self .path ).parent / "README.md"
932+ dataset_readme_path = Path (self .path ).parent / config . REPOCARD_FILENAME
932933 imports = get_imports (self .path )
933934 local_imports = _download_additional_modules (
934935 name = self .name ,
@@ -940,7 +941,7 @@ def get_module(self) -> DatasetModule:
940941 if dataset_infos_path .is_file ():
941942 additional_files .append ((config .DATASETDICT_INFOS_FILENAME , str (dataset_infos_path )))
942943 if dataset_readme_path .is_file ():
943- additional_files .append (("README.md" , dataset_readme_path ))
944+ additional_files .append ((config . REPOCARD_FILENAME , dataset_readme_path ))
944945 # copy the script and the files in an importable directory
945946 dynamic_modules_path = self .dynamic_modules_path if self .dynamic_modules_path else init_dynamic_modules ()
946947 hash = files_to_hash ([self .path ] + [loc [1 ] for loc in local_imports ])
@@ -1003,8 +1004,16 @@ def __init__(
10031004 self .download_mode = download_mode
10041005
10051006 def get_module (self ) -> DatasetModule :
1006- readme_path = os .path .join (self .path , "README.md" )
1007+ readme_path = os .path .join (self .path , config .REPOCARD_FILENAME )
1008+ standalone_yaml_path = os .path .join (self .path , config .REPOYAML_FILENAME )
10071009 dataset_card_data = DatasetCard .load (readme_path ).data if os .path .isfile (readme_path ) else DatasetCardData ()
1010+ if os .path .exists (standalone_yaml_path ):
1011+ with open (standalone_yaml_path , "r" , encoding = "utf-8" ) as f :
1012+ standalone_yaml_data = yaml .safe_load (f .read ())
1013+ if standalone_yaml_data :
1014+ _dataset_card_data_dict = dataset_card_data .to_dict ()
1015+ _dataset_card_data_dict .update (standalone_yaml_data )
1016+ dataset_card_data = DatasetCardData (** _dataset_card_data_dict )
10081017 metadata_configs = MetadataConfigs .from_dataset_card_data (dataset_card_data )
10091018 dataset_infos = DatasetInfosDict .from_dataset_card_data (dataset_card_data )
10101019 # we need a set of data files to find which dataset builder to use
@@ -1190,12 +1199,28 @@ def get_module(self) -> DatasetModule:
11901199 download_config .download_desc = "Downloading readme"
11911200 try :
11921201 dataset_readme_path = cached_path (
1193- hf_hub_url (self .name , "README.md" , revision = revision ),
1202+ hf_hub_url (self .name , config . REPOCARD_FILENAME , revision = revision ),
11941203 download_config = download_config ,
11951204 )
11961205 dataset_card_data = DatasetCard .load (Path (dataset_readme_path )).data
11971206 except FileNotFoundError :
11981207 dataset_card_data = DatasetCardData ()
1208+ download_config = self .download_config .copy ()
1209+ if download_config .download_desc is None :
1210+ download_config .download_desc = "Downloading standalone yaml"
1211+ try :
1212+ standalone_yaml_path = cached_path (
1213+ hf_hub_url (self .name , config .REPOYAML_FILENAME , revision = revision ),
1214+ download_config = download_config ,
1215+ )
1216+ with open (standalone_yaml_path , "r" , encoding = "utf-8" ) as f :
1217+ standalone_yaml_data = yaml .safe_load (f .read ())
1218+ if standalone_yaml_data :
1219+ _dataset_card_data_dict = dataset_card_data .to_dict ()
1220+ _dataset_card_data_dict .update (standalone_yaml_data )
1221+ dataset_card_data = DatasetCardData (** _dataset_card_data_dict )
1222+ except FileNotFoundError :
1223+ pass
11991224 metadata_configs = MetadataConfigs .from_dataset_card_data (dataset_card_data )
12001225 dataset_infos = DatasetInfosDict .from_dataset_card_data (dataset_card_data )
12011226 # we need a set of data files to find which dataset builder to use
@@ -1411,7 +1436,7 @@ def download_dataset_infos_file(self) -> str:
14111436 return None
14121437
14131438 def download_dataset_readme_file (self ) -> str :
1414- readme_url = hf_hub_url (self .name , "README.md" , revision = self .revision )
1439+ readme_url = hf_hub_url (self .name , config . REPOCARD_FILENAME , revision = self .revision )
14151440 # Download the dataset readme file if available
14161441 download_config = self .download_config .copy ()
14171442 if download_config .download_desc is None :
@@ -1448,7 +1473,7 @@ def get_module(self) -> DatasetModule:
14481473 if dataset_infos_path :
14491474 additional_files .append ((config .DATASETDICT_INFOS_FILENAME , dataset_infos_path ))
14501475 if dataset_readme_path :
1451- additional_files .append (("README.md" , dataset_readme_path ))
1476+ additional_files .append ((config . REPOCARD_FILENAME , dataset_readme_path ))
14521477 # copy the script and the files in an importable directory
14531478 dynamic_modules_path = self .dynamic_modules_path if self .dynamic_modules_path else init_dynamic_modules ()
14541479 hash = files_to_hash ([local_path ] + [loc [1 ] for loc in local_imports ])
0 commit comments