tests/test_upstream_hub.py: 54 changes (47 additions, 7 deletions)
@@ -267,6 +267,34 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):
             num_commits_after_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token))
             assert num_commits_after_push - num_commits_before_push > 1

+    def _wait_for_repo_ready(self, repo_id, max_wait=30):
+        """Wait for repository to be in a consistent state after push operations.
+
+        This helper addresses race conditions where rapid successive push_to_hub calls
+        don't wait for Hub's LFS object propagation between pushes, causing errors like:
+        "LFS pointer pointed to a file that does not exist"
+
+        Args:
+            repo_id: The repository ID to check.
+            max_wait: Maximum time in seconds to wait for repository readiness.
+
+        Raises:
+            TimeoutError: If repository is not ready within max_wait seconds.
+        """
+        from huggingface_hub.errors import HfHubHTTPError
+
+        start_time = time.monotonic()
+        while (time.monotonic() - start_time) < max_wait:
+            try:
+                # Verify we can list files (repo is consistent)
+                self._api.list_repo_files(repo_id, repo_type="dataset", token=self._token)
+                # Small delay to ensure LFS objects are fully propagated
+                time.sleep(1)
+                return
+            except HfHubHTTPError:
+                time.sleep(1)
+        raise TimeoutError(f"Repository {repo_id} not ready after {max_wait}s")
+
     def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
         ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
         ds2 = Dataset.from_dict({"x": list(range(100)), "y": list(range(100))})
@@ -278,6 +306,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
         with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token)

+            # Wait for Hub to fully process the first push
+            self._wait_for_repo_ready(ds_name)
+
             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
                 path = Path(tmp) / "datafile.txt"
@@ -292,6 +323,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                     token=self._token,
                 )

+            # Wait again before second push
+            self._wait_for_repo_ready(ds_name)
+
             local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)

             # Ensure that there are two files on the repository that have the correct name
@@ -320,8 +354,11 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):

         # Push to hub two times, but the second time with fewer files.
         # Verify that the new files contain the correct dataset and that non-necessary files have been deleted.
-        with temporary_repo(ds_name):
-            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)
+        with temporary_repo() as ds_name_2:
+            local_ds.push_to_hub(ds_name_2, token=self._token, max_shard_size=500 << 5)
+
+            # Wait for Hub to fully process the first push
+            self._wait_for_repo_ready(ds_name_2)

             with tempfile.TemporaryDirectory() as tmp:
                 # Add a file starting with "data" to ensure it doesn't get deleted.
@@ -332,15 +369,18 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
                 self._api.upload_file(
                     path_or_fileobj=str(path),
                     path_in_repo="datafile.txt",
-                    repo_id=ds_name,
+                    repo_id=ds_name_2,
                     repo_type="dataset",
                     token=self._token,
                 )

-            local_ds.push_to_hub(ds_name, token=self._token)
+            # Wait again before second push
+            self._wait_for_repo_ready(ds_name_2)
+
+            local_ds.push_to_hub(ds_name_2, token=self._token)

             # Ensure that there are two files on the repository that have the correct name
-            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
+            files = sorted(self._api.list_repo_files(ds_name_2, repo_type="dataset", token=self._token))
             assert files == [
                 ".gitattributes",
                 "README.md",
@@ -350,9 +390,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
             ]

             # Keeping the "datafile.txt" breaks the load_dataset to think it's a text-based dataset
-            self._api.delete_file("datafile.txt", repo_id=ds_name, repo_type="dataset", token=self._token)
+            self._api.delete_file("datafile.txt", repo_id=ds_name_2, repo_type="dataset", token=self._token)

-            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+            hub_ds = load_dataset(ds_name_2, download_mode="force_redownload")

             assert local_ds.column_names == hub_ds.column_names
             assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())
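Taken together, the diff turns each back-to-back push in these tests into a push, wait, push sequence. The sketch below condenses that pattern outside the test class as a rough illustration only: `wait_for_repo_ready` and `push_twice_with_wait` are hypothetical standalone helpers mirroring the PR's `_wait_for_repo_ready`, and `local_ds`, `repo_id`, and `token` are placeholder arguments rather than anything defined in this PR.

# Illustrative sketch (not part of the PR): the push -> wait -> push pattern,
# using only public huggingface_hub / datasets APIs. The helper names here are
# hypothetical and simply mirror the diff's _wait_for_repo_ready logic.
import time

from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError


def wait_for_repo_ready(api: HfApi, repo_id: str, token: str, max_wait: int = 30) -> None:
    """Poll until the repo's file listing succeeds, then pause briefly for LFS propagation."""
    start = time.monotonic()
    while (time.monotonic() - start) < max_wait:
        try:
            api.list_repo_files(repo_id, repo_type="dataset", token=token)
            time.sleep(1)  # grace period for LFS objects to finish propagating
            return
        except HfHubHTTPError:
            time.sleep(1)
    raise TimeoutError(f"Repository {repo_id} not ready after {max_wait}s")


def push_twice_with_wait(local_ds, repo_id: str, token: str) -> None:
    """Push a DatasetDict twice, letting the Hub settle between the two pushes."""
    api = HfApi()
    local_ds.push_to_hub(repo_id, token=token)  # first push
    wait_for_repo_ready(api, repo_id, token)    # avoid the "LFS pointer" race
    local_ds.push_to_hub(repo_id, token=token)  # second push overwrites files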