allow preprocessed mock data in prototype datasets tests #5706

Open · wants to merge 8 commits into main
89 changes: 58 additions & 31 deletions test/builtin_dataset_mocks.py
@@ -15,7 +15,6 @@
from collections import defaultdict, Counter

import numpy as np
import PIL.Image
import pytest
import torch
from datasets_utils import make_zip, make_tar, create_image_folder, create_image_file
@@ -72,6 +71,28 @@ def prepare(self, home, config):
        available_file_names = {path.name for path in root.glob("*")}
        required_file_names = {resource.file_name for resource in self.dataset.resources(config)}
        missing_file_names = required_file_names - available_file_names
        extra_file_names = available_file_names - required_file_names

        # Some datasets cannot provide the original resources, for example if the preprocessing step includes
        # downloads. In such a case the mock data function might provide the corresponding result of the
        # preprocessing instead. For example, if the dataset requires the file `foo.tar`, but the preprocessing
        # step extracts it to the `foo` folder and performs some more operations, the mock data function can
        # provide the `foo` folder directly.
        # In that case `foo.tar` is initially picked up in `missing_file_names` and `foo` in `extra_file_names`.
        # Since `foo.tar` is not actually missing, but already present in a preprocessed state, we remove the
        # corresponding entries from both sets.
        if missing_file_names:
            for missing in missing_file_names.copy():
                extra_candidate = missing.split(".")[0]
                if extra_candidate in extra_file_names:
                    missing_file_names.remove(missing)
                    extra_file_names.remove(extra_candidate)

        if extra_file_names:
            raise pytest.UsageError(
                f"Dataset '{self.name}' created the files {sequence_to_str(sorted(extra_file_names))} "
                f"for {config} in the mock data function, but they are not needed."
            )
        if missing_file_names:
            raise pytest.UsageError(
                f"Dataset '{self.name}' requires the files {sequence_to_str(sorted(missing_file_names))} "
@@ -489,20 +510,11 @@ def imagenet(info, root, config):

class CocoMockData:
    @classmethod
    def _make_images_archive(cls, root, name, *, num_samples):
        image_paths = create_image_folder(
            root, name, file_name_fn=lambda idx: f"{idx:012d}.jpg", num_examples=num_samples
        )

        images_meta = []
        for path in image_paths:
            with PIL.Image.open(path) as image:
                width, height = image.size
            images_meta.append(dict(file_name=path.name, id=int(path.stem), width=width, height=height))

        make_zip(root, f"{name}.zip")

        return images_meta
    def _make_images_meta(cls, *, num_samples):
        return [
            dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height)
            for idx, (height, width) in enumerate(torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist())
        ]
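Since the image sizes are now drawn up front with `torch.randint` instead of being read back from disk via `PIL.Image.open`, the meta entries can be generated before any file exists. A small illustration of the comprehension above (values are random, the printed entry is hypothetical):

import torch

num_samples = 2  # hypothetical sample count
sizes = torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
for idx, (height, width) in enumerate(sizes):
    # e.g. {'file_name': '000000000000.jpg', 'id': 0, 'width': 7, 'height': 4}
    print(dict(file_name=f"{idx:012d}.jpg", id=idx, width=width, height=height))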

    @classmethod
    def _make_annotations_json(
@@ -571,16 +583,27 @@ def generate(
        cls,
        root,
        *,
        split,
        year,
        num_samples,
    ):
        annotations_dir = root / "annotations"
        annotations_dir.mkdir()

        for split in ("train", "val"):
            config_name = f"{split}{year}"
        for split_ in ("train", "val"):
            config_name = f"{split_}{year}"

            images_meta = cls._make_images_meta(num_samples=num_samples)
            if split_ == split:
                create_image_folder(
                    root,
                    config_name,
                    file_name_fn=lambda idx: images_meta[idx]["file_name"],
                    num_examples=num_samples,
                    size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
                )
                make_zip(root, f"{config_name}.zip")

            images_meta = cls._make_images_archive(root, config_name, num_samples=num_samples)
            cls._make_annotations(
                annotations_dir,
                config_name,
@@ -594,7 +617,7 @@ def generate(

@register_mock
def coco(info, root, config):
    return CocoMockData.generate(root, year=config.year, num_samples=5)
    return CocoMockData.generate(root, split=config.split, year=config.year, num_samples=5)
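With `split` passed through from the config, image folders and archives are only materialized for the split under test, while annotations are still written for both splits. A hypothetical invocation (the year and sample count stand in for real config values):

# assuming `root` is a pathlib.Path pointing at the mock data directory
CocoMockData.generate(root, split="train", year="2017", num_samples=5)
# -> produces the train2017 images archive plus annotations for both splits, but
#    no val2017 archive, which the stricter extra_file_names check in `prepare`
#    would otherwise flag as an unneeded file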


class SBDMockData:
@@ -766,10 +789,12 @@ def add_bndbox(obj):

    @classmethod
    def generate(cls, root, *, year, trainval):
        archive_folder = root
        if year == "2011":
            archive_folder /= "TrainVal"
        data_folder = archive_folder / "VOCdevkit" / f"VOC{year}"
            archive_folder = root / "TrainVal"
            data_folder = archive_folder / "VOCdevkit"
        else:
            archive_folder = data_folder = root / "VOCdevkit"
        data_folder = data_folder / f"VOC{year}"
        data_folder.mkdir(parents=True, exist_ok=True)

        ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)
@@ -779,7 +804,7 @@ def generate(cls, root, *, year, trainval):
            (cls._make_detection_anns_folder, "Annotations", ".xml"),
        ]:
            make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))
        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], data_folder)
        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)

        return num_samples_map
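The tar is now built from `archive_folder` rather than `data_folder`, so the archive's internal layout follows the real VOC downloads, which nest the data one level deeper for 2011. A sketch of the two layouts handled above:

# year == "2011":  TrainVal/VOCdevkit/VOC2011/...   (tar'd from root / "TrainVal")
# other years:     VOCdevkit/VOC<year>/...          (tar'd from root / "VOCdevkit")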

@@ -907,7 +932,7 @@ def country211(info, root, config):
file_name_fn=lambda idx: f"{idx}.jpg",
num_examples=num_examples,
)
make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz")
# make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz")
return num_examples * len(classes)
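Leaving the `make_tar` call out exercises the new code path in `prepare`: the dataset still declares `country211.tgz` as its resource, but the mock now provides the extracted folder directly.

# required by the dataset: {"country211.tgz"}   provided by the mock: {"country211"}
# "country211.tgz".split(".")[0] == "country211", so the pair is dropped from
# missing_file_names / extra_file_names and the preprocessed folder is accepted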


@@ -1051,8 +1076,10 @@ def _make_ann_file(path, num_examples, class_idx):
                    }
                )

archive_folder = root / "GTSRB"

if config["split"] == "train":
train_folder = root / "GTSRB" / "Training"
train_folder = archive_folder / "Training"
train_folder.mkdir(parents=True)

for class_idx in classes:
@@ -1067,9 +1094,9 @@ def _make_ann_file(path, num_examples, class_idx):
                num_examples=num_examples_per_class,
                class_idx=int(class_idx),
            )
        make_zip(root, "GTSRB-Training_fixed.zip", train_folder)
        make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
    else:
        test_folder = root / "GTSRB" / "Final_Test"
        test_folder = archive_folder / "Final_Test"
        test_folder.mkdir(parents=True)

        create_image_folder(
@@ -1079,7 +1106,7 @@ def _make_ann_file(path, num_examples, class_idx):
            num_examples=num_examples,
        )

        make_zip(root, "GTSRB_Final_Test_Images.zip", test_folder)
        make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)

        _make_ann_file(
            path=root / "GT-final_test.csv",
@@ -1443,11 +1470,11 @@ def stanford_cars(info, root, config):
num_samples = {"train": 5, "test": 7}[config["split"]]
num_categories = 3

devkit = root / "devkit"
devkit.mkdir(parents=True)
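    # NOTE: the devkit folder is now created up front for both splits (with
    # parents=True), instead of only inside the "train" branch below.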

if config["split"] == "train":
images_folder_name = "cars_train"

devkit = root / "devkit"
devkit.mkdir()
annotations_mat_path = devkit / "cars_train_annos.mat"
else:
images_folder_name = "cars_test"