diff --git a/README.md b/README.md
index d6cece7..055ba5a 100644
--- a/README.md
+++ b/README.md
@@ -178,7 +178,7 @@
 output_file: optional. if specified, dump the output to this file instead of stdout
 ```
 
-TODO: share benchmarking results for a common dataset later on.
+See [tests/README.md](tests/README.md) for more information about benchmarking.
 
 ## Future Work
 - [ ] Support `text-only`, `image-only` and `both` (current) options when exporting, and modify model loading logic accordingly. It might be relevant to use a single modality in certain cases, as in large multimodal models, or building and/or searching for semantic image search.
diff --git a/ggml b/ggml
index dd1d575..c3ae31e 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit dd1d575956e54c5bdc07632f25506b3b1884dbd2
+Subproject commit c3ae31e5a090a6259c674b18983de53ac4538aa6
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..5a1be8c
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,57 @@
+## Tests
+
+You can use `prepare_imagenet1k.py` to download and prepare the ImageNet-1k
+validation set in the format expected by the `benchmark` utility (see the
+usage example at the end of this file).
+If you haven't already, install torch and torchvision first:
+
+```sh
+pip install -r requirements.txt
+```
+
+## Note about benchmark results
+The results from this benchmark do not match those reported in the open-clip repository, because:
+
+1. Most importantly, open-clip uses a different test protocol that averages the embeddings of multiple text templates per class (see the sketch below).
+2. There are still gotchas in the tokenization implementation in this repo.
+3. This repo uses linear interpolation instead of bicubic in image preprocessing.
+
+The second and third items will be fixed soon.
+I don't agree with their test protocol, so I am not particularly motivated to fix the first one.
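+
+For context on the first item, open-clip embeds every text template for a
+class and averages the results into a single class embedding. Below is a
+rough sketch of that protocol rather than this repo's actual code (`model`
+and `tokenizer` are hypothetical stand-ins; the real implementation lives in
+the CLIP_benchmark repository):
+
+```python
+# Hypothetical sketch: `model` and `tokenizer` stand in for an open-clip
+# model and tokenizer; this is not this repository's implementation.
+import torch
+
+def build_class_embeddings(model, tokenizer, classnames, templates):
+    """Average the embeddings of all filled-in templates for each class."""
+    class_vectors = []
+    for name in classnames:
+        # Fill every template, e.g. "a photo of a {}." -> "a photo of a goldfish."
+        tokens = tokenizer([t.format(name) for t in templates])
+        with torch.no_grad():
+            emb = model.encode_text(tokens)         # (num_templates, dim)
+        emb = emb / emb.norm(dim=-1, keepdim=True)  # normalize each template
+        mean = emb.mean(dim=0)                      # average over the templates
+        class_vectors.append(mean / mean.norm())    # re-normalize the mean
+    return torch.stack(class_vectors)               # (num_classes, dim)
+```
+
+This repo embeds a single prompt per class instead, which accounts for part
+of the score gap.
+
+## Usage example
+
+The following downloads the archives into `./imagenet1k` and prepares the
+class folders (run from the `tests` directory):
+
+```sh
+python prepare_imagenet1k.py --save_path ./imagenet1k --verbose
+```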
diff --git a/tests/prepare_imagenet1k.py b/tests/prepare_imagenet1k.py
new file mode 100644
index 0000000..1338b95
--- /dev/null
+++ b/tests/prepare_imagenet1k.py
@@ -0,0 +1,174 @@
+"""
+Small script to download and parse the imagenet1k dataset into the benchmark format
+
+Dataset comments:
+    Change class names containing "/" to "or":
+        Some classes have '/' in their name.
+        For compatibility with folder benchmarks we replace them with 'or'.
+    Skip classes (744, missiles) and (837, sunglasses) as they are duplicates.
+
+"""
+
+import argparse
+import json
+import os
+from pathlib import Path
+import shutil
+from subprocess import call
+from torchvision.datasets import ImageNet
+
+
+# Files
+_CLASSNAMES_FILENAME = "classnames.json"
+_CLASSTEMPLATES_FILENAME = "class_templates.json"
+_DEVKIT_FILENAME = "ILSVRC2012_devkit_t12.tar.gz"
+_IMG_VAL_FILENAME = "ILSVRC2012_img_val.tar"
+
+# Name for the folder with the final dataset
+_PROCESSED_DIR_NAME = "dataset"
+
+
+def download_dataset(path: Path, verbose: bool = False):
+    if verbose:
+        print("Downloading dataset")
+    path.mkdir(exist_ok=True, parents=True)
+
+    dk_output_path = path / _DEVKIT_FILENAME
+    iv_output_path = path / _IMG_VAL_FILENAME
+
+    template_path = path / _CLASSTEMPLATES_FILENAME
+    classnames_path = path / _CLASSNAMES_FILENAME
+
+    if not dk_output_path.exists():
+        if verbose:
+            print("\tDidn't find devkit file, downloading..")
+        call(
+            (
+                f"wget https://image-net.org/data/ILSVRC/2012/{_DEVKIT_FILENAME} "
+                + f"--output-document={dk_output_path}"
+            ),
+            shell=True,
+        )
+    else:
+        if verbose:
+            print("\tFound devkit file, skipping download..")
+
+    if not iv_output_path.exists():
+        if verbose:
+            print("\tDidn't find image validation file, downloading..")
+        call(
+            (
+                f"wget https://image-net.org/data/ILSVRC/2012/{_IMG_VAL_FILENAME} "
+                + f"--output-document={iv_output_path}"
+            ),
+            shell=True,
+        )
+    else:
+        if verbose:
+            print("\tFound image validation file, skipping download..")
+
+    if not template_path.exists():
+        if verbose:
+            print("\tDidn't find class templates file, downloading..")
+        call(
+            (
+                "wget "
+                + "https://raw.githubusercontent.com/LAION-AI/CLIP_benchmark/main/clip_benchmark/datasets/en_zeroshot_classification_templates.json "
+                + f"--output-document={template_path}"
+            ),
+            shell=True,
+        )
+
+        class_templates = json.load(template_path.open("r"))
+        class_templates = class_templates["imagenet1k"]
+        json.dump(class_templates, template_path.open("w"), indent=2)
+    else:
+        if verbose:
+            print("\tFound class templates file, skipping download..")
+
+    if not classnames_path.exists():
+        if verbose:
+            print("\tDidn't find class names file, downloading..")
+        call(
+            (
+                "wget "
+                + "https://raw.githubusercontent.com/LAION-AI/CLIP_benchmark/main/clip_benchmark/datasets/en_classnames.json "
+                + f"--output-document={classnames_path}"
+            ),
+            shell=True,
+        )
+        classnames = json.load(classnames_path.open("r"))
+        classnames = classnames["imagenet1k"]
+
+        if verbose:
+            print(
+                "\tFixing classnames, replacing '/' with 'or' and removing duplicates.."
+            )
+        # Described in the module docstring above
+        classnames = [
+            c.replace("/", "or")
+            for i, c in enumerate(classnames)
+            if i not in [744, 837]
+        ]
+
+        json.dump(classnames, classnames_path.open("w"), indent=2)
+
+
+def parse_dataset(path: Path, verbose=False):
+    if verbose:
+        print("Parsing dataset")
+    # Load class names
+    classes_path = path.joinpath(_CLASSNAMES_FILENAME)
+    classes = json.load(classes_path.open("r"))
+
+    # Check if the dataset has already been processed
+    processed_dataset_path = path / _PROCESSED_DIR_NAME
+    dataset_exists = all(processed_dataset_path.joinpath(c).exists() for c in classes)
+
+    if dataset_exists:
+        return processed_dataset_path
+
+    processed_dataset_path.mkdir(exist_ok=True)
+
+    # The torchvision ImageNet dataset handles the unpacking
+    if verbose:
+        print("\tUnpacking dataset, this can take a while..")
+    ds = ImageNet(root=path, split="val")
+
+    # Track with a separate counter as some classes were removed from `classes`
+    cls_index = 0
+    for i, dir_name in enumerate(ds.wnids):
+        if dir_name in ["n04356056", "n04008634"]:
+            if verbose:
+                print("\tSkipped class", ds.classes[i])
+            continue
+
+        class_name = classes[cls_index]
+        src_dir = Path(ds.split_folder).joinpath(dir_name)
+        dst_dir = processed_dataset_path.joinpath(class_name)
+
+        os.rename(src=src_dir, dst=dst_dir)
+        if verbose:
+            print(f"\tMoved class: {ds.classes[i]} to {class_name}")
+
+        cls_index += 1
+
+    # Remove the remaining unpacked files
+    shutil.rmtree(ds.split_folder)
+    if verbose:
+        print("\tCleaned up unpacked dataset folder")
+
+    return processed_dataset_path
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--save_path", type=str, required=True)
+    parser.add_argument("--verbose", action=argparse.BooleanOptionalAction)
+    args = parser.parse_args()
+
+    path = Path(args.save_path).absolute()
+
+    download_dataset(path=path, verbose=args.verbose)
+    dataset_path = parse_dataset(path=path, verbose=args.verbose)
+    print(f"Dataset is ready at {dataset_path}")
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 0000000..ac988bd
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1,2 @@
+torch
+torchvision