Commit 05f2efd

Expose zero-shot labeling to Python (#73)
* rm __pycache__
* gitignore __pycache__
* gitignore dist
* Upd usearch in image-search example
* Upd usearch in image-search example
* Implement ZSL in clip lib
* Use new ZSL API in examples
* Expose ZSL in Python
* Upd readme in Python bindings
* Bump version in Python bindings
1 parent c9c02cb commit 05f2efd

File tree

- clip.cpp
- clip.h
- examples/common-clip.h
- examples/image-search/CMakeLists.txt
- examples/python_bindings/README.md
- examples/python_bindings/clip_cpp/clip.py
- examples/python_bindings/example_main.py
- examples/python_bindings/pyproject.toml
- examples/zsl.cpp

9 files changed: +156 −62 lines

clip.cpp

Lines changed: 31 additions & 0 deletions
@@ -1518,6 +1518,37 @@ bool softmax_with_sorting(float * arr, const int length, float * sorted_scores,
     return true;
 }
 
+bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, const struct clip_image_u8 * input_img,
+                                const char ** labels, const size_t n_labels, float * scores, int * indices) {
+    // load the image
+    clip_image_f32 img_res;
+
+    const int vec_dim = clip_get_vision_hparams(ctx)->projection_dim;
+
+    clip_image_preprocess(ctx, input_img, &img_res);
+
+    float img_vec[vec_dim];
+    if (!clip_image_encode(ctx, n_threads, &img_res, img_vec, false)) {
+        return false;
+    }
+
+    // encode texts and compute similarities
+    float txt_vec[vec_dim];
+    float similarities[n_labels];
+
+    for (int i = 0; i < n_labels; i++) {
+        const auto & text = labels[i];
+        auto tokens = clip_tokenize(ctx, text);
+        clip_text_encode(ctx, n_threads, &tokens, txt_vec, false);
+        similarities[i] = clip_similarity_score(img_vec, txt_vec, vec_dim);
+    }
+
+    // apply softmax and sort scores
+    softmax_with_sorting(similarities, n_labels, scores, indices);
+
+    return true;
+}
+
 bool image_normalize(const clip_image_u8 * img, clip_image_f32 * res) {
     if (img->nx != 224 || img->ny != 224) {
         printf("%s: long input shape: %d x %d\n", __func__, img->nx, img->ny);

clip.h

Lines changed: 2 additions & 0 deletions
@@ -98,6 +98,8 @@ bool clip_compare_text_and_image(const struct clip_ctx * ctx, const int n_thread
                                  const struct clip_image_u8 * image, float * score);
 float clip_similarity_score(const float * vec1, const float * vec2, const int vec_dim);
 bool softmax_with_sorting(float * arr, const int length, float * sorted_scores, int * indices);
+bool clip_zero_shot_label_image(struct clip_ctx * ctx, const int n_threads, const struct clip_image_u8 * input_img,
+                                const char ** labels, const size_t n_labels, float * scores, int * indices);
 
 #ifdef __cplusplus
 }

examples/common-clip.h

Lines changed: 13 additions & 4 deletions
@@ -15,13 +15,22 @@ std::map<std::string, std::vector<std::string>> get_dir_keyed_files(const std::s
 
 bool is_image_file_extension(const std::string & path);
 
-struct app_params {
-    int32_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency());
+#include <algorithm>
+#include <string>
+#include <vector>
 
-    std::string model = "models/ggml-model-f16.bin";
+struct app_params {
+    int32_t n_threads;
+    std::string model;
     std::vector<std::string> image_paths;
     std::vector<std::string> texts;
-    int verbose = 1;
+    int verbose;
+
+    app_params()
+        : n_threads(std::min(4, static_cast<int32_t>(std::thread::hardware_concurrency()))), model("models/ggml-model-f16.bin"),
+          verbose(1) {
+        // Initialize other fields if needed
+    }
 };
 
 bool app_params_parse(int argc, char ** argv, app_params & params);

examples/image-search/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ set(CXX_STANDARD_REQUIRED ON)
 include(FetchContent)
 FetchContent_Declare(usearch
     GIT_REPOSITORY https://github.com/unum-cloud/usearch.git
-    GIT_TAG v0.20.0
+    GIT_TAG v2.5.0
 )
 FetchContent_MakeAvailable(usearch)
 

examples/python_bindings/README.md

Lines changed: 29 additions & 14 deletions
@@ -159,7 +159,20 @@ def compare_text_and_image(
 - `image_path` (str): The path to the image file for comparison.
 - `n_threads` (int, optional): The number of CPU threads to use for encoding (default is the number of CPU cores).
 
-#### 8. `__del__`
+#### 8. `zero_shot_label_image`
+
+```python
+def zero_shot_label_image(
+    self, image_path: str, labels: List[str], n_threads: int = os.cpu_count()
+) -> Tuple[List[float], List[int]]:
+```
+
+- **Description**: Zero-shot labels an image with given candidate labels, returning a tuple of sorted scores and indices.
+- `image_path` (str): The path to the image file to be labelled.
+- `labels` (List[str]): A list of candidate labels to be scored.
+- `n_threads` (int, optional): The number of CPU threads to use for encoding (default is the number of CPU cores).
+
+#### 9. `__del__`
 
 ```python
 def __del__(self):
@@ -175,17 +188,19 @@ A basic example can be found in the [clip.cpp examples](https://github.com/monat
 
 ```
 python example_main.py --help
-usage: clip [-h] -m MODEL [-v VERBOSITY] -t TEXT -i IMAGE
-
-optional arguments:
-  -h, --help            show this help message and exit
-  -m MODEL, --model MODEL
-                        path to GGML file
-  -v VERBOSITY, --verbosity VERBOSITY
-                        Level of verbosity. 0 = minimum, 2 = maximum
-  -t TEXT, --text TEXT  text to encode
-  -i IMAGE, --image IMAGE
-                        path to an image file
-```
+usage: clip [-h] -m MODEL [-fn FILENAME] [-v VERBOSITY] -t TEXT [TEXT ...] -i IMAGE
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -m MODEL, --model MODEL
+                        path to GGML file or repo_id
+  -fn FILENAME, --filename FILENAME
+                        path to GGML file in the Hugging face repo
+  -v VERBOSITY, --verbosity VERBOSITY
+                        Level of verbosity. 0 = minimum, 2 = maximum
+  -t TEXT [TEXT ...], --text TEXT [TEXT ...]
+                        text to encode. Multiple values allowed. In this case, apply zero-shot labeling
+  -i IMAGE, --image IMAGE
+                        path to an image file
+```
 
-Bindings to the DLL are implemented in `clip_cpp/clip.py` and
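
Putting the documented binding to work, here is a minimal end-to-end sketch; the model file, image path, and label set below are placeholders, not part of the commit:

```python
from clip_cpp import Clip

labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

# Placeholder checkpoint path; any GGML CLIP model file should work here.
clip = Clip("models/ggml-model-f16.bin", 0)

# Scores arrive already softmaxed and sorted; each index points back
# into the original labels list.
sorted_scores, sorted_indices = clip.zero_shot_label_image("input.jpg", labels)

for ind, score in zip(sorted_indices, sorted_scores):
    print(f"{labels[ind]}: {score:.4f}")
```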

examples/python_bindings/clip_cpp/clip.py

Lines changed: 41 additions & 1 deletion
@@ -3,7 +3,7 @@
 import platform
 from glob import glob
 from pathlib import Path
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Tuple
 
 from .file_download import ModelInfo, model_download, model_info
 
@@ -167,6 +167,18 @@ class ClipContext(ctypes.Structure):
 ]
 clip_similarity_score.restype = ctypes.c_float
 
+clip_zero_shot_label_image = clip_lib.clip_zero_shot_label_image
+clip_zero_shot_label_image.argtypes = [
+    ctypes.POINTER(ClipContext),
+    ctypes.c_int,
+    ctypes.POINTER(ClipImageU8),
+    ctypes.POINTER(ctypes.c_char_p),
+    ctypes.c_ssize_t,
+    ctypes.POINTER(ctypes.c_float),
+    ctypes.POINTER(ctypes.c_int),
+]
+clip_zero_shot_label_image.restype = ctypes.c_bool
+
 softmax_with_sorting = clip_lib.softmax_with_sorting
 softmax_with_sorting.argtypes = [
     ctypes.POINTER(ctypes.c_float),
@@ -369,6 +381,34 @@ def compare_text_and_image(
 
         return score.value
 
+    def zero_shot_label_image(
+        self, image_path: str, labels: List[str], n_threads: int = os.cpu_count()
+    ) -> Tuple[List[float], List[int]]:
+        n_labels = len(labels)
+        if n_labels < 2:
+            raise ValueError(
+                "You must pass at least 2 labels for zero-shot image labeling"
+            )
+
+        labels = (ctypes.c_char_p * n_labels)(
+            *[ctypes.c_char_p(label.encode("utf8")) for label in labels]
+        )
+        image_ptr = make_clip_image_u8()
+        if not clip_image_load_from_file(image_path.encode("utf8"), image_ptr):
+            raise RuntimeError(f"Could not load image {image_path}")
+
+        scores = (ctypes.c_float * n_labels)()
+        indices = (ctypes.c_int * n_labels)()
+        if not clip_zero_shot_label_image(
+            self.ctx, n_threads, image_ptr, labels, n_labels, scores, indices
+        ):
+            print("function called")
+            raise RuntimeError("Could not zero-shot label image")
+
+        return [scores[i] for i in range(n_labels)], [
+            indices[i] for i in range(n_labels)
+        ]
+
     def __del__(self):
         if hasattr(self, "ctx"):
             clip_free(self.ctx)

examples/python_bindings/example_main.py

Lines changed: 25 additions & 14 deletions
@@ -5,28 +5,39 @@
 if __name__ == "__main__":
     ap = argparse.ArgumentParser(prog="clip")
     ap.add_argument("-m", "--model", help="path to GGML file or repo_id", required=True)
-    ap.add_argument("-fn", "--filename", help="path to GGML file in the Hugging face repo", required=False)
+    ap.add_argument(
+        "-fn",
+        "--filename",
+        help="path to GGML file in the Hugging face repo",
+        required=False,
+    )
     ap.add_argument(
         "-v",
         "--verbosity",
         type=int,
         help="Level of verbosity. 0 = minimum, 2 = maximum",
         default=0,
     )
-    ap.add_argument("-t", "--text", help="text to encode", required=True)
+    ap.add_argument(
+        "-t",
+        "--text",
+        help="text to encode. Multiple values allowed. In this case, apply zero-shot labeling",
+        nargs="+",
+        type=str,
+        required=True,
+    )
    ap.add_argument("-i", "--image", help="path to an image file", required=True)
     args = ap.parse_args()
 
     clip = Clip(args.model, args.verbosity)
-
-    tokens = clip.tokenize(args.text)
-    text_embed = clip.encode_text(tokens)
-
-    image_embed = clip.load_preprocess_encode_image(args.image)
-
-    score = clip.calculate_similarity(text_embed, image_embed)
-
-    # Alternatively, you can just do:
-    # score = clip.compare_text_and_image(text, image_path)
-
-    print(f"Similarity score: {score}")
+    if len(args.text) == 1:
+        score = clip.compare_text_and_image(args.text[0], args.image)
+
+        print(f"Similarity score: {score}")
+    else:
+        sorted_scores, sorted_indices = clip.zero_shot_label_image(
+            args.image, args.text
+        )
+        for ind, score in zip(sorted_indices, sorted_scores):
+            label = args.text[ind]
+            print(f"{label}: {score:.4f}")

examples/python_bindings/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clip_cpp"
-version = "0.4.1"
+version = "0.4.2"
 description = "CLIP inference with no big dependencies as PyTorch, TensorFlow, Numpy"
 authors = ["Yusuf Sarıgöz <[email protected]>"]
 packages = [{ include = "clip_cpp" }]

examples/zsl.cpp

Lines changed: 13 additions & 27 deletions
@@ -10,11 +10,16 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    int n_labels = params.texts.size();
+    const size_t n_labels = params.texts.size();
     if (n_labels < 2) {
         printf("%s: You must specify at least 2 texts for zero-shot labeling\n", __func__);
     }
 
+    const char * labels[n_labels];
+    for (size_t i = 0; i < n_labels; ++i) {
+        labels[i] = params.texts[i].c_str();
+    }
+
     auto ctx = clip_model_load(params.model.c_str(), params.verbose);
     if (!ctx) {
         printf("%s: Unable to load model from %s", __func__, params.model.c_str());
@@ -23,40 +28,21 @@ int main(int argc, char ** argv) {
 
     // load the image
     const auto & img_path = params.image_paths[0].c_str();
-    clip_image_u8 img0;
-    clip_image_f32 img_res;
-    if (!clip_image_load_from_file(img_path, &img0)) {
+    clip_image_u8 input_img;
+    if (!clip_image_load_from_file(img_path, &input_img)) {
         fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, img_path);
         return 1;
     }
 
-    const int vec_dim = clip_get_vision_hparams(ctx)->projection_dim;
-
-    clip_image_preprocess(ctx, &img0, &img_res);
-
-    float img_vec[vec_dim];
-    if (!clip_image_encode(ctx, params.n_threads, &img_res, img_vec, false)) {
+    float sorted_scores[n_labels];
+    int sorted_indices[n_labels];
+    if (!clip_zero_shot_label_image(ctx, params.n_threads, &input_img, labels, n_labels, sorted_scores, sorted_indices)) {
+        fprintf(stderr, "Unable to apply ZSL\n");
         return 1;
     }
 
-    // encode texts and compute similarities
-    float txt_vec[vec_dim];
-    float similarities[n_labels];
-
-    for (int i = 0; i < n_labels; i++) {
-        const auto & text = params.texts[i].c_str();
-        auto tokens = clip_tokenize(ctx, text);
-        clip_text_encode(ctx, params.n_threads, &tokens, txt_vec, false);
-        similarities[i] = clip_similarity_score(img_vec, txt_vec, vec_dim);
-    }
-
-    // apply softmax and sort scores
-    float sorted_scores[n_labels];
-    int indices[n_labels];
-    softmax_with_sorting(similarities, n_labels, sorted_scores, indices);
-
     for (int i = 0; i < n_labels; i++) {
-        auto label = params.texts[indices[i]].c_str();
+        auto label = labels[sorted_indices[i]];
         float score = sorted_scores[i];
         printf("%s = %1.4f\n", label, score);
     }
