From 4ddde40bf2614bb04bc7a92f08f225ead2dd2109 Mon Sep 17 00:00:00 2001
From: Tengxin Li
Date: Wed, 16 Jul 2025 11:11:49 -0700
Subject: [PATCH 1/2] Add code snippet for embeddings normalization

---
 .../embeddings/normalize_embeddings.py        | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 generative_ai/embeddings/normalize_embeddings.py

diff --git a/generative_ai/embeddings/normalize_embeddings.py b/generative_ai/embeddings/normalize_embeddings.py
new file mode 100644
index 00000000000..51a907598b0
--- /dev/null
+++ b/generative_ai/embeddings/normalize_embeddings.py
@@ -0,0 +1,37 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+def normalize_embedding(embedding_np: np.ndarray) -> np.ndarray:
+    """
+    Normalizes an embedding array to have a magnitude (L2 norm) of 1.
+
+    Args:
+        embedding_np: The input NumPy array to be normalized.
+
+    Returns:
+        The normalized NumPy array with a magnitude of 1.
+        Returns the original vector if its magnitude is 0.
+    """
+    # Calculate the L2 norm (magnitude) of the vector
+    norm = np.linalg.norm(embedding_np)
+
+    # Avoid division by zero if the vector is all zeros
+    if norm == 0:
+        return embedding_np
+
+    # Divide the vector by its norm to normalize it
+    return embedding_np / norm

From 178bddbe7c48f8a90b5f9c440b4449358e3713d2 Mon Sep 17 00:00:00 2001
From: Tengxin Li
Date: Wed, 16 Jul 2025 15:00:13 -0700
Subject: [PATCH 2/2] update comments and add tests for embedding normalization

---
Review note: test_embedding_normalization now builds its inputs with
np.array(...) instead of np.linalg.norm(np.array(...)). The earlier form
passed 0-d scalars to normalize_embedding, so the function was never run
on an actual embedding vector. Line counts of every hunk are unchanged.

 .../embeddings/normalize_embeddings.py        | 10 ++++++----
 .../embeddings/test_embeddings_examples.py    | 17 +++++++++++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/generative_ai/embeddings/normalize_embeddings.py b/generative_ai/embeddings/normalize_embeddings.py
index 51a907598b0..0052f6b3ad1 100644
--- a/generative_ai/embeddings/normalize_embeddings.py
+++ b/generative_ai/embeddings/normalize_embeddings.py
@@ -24,14 +24,16 @@ def normalize_embedding(embedding_np: np.ndarray) -> np.ndarray:
 
     Returns:
         The normalized NumPy array with a magnitude of 1.
-        Returns the original vector if its magnitude is 0.
+        Returns the original array if its magnitude is 0.
     """
-    # Calculate the L2 norm (magnitude) of the vector
+    # Calculate the L2 norm (magnitude) of the array
     norm = np.linalg.norm(embedding_np)
 
-    # Avoid division by zero if the vector is all zeros
+    # Avoid division by zero if the array is all zeros
+    #
+    # An all-zeros embedding array does not exist in theory
     if norm == 0:
         return embedding_np
 
-    # Divide the vector by its norm to normalize it
+    # Divide the array by its norm to normalize it
     return embedding_np / norm
diff --git a/generative_ai/embeddings/test_embeddings_examples.py b/generative_ai/embeddings/test_embeddings_examples.py
index b430b978e2c..70b48f52dc1 100644
--- a/generative_ai/embeddings/test_embeddings_examples.py
+++ b/generative_ai/embeddings/test_embeddings_examples.py
@@ -31,6 +31,7 @@
 import multimodal_example
 import multimodal_image_example
 import multimodal_video_example
+import normalize_embeddings
 
 
 @backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
@@ -97,6 +98,22 @@ def test_code_embed_text() -> None:
     assert [len(e) for e in embeddings] == [dimensionality or 768] * len(texts)
 
 
+@backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
+def test_embedding_normalization() -> None:
+    import numpy as np
+
+    embedding_value = [0.01] * 256
+    embedding_np = np.array(embedding_value)
+    assert np.isclose(np.linalg.norm(embedding_np), 0.16)
+
+    normalized_embedding_np = normalize_embeddings.normalize_embedding(embedding_np)
+    assert np.isclose(np.linalg.norm(normalized_embedding_np), 1)
+
+    invalid_embedding_np = np.array([0])
+    normalized_embedding_np = normalize_embeddings.normalize_embedding(invalid_embedding_np)
+    assert np.isclose(np.linalg.norm(normalized_embedding_np), 0)
+
+
 @backoff.on_exception(backoff.expo, FailedPrecondition, max_time=300)
 def dispose(tuning_job) -> None:  # noqa: ANN001
     if tuning_job._status.name == "PIPELINE_STATE_RUNNING":