From cf33c748f2c4050819aef32935a17f4097f49072 Mon Sep 17 00:00:00 2001
From: Feiyang
Date: Fri, 23 May 2025 23:39:26 +0000
Subject: [PATCH 1/6] Update model name to gemini-embedding-001 in code snippets

---
 generative_ai/embeddings/batch_example.py              | 2 +-
 generative_ai/embeddings/code_retrieval_example.py     | 4 ++--
 generative_ai/embeddings/document_retrieval_example.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/generative_ai/embeddings/batch_example.py b/generative_ai/embeddings/batch_example.py
index 91be92de79b..c18ecf8d523 100644
--- a/generative_ai/embeddings/batch_example.py
+++ b/generative_ai/embeddings/batch_example.py
@@ -39,7 +39,7 @@ def embed_text_batch() -> BatchPredictionJob:
     output_uri = OUTPUT_URI

     textembedding_model = language_models.TextEmbeddingModel.from_pretrained(
-        "textembedding-gecko@003"
+        "gemini-embedding-001"
     )

     batch_prediction_job = textembedding_model.batch_predict(
diff --git a/generative_ai/embeddings/code_retrieval_example.py b/generative_ai/embeddings/code_retrieval_example.py
index a8b7f8d213f..d4552a7c0ea 100644
--- a/generative_ai/embeddings/code_retrieval_example.py
+++ b/generative_ai/embeddings/code_retrieval_example.py
@@ -17,14 +17,14 @@
 # [START generativeaionvertexai_embedding_code_retrieval]
 from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

-MODEL_NAME = "text-embedding-005"
+MODEL_NAME = "gemini-embedding-001"
 DIMENSIONALITY = 256


 def embed_text(
     texts: list[str] = ["Retrieve a function that adds two numbers"],
     task: str = "CODE_RETRIEVAL_QUERY",
-    model_name: str = "text-embedding-005",
+    model_name: str = "gemini-embedding-001",
     dimensionality: int | None = 256,
 ) -> list[list[float]]:
     """Embeds texts with a pre-trained, foundational model."""
diff --git a/generative_ai/embeddings/document_retrieval_example.py b/generative_ai/embeddings/document_retrieval_example.py
index 9cdeba6220a..d64f16af91d 100644
--- a/generative_ai/embeddings/document_retrieval_example.py
+++ b/generative_ai/embeddings/document_retrieval_example.py
@@ -32,7 +32,7 @@ def embed_text() -> list[list[float]]:
     # The task type for embedding. Check the available tasks in the model's documentation.
     task = "RETRIEVAL_DOCUMENT"

-    model = TextEmbeddingModel.from_pretrained("text-embedding-005")
+    model = TextEmbeddingModel.from_pretrained("gemini-embedding-001")
     inputs = [TextEmbeddingInput(text, task) for text in texts]
     kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
     embeddings = model.get_embeddings(inputs, **kwargs)

From 1c31328fc418e44700c916a077ebb78a61237c93 Mon Sep 17 00:00:00 2001
From: Feiyang
Date: Sat, 24 May 2025 05:20:44 +0000
Subject: [PATCH 2/6] update to max 3072 dim

---
 generative_ai/embeddings/code_retrieval_example.py     | 4 ++--
 generative_ai/embeddings/document_retrieval_example.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/generative_ai/embeddings/code_retrieval_example.py b/generative_ai/embeddings/code_retrieval_example.py
index d4552a7c0ea..ec611f03e35 100644
--- a/generative_ai/embeddings/code_retrieval_example.py
+++ b/generative_ai/embeddings/code_retrieval_example.py
@@ -18,14 +18,14 @@
 from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

 MODEL_NAME = "gemini-embedding-001"
-DIMENSIONALITY = 256
+DIMENSIONALITY = 3072


 def embed_text(
     texts: list[str] = ["Retrieve a function that adds two numbers"],
     task: str = "CODE_RETRIEVAL_QUERY",
     model_name: str = "gemini-embedding-001",
-    dimensionality: int | None = 256,
+    dimensionality: int | None = 3072,
 ) -> list[list[float]]:
     """Embeds texts with a pre-trained, foundational model."""
diff --git a/generative_ai/embeddings/document_retrieval_example.py b/generative_ai/embeddings/document_retrieval_example.py
index d64f16af91d..18c25f75833 100644
--- a/generative_ai/embeddings/document_retrieval_example.py
+++ b/generative_ai/embeddings/document_retrieval_example.py
@@ -28,7 +28,7 @@ def embed_text() -> list[list[float]]:
     # A list of texts to be embedded.
     texts = ["banana muffins? ", "banana bread? banana muffins?"]
     # The dimensionality of the output embeddings.
-    dimensionality = 256
+    dimensionality = 3072
     # The task type for embedding. Check the available tasks in the model's documentation.
     task = "RETRIEVAL_DOCUMENT"

From 48b10462731f023f98f23ca3e4e9b4d34f1eefd0 Mon Sep 17 00:00:00 2001
From: Feiyang
Date: Tue, 27 May 2025 19:27:08 +0000
Subject: [PATCH 3/6] Update examples to process one input at a time

---
 .../embeddings/code_retrieval_example.py     | 16 ++++++++++++----
 .../embeddings/document_retrieval_example.py | 16 +++++++++++-----
 .../embeddings/test_embeddings_examples.py   |  2 +-
 3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/generative_ai/embeddings/code_retrieval_example.py b/generative_ai/embeddings/code_retrieval_example.py
index ec611f03e35..107aee6f5c7 100644
--- a/generative_ai/embeddings/code_retrieval_example.py
+++ b/generative_ai/embeddings/code_retrieval_example.py
@@ -31,10 +31,18 @@ def embed_text(
     model = TextEmbeddingModel.from_pretrained(model_name)
     inputs = [TextEmbeddingInput(text, task) for text in texts]
     kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
-    embeddings = model.get_embeddings(inputs, **kwargs)
-    # Example response:
-    # [[0.025890009477734566, -0.05553026497364044, 0.006374752148985863,...],
-    return [embedding.values for embedding in embeddings]
+
+    embeddings = []
+    # gemini-embedding-001 takes one input at a time
+    for text in texts:
+        text_input = TextEmbeddingInput(text, task)
+        embedding = model.get_embeddings([text_input], **kwargs)
+        print(embedding)
+        # Example response:
+        # [[0.006135190837085247, -0.01462465338408947, 0.004978656303137541, ...]]
+        embeddings.append(embedding[0].values)
+
+    return embeddings


 if __name__ == "__main__":
diff --git a/generative_ai/embeddings/document_retrieval_example.py b/generative_ai/embeddings/document_retrieval_example.py
index 18c25f75833..fdcb50e75cf 100644
--- a/generative_ai/embeddings/document_retrieval_example.py
+++ b/generative_ai/embeddings/document_retrieval_example.py
@@ -35,12 +35,18 @@ def embed_text() -> list[list[float]]:
     model = TextEmbeddingModel.from_pretrained("gemini-embedding-001")
     inputs = [TextEmbeddingInput(text, task) for text in texts]
     kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
-    embeddings = model.get_embeddings(inputs, **kwargs)
-    print(embeddings)
-    # Example response:
-    # [[0.006135190837085247, -0.01462465338408947, 0.004978656303137541, ...], [0.1234434666, ...]],
-    return [embedding.values for embedding in embeddings]
+    embeddings = []
+    # gemini-embedding-001 takes one input at a time
+    for text in texts:
+        text_input = TextEmbeddingInput(text, task)
+        embedding = model.get_embeddings([text_input], **kwargs)
+        print(embedding)
+        # Example response:
+        # [[0.006135190837085247, -0.01462465338408947, 0.004978656303137541, ...]]
+        embeddings.append(embedding[0].values)
+
+    return embeddings


 # [END generativeaionvertexai_embedding]
diff --git a/generative_ai/embeddings/test_embeddings_examples.py b/generative_ai/embeddings/test_embeddings_examples.py
index afa350e50db..b4472d25a56 100644
--- a/generative_ai/embeddings/test_embeddings_examples.py
+++ b/generative_ai/embeddings/test_embeddings_examples.py
@@ -81,7 +81,7 @@ def test_generate_embeddings_with_lower_dimension() -> None:
 @backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)
 def test_text_embed_text() -> None:
     embeddings = document_retrieval_example.embed_text()
-    assert [len(e) for e in embeddings] == [256, 256]
+    assert [len(e) for e in embeddings] == [3072, 3072]


 @backoff.on_exception(backoff.expo, ResourceExhausted, max_time=10)

From 41e322128d31c86580a2affa41e7ed5cac985c7e Mon Sep 17 00:00:00 2001
From: Feiyang
Date: Tue, 27 May 2025 23:04:00 +0000
Subject: [PATCH 4/6] Fix indentation and unused variable

---
 generative_ai/embeddings/code_retrieval_example.py     | 5 +++--
 generative_ai/embeddings/document_retrieval_example.py | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/generative_ai/embeddings/code_retrieval_example.py b/generative_ai/embeddings/code_retrieval_example.py
index 107aee6f5c7..565cad8c5e0 100644
--- a/generative_ai/embeddings/code_retrieval_example.py
+++ b/generative_ai/embeddings/code_retrieval_example.py
@@ -33,8 +33,9 @@ def embed_text(
     kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}

     embeddings = []
-    # gemini-embedding-001 takes one input at a time
-    for text in texts:
+
+    # gemini-embedding-001 takes one input at a time
+    for text in texts:
         text_input = TextEmbeddingInput(text, task)
         embedding = model.get_embeddings([text_input], **kwargs)
         print(embedding)
diff --git a/generative_ai/embeddings/document_retrieval_example.py b/generative_ai/embeddings/document_retrieval_example.py
index fdcb50e75cf..71e9d6e0a0c 100644
--- a/generative_ai/embeddings/document_retrieval_example.py
+++ b/generative_ai/embeddings/document_retrieval_example.py
@@ -33,7 +32,6 @@ def embed_text() -> list[list[float]]:
     task = "RETRIEVAL_DOCUMENT"

     model = TextEmbeddingModel.from_pretrained("gemini-embedding-001")
-    inputs = [TextEmbeddingInput(text, task) for text in texts]
     kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}
     embeddings = []

From e6b2843db7674417a053265bcb672f62404778b0 Mon Sep 17 00:00:00 2001
From: Feiyang
Date: Wed, 28 May 2025 21:04:24 +0000
Subject: [PATCH 5/6] remove unused variable

---
 generative_ai/embeddings/code_retrieval_example.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/generative_ai/embeddings/code_retrieval_example.py b/generative_ai/embeddings/code_retrieval_example.py
index 565cad8c5e0..0097ca96051 100644
--- a/generative_ai/embeddings/code_retrieval_example.py
+++ b/generative_ai/embeddings/code_retrieval_example.py
@@ -29,7 +29,6 @@ def embed_text(
 ) -> list[list[float]]:
     """Embeds texts with a pre-trained, foundational model."""
     model = TextEmbeddingModel.from_pretrained(model_name)
-    inputs = [TextEmbeddingInput(text, task) for text in texts]
     kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}

     embeddings = []

From 62b985c18f70c1eb11dcb48d00a60160bf6fcd70 Mon Sep 17 00:00:00 2001
From: Katie McLaughlin
Date: Thu, 29 May 2025 07:09:23 +1000
Subject: [PATCH 6/6] correct linting issue

---
 generative_ai/embeddings/code_retrieval_example.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/generative_ai/embeddings/code_retrieval_example.py b/generative_ai/embeddings/code_retrieval_example.py
index 0097ca96051..4bd88fa9366 100644
--- a/generative_ai/embeddings/code_retrieval_example.py
+++ b/generative_ai/embeddings/code_retrieval_example.py
@@ -32,7 +32,6 @@ def embed_text(
     kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {}

     embeddings = []
-    # gemini-embedding-001 takes one input at a time
     for text in texts:
         text_input = TextEmbeddingInput(text, task)
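
After this series, both retrieval samples default to gemini-embedding-001 with 3072-dimensional output and send a single input per get_embeddings() call. Below is a minimal usage sketch of the updated code_retrieval_example helper; it is not part of the patches, and the vertexai.init() call, the placeholder project ID and location, and the in-directory import are assumptions made only for illustration.

# Usage sketch (illustrative only, not part of this patch series).
# Assumes it is run from generative_ai/embeddings/ with application-default
# credentials and a project/location of your own.
import vertexai

import code_retrieval_example

vertexai.init(project="your-project-id", location="us-central1")

vectors = code_retrieval_example.embed_text(
    texts=["Retrieve a function that adds two numbers"],
    task="CODE_RETRIEVAL_QUERY",
    model_name="gemini-embedding-001",
    dimensionality=3072,
)

# One vector per input text, each expected to have 3072 values,
# matching the updated assertion in test_embeddings_examples.py.
print(len(vectors), len(vectors[0]))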