From dd7ac197d2ca9ea28ffeb7fc0cf8f0cc22674ef4 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Mon, 21 Oct 2024 13:23:18 -0400 Subject: [PATCH 1/2] parse dtype from kwargs --- redisvl/utils/vectorize/base.py | 6 ++--- redisvl/utils/vectorize/text/azureopenai.py | 18 ++++++++++---- redisvl/utils/vectorize/text/cohere.py | 10 ++++++-- redisvl/utils/vectorize/text/custom.py | 26 ++++++++++++++------- redisvl/utils/vectorize/text/huggingface.py | 9 +++++-- redisvl/utils/vectorize/text/mistral.py | 20 ++++++++++++---- redisvl/utils/vectorize/text/openai.py | 20 ++++++++++++---- redisvl/utils/vectorize/text/vertexai.py | 11 ++++++--- 8 files changed, 87 insertions(+), 33 deletions(-) diff --git a/redisvl/utils/vectorize/base.py b/redisvl/utils/vectorize/base.py index 5fcd1b4a..576a4c1d 100644 --- a/redisvl/utils/vectorize/base.py +++ b/redisvl/utils/vectorize/base.py @@ -81,11 +81,11 @@ def batchify(self, seq: list, size: int, preprocess: Optional[Callable] = None): else: yield seq[pos : pos + size] - def _process_embedding(self, embedding: List[float], as_buffer: bool, **kwargs): + def _process_embedding(self, embedding: List[float], as_buffer: bool, dtype: Optional[str]): if as_buffer: - if "dtype" not in kwargs: + if not dtype: raise RuntimeError( "dtype is required if converting from float to byte string." ) - return array_to_buffer(embedding, kwargs["dtype"]) + return array_to_buffer(embedding, dtype) return embedding diff --git a/redisvl/utils/vectorize/text/azureopenai.py b/redisvl/utils/vectorize/text/azureopenai.py index 3129c0b0..744da1df 100644 --- a/redisvl/utils/vectorize/text/azureopenai.py +++ b/redisvl/utils/vectorize/text/azureopenai.py @@ -189,12 +189,14 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") + + dtype = kwargs.pop("dtype", None) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = self._client.embeddings.create(input=batch, model=self.model) embeddings += [ - self._process_embedding(r.embedding, as_buffer, **kwargs) + self._process_embedding(r.embedding, as_buffer, dtype) for r in response.data ] return embeddings @@ -231,8 +233,11 @@ def embed( if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + result = self._client.embeddings.create(input=[text], model=self.model) - return self._process_embedding(result.data[0].embedding, as_buffer, **kwargs) + return self._process_embedding(result.data[0].embedding, as_buffer, dtype) @retry( wait=wait_random_exponential(min=1, max=60), @@ -268,6 +273,8 @@ async def aembed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") + + dtype = kwargs.pop("dtype", None) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): @@ -275,7 +282,7 @@ async def aembed_many( input=batch, model=self.model ) embeddings += [ - self._process_embedding(r.embedding, as_buffer, **kwargs) + self._process_embedding(r.embedding, as_buffer, dtype) for r in response.data ] return embeddings @@ -312,8 +319,11 @@ async def aembed( if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + result = await self._aclient.embeddings.create(input=[text], model=self.model) - return self._process_embedding(result.data[0].embedding, as_buffer, **kwargs) + return 
self._process_embedding(result.data[0].embedding, as_buffer, dtype) @property def type(self) -> str: diff --git a/redisvl/utils/vectorize/text/cohere.py b/redisvl/utils/vectorize/text/cohere.py index 94584b91..eb2a42c0 100644 --- a/redisvl/utils/vectorize/text/cohere.py +++ b/redisvl/utils/vectorize/text/cohere.py @@ -155,12 +155,16 @@ def embed( "Must pass in a str value for cohere embedding input_type. \ See https://docs.cohere.com/reference/embed." ) + if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + embedding = self._client.embed( texts=[text], model=self.model, input_type=input_type ).embeddings[0] - return self._process_embedding(embedding, as_buffer, **kwargs) + return self._process_embedding(embedding, as_buffer, dtype) @retry( wait=wait_random_exponential(min=1, max=60), @@ -223,6 +227,8 @@ def embed_many( "Must pass in a str value for cohere embedding input_type.\ See https://docs.cohere.com/reference/embed." ) + + dtype = kwargs.pop("dtype", None) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): @@ -230,7 +236,7 @@ def embed_many( texts=batch, model=self.model, input_type=input_type ) embeddings += [ - self._process_embedding(embedding, as_buffer, **kwargs) + self._process_embedding(embedding, as_buffer, dtype) for embedding in response.embeddings ] return embeddings diff --git a/redisvl/utils/vectorize/text/custom.py b/redisvl/utils/vectorize/text/custom.py index 8dc42c12..dd08e5de 100644 --- a/redisvl/utils/vectorize/text/custom.py +++ b/redisvl/utils/vectorize/text/custom.py @@ -172,9 +172,11 @@ def embed( if preprocess: text = preprocess(text) - else: - result = self._embed_func(text, **kwargs) - return self._process_embedding(result, as_buffer, **kwargs) + + dtype = kwargs.pop("dtype", None) + + result = self._embed_func(text, **kwargs) + return self._process_embedding(result, as_buffer, dtype) def embed_many( self, @@ -209,12 +211,14 @@ def embed_many( if not self._embed_many_func: raise NotImplementedError + + dtype = kwargs.pop("dtype", None) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): results = self._embed_many_func(batch, **kwargs) embeddings += [ - self._process_embedding(r, as_buffer, **kwargs) for r in results + self._process_embedding(r, as_buffer, dtype) for r in results ] return embeddings @@ -249,9 +253,11 @@ async def aembed( if preprocess: text = preprocess(text) - else: - result = await self._aembed_func(text, **kwargs) - return self._process_embedding(result, as_buffer, **kwargs) + + dtype = kwargs.pop("dtype", None) + + result = await self._aembed_func(text, **kwargs) + return self._process_embedding(result, as_buffer, dtype) async def aembed_many( self, @@ -286,12 +292,14 @@ async def aembed_many( if not self._aembed_many_func: raise NotImplementedError - + + dtype = kwargs.pop("dtype", None) + embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): results = await self._aembed_many_func(batch, **kwargs) embeddings += [ - self._process_embedding(r, as_buffer, **kwargs) for r in results + self._process_embedding(r, as_buffer, dtype) for r in results ] return embeddings diff --git a/redisvl/utils/vectorize/text/huggingface.py b/redisvl/utils/vectorize/text/huggingface.py index fdeb0b64..630b1cb6 100644 --- a/redisvl/utils/vectorize/text/huggingface.py +++ b/redisvl/utils/vectorize/text/huggingface.py @@ -99,8 +99,11 @@ def embed( if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + embedding = 
self._client.encode([text], **kwargs)[0] - return self._process_embedding(embedding.tolist(), as_buffer, **kwargs) + return self._process_embedding(embedding.tolist(), as_buffer, dtype) def embed_many( self, @@ -132,13 +135,15 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") + + dtype = kwargs.pop("dtype", None) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): batch_embeddings = self._client.encode(batch, **kwargs) embeddings.extend( [ - self._process_embedding(embedding.tolist(), as_buffer, **kwargs) + self._process_embedding(embedding.tolist(), as_buffer, dtype) for embedding in batch_embeddings ] ) diff --git a/redisvl/utils/vectorize/text/mistral.py b/redisvl/utils/vectorize/text/mistral.py index 7d4f00f5..41331353 100644 --- a/redisvl/utils/vectorize/text/mistral.py +++ b/redisvl/utils/vectorize/text/mistral.py @@ -139,12 +139,14 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + + dtype = kwargs.pop("dtype", None) + embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = self._client.embeddings(model=self.model, input=batch) embeddings += [ - self._process_embedding(r.embedding, as_buffer, **kwargs) + self._process_embedding(r.embedding, as_buffer, dtype) for r in response.data ] return embeddings @@ -181,8 +183,11 @@ def embed( if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + result = self._client.embeddings(model=self.model, input=[text]) - return self._process_embedding(result.data[0].embedding, as_buffer, **kwargs) + return self._process_embedding(result.data[0].embedding, as_buffer, dtype) @retry( wait=wait_random_exponential(min=1, max=60), @@ -218,12 +223,14 @@ async def aembed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") + + dtype = kwargs.pop("dtype", None) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = await self._aclient.embeddings(model=self.model, input=batch) embeddings += [ - self._process_embedding(r.embedding, as_buffer, **kwargs) + self._process_embedding(r.embedding, as_buffer, dtype) for r in response.data ] return embeddings @@ -260,8 +267,11 @@ async def aembed( if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + result = await self._aclient.embeddings(model=self.model, input=[text]) - return self._process_embedding(result.data[0].embedding, as_buffer, **kwargs) + return self._process_embedding(result.data[0].embedding, as_buffer, dtype) @property def type(self) -> str: diff --git a/redisvl/utils/vectorize/text/openai.py b/redisvl/utils/vectorize/text/openai.py index ae5d19dc..837ac92d 100644 --- a/redisvl/utils/vectorize/text/openai.py +++ b/redisvl/utils/vectorize/text/openai.py @@ -143,12 +143,14 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + + dtype = kwargs.pop("dtype", None) + embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = 
self._client.embeddings.create(input=batch, model=self.model) embeddings += [ - self._process_embedding(r.embedding, as_buffer, **kwargs) + self._process_embedding(r.embedding, as_buffer, dtype) for r in response.data ] return embeddings @@ -185,8 +187,11 @@ def embed( if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + result = self._client.embeddings.create(input=[text], model=self.model) - return self._process_embedding(result.data[0].embedding, as_buffer, **kwargs) + return self._process_embedding(result.data[0].embedding, as_buffer, dtype) @retry( wait=wait_random_exponential(min=1, max=60), @@ -222,6 +227,8 @@ async def aembed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") + + dtype = kwargs.pop("dtype", None) embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): @@ -229,7 +236,7 @@ async def aembed_many( input=batch, model=self.model ) embeddings += [ - self._process_embedding(r.embedding, as_buffer, **kwargs) + self._process_embedding(r.embedding, as_buffer, dtype) for r in response.data ] return embeddings @@ -266,8 +273,11 @@ async def aembed( if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + result = await self._aclient.embeddings.create(input=[text], model=self.model) - return self._process_embedding(result.data[0].embedding, as_buffer, **kwargs) + return self._process_embedding(result.data[0].embedding, as_buffer, dtype) @property def type(self) -> str: diff --git a/redisvl/utils/vectorize/text/vertexai.py b/redisvl/utils/vectorize/text/vertexai.py index 71e2e433..d174ffc1 100644 --- a/redisvl/utils/vectorize/text/vertexai.py +++ b/redisvl/utils/vectorize/text/vertexai.py @@ -150,12 +150,14 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + + dtype = kwargs.pop("dtype", None) + embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = self._client.get_embeddings(batch) embeddings += [ - self._process_embedding(r.values, as_buffer, **kwargs) for r in response + self._process_embedding(r.values, as_buffer, dtype) for r in response ] return embeddings @@ -191,8 +193,11 @@ def embed( if preprocess: text = preprocess(text) + + dtype = kwargs.pop("dtype", None) + result = self._client.get_embeddings([text]) - return self._process_embedding(result[0].values, as_buffer, **kwargs) + return self._process_embedding(result[0].values, as_buffer, dtype) @property def type(self) -> str: From 90626af5ac438d7891d5410d1a29d5f5f72f1428 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Mon, 21 Oct 2024 13:59:09 -0400 Subject: [PATCH 2/2] fix formatting --- redisvl/utils/vectorize/base.py | 4 +++- redisvl/utils/vectorize/text/azureopenai.py | 8 ++++---- redisvl/utils/vectorize/text/cohere.py | 8 ++++---- redisvl/utils/vectorize/text/custom.py | 8 ++++---- redisvl/utils/vectorize/text/huggingface.py | 2 +- redisvl/utils/vectorize/text/mistral.py | 10 +++++----- redisvl/utils/vectorize/text/openai.py | 10 +++++----- redisvl/utils/vectorize/text/vertexai.py | 6 +++--- 8 files changed, 29 insertions(+), 27 deletions(-) diff --git a/redisvl/utils/vectorize/base.py b/redisvl/utils/vectorize/base.py index 576a4c1d..238aa6a0 100644 --- a/redisvl/utils/vectorize/base.py +++ 
b/redisvl/utils/vectorize/base.py @@ -81,7 +81,9 @@ def batchify(self, seq: list, size: int, preprocess: Optional[Callable] = None): else: yield seq[pos : pos + size] - def _process_embedding(self, embedding: List[float], as_buffer: bool, dtype: Optional[str]): + def _process_embedding( + self, embedding: List[float], as_buffer: bool, dtype: Optional[str] + ): if as_buffer: if not dtype: raise RuntimeError( diff --git a/redisvl/utils/vectorize/text/azureopenai.py b/redisvl/utils/vectorize/text/azureopenai.py index 744da1df..a387e238 100644 --- a/redisvl/utils/vectorize/text/azureopenai.py +++ b/redisvl/utils/vectorize/text/azureopenai.py @@ -189,7 +189,7 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + dtype = kwargs.pop("dtype", None) embeddings: List = [] @@ -233,7 +233,7 @@ def embed( if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) result = self._client.embeddings.create(input=[text], model=self.model) @@ -273,7 +273,7 @@ async def aembed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + dtype = kwargs.pop("dtype", None) embeddings: List = [] @@ -319,7 +319,7 @@ async def aembed( if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) result = await self._aclient.embeddings.create(input=[text], model=self.model) diff --git a/redisvl/utils/vectorize/text/cohere.py b/redisvl/utils/vectorize/text/cohere.py index eb2a42c0..469035fa 100644 --- a/redisvl/utils/vectorize/text/cohere.py +++ b/redisvl/utils/vectorize/text/cohere.py @@ -155,12 +155,12 @@ def embed( "Must pass in a str value for cohere embedding input_type. \ See https://docs.cohere.com/reference/embed." ) - + if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) - + embedding = self._client.embed( texts=[text], model=self.model, input_type=input_type ).embeddings[0] @@ -227,7 +227,7 @@ def embed_many( "Must pass in a str value for cohere embedding input_type.\ See https://docs.cohere.com/reference/embed." 
) - + dtype = kwargs.pop("dtype", None) embeddings: List = [] diff --git a/redisvl/utils/vectorize/text/custom.py b/redisvl/utils/vectorize/text/custom.py index dd08e5de..a950f6df 100644 --- a/redisvl/utils/vectorize/text/custom.py +++ b/redisvl/utils/vectorize/text/custom.py @@ -211,7 +211,7 @@ def embed_many( if not self._embed_many_func: raise NotImplementedError - + dtype = kwargs.pop("dtype", None) embeddings: List = [] @@ -253,7 +253,7 @@ async def aembed( if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) result = await self._aembed_func(text, **kwargs) @@ -292,9 +292,9 @@ async def aembed_many( if not self._aembed_many_func: raise NotImplementedError - + dtype = kwargs.pop("dtype", None) - + embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): results = await self._aembed_many_func(batch, **kwargs) diff --git a/redisvl/utils/vectorize/text/huggingface.py b/redisvl/utils/vectorize/text/huggingface.py index 630b1cb6..b570a03f 100644 --- a/redisvl/utils/vectorize/text/huggingface.py +++ b/redisvl/utils/vectorize/text/huggingface.py @@ -135,7 +135,7 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + dtype = kwargs.pop("dtype", None) embeddings: List = [] diff --git a/redisvl/utils/vectorize/text/mistral.py b/redisvl/utils/vectorize/text/mistral.py index 41331353..28377778 100644 --- a/redisvl/utils/vectorize/text/mistral.py +++ b/redisvl/utils/vectorize/text/mistral.py @@ -139,9 +139,9 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + dtype = kwargs.pop("dtype", None) - + embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = self._client.embeddings(model=self.model, input=batch) @@ -183,7 +183,7 @@ def embed( if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) result = self._client.embeddings(model=self.model, input=[text]) @@ -223,7 +223,7 @@ async def aembed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + dtype = kwargs.pop("dtype", None) embeddings: List = [] @@ -267,7 +267,7 @@ async def aembed( if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) result = await self._aclient.embeddings(model=self.model, input=[text]) diff --git a/redisvl/utils/vectorize/text/openai.py b/redisvl/utils/vectorize/text/openai.py index 837ac92d..aad29198 100644 --- a/redisvl/utils/vectorize/text/openai.py +++ b/redisvl/utils/vectorize/text/openai.py @@ -143,9 +143,9 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + dtype = kwargs.pop("dtype", None) - + embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = self._client.embeddings.create(input=batch, model=self.model) @@ -187,7 +187,7 @@ def embed( if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) result = self._client.embeddings.create(input=[text], model=self.model) @@ -227,7 +227,7 @@ async def aembed_many( raise TypeError("Must pass in a list of str values to 
embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + dtype = kwargs.pop("dtype", None) embeddings: List = [] @@ -273,7 +273,7 @@ async def aembed( if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) result = await self._aclient.embeddings.create(input=[text], model=self.model) diff --git a/redisvl/utils/vectorize/text/vertexai.py b/redisvl/utils/vectorize/text/vertexai.py index d174ffc1..f0c3c475 100644 --- a/redisvl/utils/vectorize/text/vertexai.py +++ b/redisvl/utils/vectorize/text/vertexai.py @@ -150,9 +150,9 @@ def embed_many( raise TypeError("Must pass in a list of str values to embed.") if len(texts) > 0 and not isinstance(texts[0], str): raise TypeError("Must pass in a list of str values to embed.") - + dtype = kwargs.pop("dtype", None) - + embeddings: List = [] for batch in self.batchify(texts, batch_size, preprocess): response = self._client.get_embeddings(batch) @@ -193,7 +193,7 @@ def embed( if preprocess: text = preprocess(text) - + dtype = kwargs.pop("dtype", None) result = self._client.get_embeddings([text])