From 26cbfd900a02a9ef547bab19bd7fdeaa07c279be Mon Sep 17 00:00:00 2001 From: Vijay Daita Date: Mon, 10 Jun 2024 14:53:04 -0500 Subject: [PATCH 1/3] Adding embeddings --- repoqa/provider/embeddings/base.py | 9 +++++ repoqa/provider/embeddings/openai.py | 25 ++++++++++++++ repoqa/provider/request/openai.py | 19 +++++++++-- repoqa/search_needle_function.py | 49 ++++++++++++++++++++++++---- requirements.txt | 1 + 5 files changed, 94 insertions(+), 9 deletions(-) create mode 100644 repoqa/provider/embeddings/base.py create mode 100644 repoqa/provider/embeddings/openai.py diff --git a/repoqa/provider/embeddings/base.py b/repoqa/provider/embeddings/base.py new file mode 100644 index 0000000..3be0611 --- /dev/null +++ b/repoqa/provider/embeddings/base.py @@ -0,0 +1,9 @@ +from abc import ABC, abstractmethod +from typing import List + +class BaseEmbeddingsProvider(ABC): + @abstractmethod + def find_best_match( + self, description, snippets, threshold=0 + ) -> str: + ... \ No newline at end of file diff --git a/repoqa/provider/embeddings/openai.py b/repoqa/provider/embeddings/openai.py new file mode 100644 index 0000000..c8e7b1c --- /dev/null +++ b/repoqa/provider/embeddings/openai.py @@ -0,0 +1,25 @@ +import os +from typing import List, Tuple + +from openai import Client +from sklearn.metrics.pairwise import cosine_similarity +import numpy as np + +from repoqa.provider.embeddings.base import BaseEmbeddingsProvider +from repoqa.provider.request.openai import make_auto_embeddings_request + +class OpenAIEmbeddingsProvider(BaseEmbeddingsProvider): + def __init__(self, model, base_url: str = None): + self.model = model + self.client = Client( + api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=base_url + ) + + def find_best_match( + self, description, snippets, threshold=0 + ) -> str: + all_texts = [ description ] + snippets + embedded_texts = make_auto_embeddings_request(self.client, all_texts, self.model) + similarities = cosine_similarity([embedded_texts[0]], embedded_texts[1:])[0] + index = np.argmax(similarities) + return all_texts[index + 1] \ No newline at end of file diff --git a/repoqa/provider/request/openai.py b/repoqa/provider/request/openai.py index 6f6e213..9c6e47b 100644 --- a/repoqa/provider/request/openai.py +++ b/repoqa/provider/request/openai.py @@ -4,6 +4,7 @@ import signal import time +from typing import List import openai from openai.types.chat import ChatCompletion @@ -30,19 +31,25 @@ def make_request( **kwargs, ) +def make_embeddings_request( + client: openai.Client, + texts: List[str], + model: str, +) -> List[List[float]]: + response = client.embeddings.create(input=texts, model=model, encoding_format="float") + return [d.embedding for d in response.data] def handler(signum, frame): # swallow signum and frame raise Exception("end of time") - -def make_auto_request(*args, **kwargs) -> ChatCompletion: +def make_request_with_retry(func, *args, **kwargs) -> ChatCompletion | List[List[float]]: ret = None while ret is None: try: signal.signal(signal.SIGALRM, handler) signal.alarm(100) - ret = make_request(*args, **kwargs) + ret = func(*args, **kwargs) signal.alarm(0) except openai.RateLimitError: print("Rate limit exceeded. Waiting...") @@ -61,3 +68,9 @@ def make_auto_request(*args, **kwargs) -> ChatCompletion: signal.alarm(0) time.sleep(1) return ret + +def make_auto_request(*args, **kwargs) -> ChatCompletion: + return make_request_with_retry(make_request, *args, **kwargs) + +def make_auto_embeddings_request(*args, **kwargs) -> List[List[float]]: + return make_request_with_retry(make_embeddings_request, *args, **kwargs) \ No newline at end of file diff --git a/repoqa/search_needle_function.py b/repoqa/search_needle_function.py index 1507902..3e02c66 100644 --- a/repoqa/search_needle_function.py +++ b/repoqa/search_needle_function.py @@ -5,6 +5,7 @@ import json import os from typing import List, Tuple +import difflib from transformers import AutoTokenizer from tree_sitter_languages import get_language, get_parser @@ -29,6 +30,16 @@ " please retrieve and repeat the exact described function from the code context in a code block wrapped by ```:" ) +def _find_line(text, index): + if index < 0 or index >= len(text): + raise IndexError() + line = 0 + for i, ch in enumerate(text): + if i == index: + return line + if ch == "\n" or ch == "\r": + line += 1 + return line def _backward_tokenizable_lines(lines, tokenizer, max_tokens): """Return the text and tokens from bottom to top""" @@ -374,6 +385,8 @@ def evaluate_model( eval_ignore_comments: bool = False, # ignore comments during score computation trust_remote_code: bool = False, attn_implementation=None, + is_embedding: bool = False, + embedding_context_chunk_size: int = 30 ): if backend is None: if base_url is not None: @@ -515,9 +528,14 @@ def evaluate_model( return if backend == "openai": - from repoqa.provider.openai import OpenAIProvider + if is_embedding: + from repoqa.provider.embeddings.openai import OpenAIEmbeddingsProvider + + engine = OpenAIEmbeddingsProvider(model, base_url=base_url) + else: + from repoqa.provider.openai import OpenAIProvider - engine = OpenAIProvider(model, base_url=base_url) + engine = OpenAIProvider(model, base_url=base_url) elif backend == "vllm": from repoqa.provider.vllm import VllmProvider @@ -563,10 +581,29 @@ def evaluate_model( prompt = "" for key in task["template"].split("\n"): prompt += task[key] - - replies = engine.generate_reply( - prompt, n=1, max_tokens=max_new_tokens, system_msg=system_message - ) + + if is_embedding: + tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf") + tokenized_code_context = tokenizer.encode(task["code_context"]) + prefix = tokenizer.decode(tokenized_code_context[:task["needle_token_start"]]) + needle = tokenizer.decode(tokenized_code_context[task["needle_token_start"]:task["needle_token_end"]]) + suffix = tokenizer.decode(tokenized_code_context[task["needle_token_end"]:]) + + prefix_lines = prefix.splitlines() + suffix_lines = suffix.splitlines() + + prefix_split = ["\n".join(prefix_lines[line:min(line + embedding_context_chunk_size, len(prefix_lines))]) for line in range(0, len(prefix_lines), embedding_context_chunk_size)] + suffix_split = ["\n".join(suffix_lines[line:min(line + embedding_context_chunk_size, len(suffix_lines))]) for line in range(0, len(suffix_lines), embedding_context_chunk_size)] + snippets = prefix_split + [needle] + suffix_split + + replies = engine.find_best_match( + task["description"], + snippets + ) + else: + replies = engine.generate_reply( + prompt, n=1, max_tokens=max_new_tokens, system_msg=system_message + ) result = {**task, "output": replies} f_out.write(json.dumps(result) + "\n") f_out.flush() diff --git a/requirements.txt b/requirements.txt index d7e81ac..e9c3b1f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ anthropic google-generativeai vllm stop-sequencer +scikit-learn \ No newline at end of file From f94d064c5a83a1134695d1ee6fd9adab73801b65 Mon Sep 17 00:00:00 2001 From: Vijay Daita Date: Mon, 10 Jun 2024 15:11:49 -0500 Subject: [PATCH 2/3] Removing empty strings --- repoqa/search_needle_function.py | 1 + 1 file changed, 1 insertion(+) diff --git a/repoqa/search_needle_function.py b/repoqa/search_needle_function.py index 3e02c66..b7e48e3 100644 --- a/repoqa/search_needle_function.py +++ b/repoqa/search_needle_function.py @@ -595,6 +595,7 @@ def evaluate_model( prefix_split = ["\n".join(prefix_lines[line:min(line + embedding_context_chunk_size, len(prefix_lines))]) for line in range(0, len(prefix_lines), embedding_context_chunk_size)] suffix_split = ["\n".join(suffix_lines[line:min(line + embedding_context_chunk_size, len(suffix_lines))]) for line in range(0, len(suffix_lines), embedding_context_chunk_size)] snippets = prefix_split + [needle] + suffix_split + snippets = [snippet for snippet in snippets if len(snippet.strip()) > 0] replies = engine.find_best_match( task["description"], From a2cf56aad3976417ea42e869c99026d54c2576b6 Mon Sep 17 00:00:00 2001 From: Vijay Daita Date: Wed, 12 Jun 2024 19:10:54 -0500 Subject: [PATCH 3/3] changed variable name in search_needle_function, no longer using sklearn for cosine similarity --- repoqa/provider/embeddings/openai.py | 17 +++++++++++++---- repoqa/search_needle_function.py | 6 +++--- requirements.txt | 3 +-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/repoqa/provider/embeddings/openai.py b/repoqa/provider/embeddings/openai.py index c8e7b1c..49dc5c7 100644 --- a/repoqa/provider/embeddings/openai.py +++ b/repoqa/provider/embeddings/openai.py @@ -2,7 +2,6 @@ from typing import List, Tuple from openai import Client -from sklearn.metrics.pairwise import cosine_similarity import numpy as np from repoqa.provider.embeddings.base import BaseEmbeddingsProvider @@ -20,6 +19,16 @@ def find_best_match( ) -> str: all_texts = [ description ] + snippets embedded_texts = make_auto_embeddings_request(self.client, all_texts, self.model) - similarities = cosine_similarity([embedded_texts[0]], embedded_texts[1:])[0] - index = np.argmax(similarities) - return all_texts[index + 1] \ No newline at end of file + query_embedded = np.array(embedded_texts[0]) + max_similarity = 0 + max_sim_index = 0 + + query_norm = np.linalg.norm(query_embedded) + + for i in range(1, len(embedded_texts)): + similarity_score = (query_embedded @ np.array(embedded_texts[i])) / (query_norm * np.linalg.norm(embedded_texts[i])) # https://stackoverflow.com/questions/18424228/cosine-similarity-between-2-number-lists + if similarity_score > max_similarity: + max_similarity = similarity_score + max_sim_index = i + + return all_texts[max_sim_index] \ No newline at end of file diff --git a/repoqa/search_needle_function.py b/repoqa/search_needle_function.py index b7e48e3..38a5419 100644 --- a/repoqa/search_needle_function.py +++ b/repoqa/search_needle_function.py @@ -386,7 +386,7 @@ def evaluate_model( trust_remote_code: bool = False, attn_implementation=None, is_embedding: bool = False, - embedding_context_chunk_size: int = 30 + embedding_chunk_line_count: int = 30 ): if backend is None: if base_url is not None: @@ -592,8 +592,8 @@ def evaluate_model( prefix_lines = prefix.splitlines() suffix_lines = suffix.splitlines() - prefix_split = ["\n".join(prefix_lines[line:min(line + embedding_context_chunk_size, len(prefix_lines))]) for line in range(0, len(prefix_lines), embedding_context_chunk_size)] - suffix_split = ["\n".join(suffix_lines[line:min(line + embedding_context_chunk_size, len(suffix_lines))]) for line in range(0, len(suffix_lines), embedding_context_chunk_size)] + prefix_split = ["\n".join(prefix_lines[line:min(line + embedding_chunk_line_count, len(prefix_lines))]) for line in range(0, len(prefix_lines), embedding_chunk_line_count)] + suffix_split = ["\n".join(suffix_lines[line:min(line + embedding_chunk_line_count, len(suffix_lines))]) for line in range(0, len(suffix_lines), embedding_chunk_line_count)] snippets = prefix_split + [needle] + suffix_split snippets = [snippet for snippet in snippets if len(snippet.strip()) > 0] diff --git a/requirements.txt b/requirements.txt index e9c3b1f..2d18374 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,5 +11,4 @@ openai anthropic google-generativeai vllm -stop-sequencer -scikit-learn \ No newline at end of file +stop-sequencer \ No newline at end of file