diff --git a/src/codegate/extract_snippets/body_extractor.py b/src/codegate/extract_snippets/body_extractor.py index adc4f3a9..be0c1884 100644 --- a/src/codegate/extract_snippets/body_extractor.py +++ b/src/codegate/extract_snippets/body_extractor.py @@ -6,6 +6,7 @@ ClineCodeSnippetExtractor, CodeSnippetExtractor, DefaultCodeSnippetExtractor, + KoduCodeSnippetExtractor, OpenInterpreterCodeSnippetExtractor, ) @@ -39,6 +40,19 @@ def _extract_from_user_messages(self, data: dict) -> set[str]: filenames.extend(extracted_snippets.keys()) return set(filenames) + def _extract_from_list_user_messages(self, data: dict) -> set[str]: + filenames: List[str] = [] + for msg in data.get("messages", []): + if msg.get("role", "") == "user": + msgs_content = msg.get("content", []) + for msg_content in msgs_content: + if msg_content.get("type", "") == "text": + extracted_snippets = self._snippet_extractor.extract_unique_snippets( + msg_content.get("text") + ) + filenames.extend(extracted_snippets.keys()) + return set(filenames) + @abstractmethod def extract_unique_filenames(self, data: dict) -> set[str]: """ @@ -70,27 +84,8 @@ class ClineBodySnippetExtractor(BodyCodeSnippetExtractor): def __init__(self): self._snippet_extractor = ClineCodeSnippetExtractor() - def _extract_from_user_messages(self, data: dict) -> set[str]: - """ - The method extracts the code snippets from the user messages in the data got from Cline. - - It returns a set of filenames extracted from the code snippets. - """ - - filenames: List[str] = [] - for msg in data.get("messages", []): - if msg.get("role", "") == "user": - msgs_content = msg.get("content", []) - for msg_content in msgs_content: - if msg_content.get("type", "") == "text": - extracted_snippets = self._snippet_extractor.extract_unique_snippets( - msg_content.get("text") - ) - filenames.extend(extracted_snippets.keys()) - return set(filenames) - def extract_unique_filenames(self, data: dict) -> set[str]: - return self._extract_from_user_messages(data) + return self._extract_from_list_user_messages(data) class OpenInterpreterBodySnippetExtractor(BodyCodeSnippetExtractor): @@ -136,3 +131,12 @@ def extract_unique_filenames(self, data: dict) -> set[str]: ) filenames.extend(extracted_snippets.keys()) return set(filenames) + + +class KoduBodySnippetExtractor(BodyCodeSnippetExtractor): + + def __init__(self): + self._snippet_extractor = KoduCodeSnippetExtractor() + + def extract_unique_filenames(self, data: dict) -> set[str]: + return self._extract_from_list_user_messages(data) diff --git a/src/codegate/extract_snippets/factory.py b/src/codegate/extract_snippets/factory.py index 5f5f0231..000f1809 100644 --- a/src/codegate/extract_snippets/factory.py +++ b/src/codegate/extract_snippets/factory.py @@ -4,6 +4,7 @@ BodyCodeSnippetExtractor, ClineBodySnippetExtractor, ContinueBodySnippetExtractor, + KoduBodySnippetExtractor, OpenInterpreterBodySnippetExtractor, ) from codegate.extract_snippets.message_extractor import ( @@ -11,6 +12,7 @@ ClineCodeSnippetExtractor, CodeSnippetExtractor, DefaultCodeSnippetExtractor, + KoduCodeSnippetExtractor, OpenInterpreterCodeSnippetExtractor, ) @@ -24,6 +26,7 @@ def create_snippet_extractor(detected_client: ClientType) -> BodyCodeSnippetExtr ClientType.CLINE: ClineBodySnippetExtractor(), ClientType.AIDER: AiderBodySnippetExtractor(), ClientType.OPEN_INTERPRETER: OpenInterpreterBodySnippetExtractor(), + ClientType.KODU: KoduBodySnippetExtractor(), } return mapping_client_extractor.get(detected_client, ContinueBodySnippetExtractor()) @@ -37,5 +40,6 @@ def create_snippet_extractor(detected_client: ClientType) -> CodeSnippetExtracto ClientType.CLINE: ClineCodeSnippetExtractor(), ClientType.AIDER: AiderCodeSnippetExtractor(), ClientType.OPEN_INTERPRETER: OpenInterpreterCodeSnippetExtractor(), + ClientType.KODU: KoduCodeSnippetExtractor(), } return mapping_client_extractor.get(detected_client, DefaultCodeSnippetExtractor()) diff --git a/src/codegate/extract_snippets/message_extractor.py b/src/codegate/extract_snippets/message_extractor.py index e9a7c968..bea5a2f2 100644 --- a/src/codegate/extract_snippets/message_extractor.py +++ b/src/codegate/extract_snippets/message_extractor.py @@ -69,6 +69,13 @@ re.DOTALL, ) +KODU_CONTENT_PATTERN = re.compile( + r"[^\n>]+)\">" # Match the opening tag with path attribute + r"(?P.*?)" # Match the content (non-greedy) + r"", # Match the closing tag + re.DOTALL, +) + class MatchedPatternSnippet(BaseModel): """ @@ -343,3 +350,21 @@ def _get_match_pattern_snippet(self, match: re.Match) -> MatchedPatternSnippet: filename = match.group("filename") content = match.group("content") return MatchedPatternSnippet(language=matched_language, filename=filename, content=content) + + +class KoduCodeSnippetExtractor(CodeSnippetExtractor): + + @property + def codeblock_pattern(self) -> re.Pattern: + return [KODU_CONTENT_PATTERN] + + @property + def codeblock_with_filename_pattern(self) -> re.Pattern: + return [KODU_CONTENT_PATTERN] + + def _get_match_pattern_snippet(self, match: re.Match) -> MatchedPatternSnippet: + # We don't have language in the cline pattern + matched_language = None + filename = match.group("filename") + content = match.group("content") + return MatchedPatternSnippet(language=matched_language, filename=filename, content=content) diff --git a/tests/extract_snippets/test_body_extractor.py b/tests/extract_snippets/test_body_extractor.py index bc5738d4..1aa48bc7 100644 --- a/tests/extract_snippets/test_body_extractor.py +++ b/tests/extract_snippets/test_body_extractor.py @@ -5,6 +5,7 @@ from codegate.extract_snippets.body_extractor import ( ClineBodySnippetExtractor, ContinueBodySnippetExtractor, + KoduBodySnippetExtractor, OpenInterpreterBodySnippetExtractor, ) @@ -213,3 +214,72 @@ def test_body_extract_continue_snippets(test_case: BodyCodeSnippetTest): extractor = ContinueBodySnippetExtractor() filenames = extractor.extract_unique_filenames(test_case.input_body_dict) _evaluate_actual_filenames(filenames, test_case) + + +@pytest.mark.parametrize( + "test_case", + [ + # Analyze processed snippets from Kodu + BodyCodeSnippetTest( + input_body_dict={ + "messages": [ + {"role": "system", "content": "You are Kodu, an autonomous coding agent."}, + { + "role": "user", + "content": [ + { + "type": "text", + "text": """ +Here is our task for this conversation, you must remember it all time unless i tell you otherwise. + +please analyze + + - Super critical information, the files attached here are part of the task and need to be + - The URLs attached here need to be scrapped and the information should be used for the + - The files passed in context are provided to help you understand the task better, the + import invokehttp +import fastapi +from fastapi import FastAPI, Request, Response, HTTPException +import numpy + +GITHUB_TOKEN="ghp_1J9Z3Z2dfg4dfs23dsfsdf232aadfasdfasfasdf32" + +def add(a, b): + return a + b + +def multiply(a, b): + return a * b + + + +def substract(a, b): + + + + + + """, + } + ], + }, + { + "type": "text", + "text": """ +You must use a tool to proceed. Either use attempt_completion if you've completed the task, +or ask_followup_question if you need more information. you must adhere to the tool format +value1value2 +... additional parameters as needed in the same format +... +""", + }, + ] + }, + expected_count=1, + expected=["testing_file.py"], + ), + ], +) +def test_body_extract_kodu_snippets(test_case: BodyCodeSnippetTest): + extractor = KoduBodySnippetExtractor() + filenames = extractor.extract_unique_filenames(test_case.input_body_dict) + _evaluate_actual_filenames(filenames, test_case) diff --git a/tests/extract_snippets/test_message_extractor.py b/tests/extract_snippets/test_message_extractor.py index 07e4d8b3..db21896b 100644 --- a/tests/extract_snippets/test_message_extractor.py +++ b/tests/extract_snippets/test_message_extractor.py @@ -7,6 +7,7 @@ ClineCodeSnippetExtractor, CodeSnippet, DefaultCodeSnippetExtractor, + KoduCodeSnippetExtractor, OpenInterpreterCodeSnippetExtractor, ) @@ -714,6 +715,59 @@ def test_extract_openinterpreter_snippets(test_case: CodeSnippetTest): _evaluate_actual_snippets(snippets, test_case) +@pytest.mark.parametrize( + "test_case", + [ + # Analyze processed snippets from OpenInterpreter + CodeSnippetTest( + input_message=""" +Here is our task for this conversation, you must remember it all time unless i tell you otherwise. + +please analyze + + - Super critical information, the files attached here are part of the task and need to be + - The URLs attached here need to be scrapped and the information should be used for the + - The files passed in context are provided to help you understand the task better, the + import invokehttp +import fastapi +from fastapi import FastAPI, Request, Response, HTTPException +import numpy + +GITHUB_TOKEN="ghp_1J9Z3Z2dfg4dfs23dsfsdf232aadfasdfasfasdf32" + +def add(a, b): + return a + b + +def multiply(a, b): + return a * b + + + +def substract(a, b): + + + + + + """, + expected_count=1, + expected=[ + CodeSnippet( + language="python", + filepath="testing_file.py", + code="def multiply(a, b):", + file_extension=".py", + ), + ], + ), + ], +) +def test_extract_kodu_snippets(test_case: CodeSnippetTest): + extractor = KoduCodeSnippetExtractor() + snippets = extractor.extract_snippets(test_case.input_message, require_filepath=True) + _evaluate_actual_snippets(snippets, test_case) + + @pytest.mark.parametrize( "filepath,expected", [