Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit 4aa0947

Browse files
committed
fix problems with snippet language detection and search
1 parent cfb343c commit 4aa0947

File tree

8 files changed: 17 additions and 21 deletions

data/archived.jsonl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
{"name":"@prefix/archived-npm-dummy","type":"npm","description":"Dummy archived to test with encoded package name on npm"}
22
{"name":"archived-npm-dummy","type":"npm","description":"Dummy archived to test with simple package name on npm"}
33
{"name":"@prefix/archived-pypi-dummy","type":"pypi","description":"Dummy archived to test with encoded package name on pypi"}
4-
{"name":"archived-pypi-dummy","type":"pypi","description":"Dummy archived to test with simple package name on pypi"}
4+
{"name":"archived_pypi_dummy","type":"pypi","description":"Dummy archived to test with simple package name on pypi"}
55
{"name":"@prefix/archived-maven-dummy","type":"maven","description":"Dummy archived to test with encoded package name on maven"}
66
{"name":"archived-maven-dummy","type":"maven","description":"Dummy archived to test with simple package name on maven"}
77
{"name":"github.com/archived-go-dummy","type":"npm","description":"Dummy archived to test with encoded package name on go"}

data/deprecated.jsonl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{"name":"@prefix/deprecated-npm-dummy","type":"npm","description":"Dummy deprecated to test with encoded package name on npm"}
22
{"name":"deprecated-npm-dummy","type":"npm","description":"Dummy deprecated to test with simple package name on npm"}
33
{"name":"@prefix/deprecated-pypi-dummy","type":"pypi","description":"Dummy deprecated to test with encoded package name on pypi"}
4-
{"name":"deprecated-pypi-dummy","type":"pypi","description":"Dummy deprecated to test with simple package name on pypi"}
4+
{"name":"deprecated_pypi_dummy","type":"pypi","description":"Dummy deprecated to test with simple package name on pypi"}
55
{"name":"@prefix/deprecated-maven-dummy","type":"maven","description":"Dummy deprecated to test with encoded package name on maven"}
66
{"name":"deprecated-maven-dummy","type":"maven","description":"Dummy deprecated to test with simple package name on maven"}
77
{"name":"github.com/deprecated-go-dummy","type":"npm","description":"Dummy deprecated to test with encoded package name on go"}

data/malicious.jsonl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{"name":"@prefix/malicious-npm-dummy","type":"npm","description":"Dummy malicious to test with encoded package name on npm"}
22
{"name":"malicious-npm-dummy","type":"npm","description":"Dummy malicious to test with simple package name on npm"}
33
{"name":"@prefix/malicious-pypi-dummy","type":"pypi","description":"Dummy malicious to test with encoded package name on pypi"}
4-
{"name":"malicious-pypi-dummy","type":"pypi","description":"Dummy malicious to test with simple package name on pypi"}
4+
{"name":"malicious_pypi_dummy","type":"pypi","description":"Dummy malicious to test with simple package name on pypi"}
55
{"name":"@prefix/malicious-maven-dummy","type":"maven","description":"Dummy malicious to test with encoded package name on maven"}
66
{"name":"malicious-maven-dummy","type":"maven","description":"Dummy malicious to test with simple package name on maven"}
77
{"name":"github.com/malicious-go-dummy","type":"go","description":"Dummy malicious to test with encoded package name on go"}

poetry.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ tree-sitter-javascript = ">=0.23.1"
2727
tree-sitter-python = ">=0.23.6"
2828
tree-sitter-rust = ">=0.23.2"
2929
sqlite-vec-sl-tmp = "^0.0.4"
30+
pygments = "^2.19.1"
3031

3132
[tool.poetry.group.dev.dependencies]
3233
pytest = ">=7.4.0"

src/codegate/pipeline/codegate_context_retriever/codegate.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import json
22
import re
3+
import malicious_pypi_dummy
4+
35

46
import structlog
57
from litellm import ChatCompletionRequest
@@ -72,18 +74,17 @@ async def process(
7274

7375
bad_snippet_packages = []
7476
if len(snippets) > 0:
77+
snippet_language = snippets[0].language
7578
# Collect all packages referenced in the snippets
7679
snippet_packages = []
7780
for snippet in snippets:
7881
snippet_packages.extend(
7982
PackageExtractor.extract_packages(snippet.code, snippet.language) # type: ignore
8083
)
81-
logger.info(f"Found {len(snippet_packages)} packages in code snippets.")
8284

85+
logger.info(f"Found {len(snippet_packages)} packages for language {snippet_language} in code snippets.")
8386
# Find bad packages in the snippets
84-
bad_snippet_packages = await storage_engine.search(
85-
language=snippets[0].language, packages=snippet_packages # type: ignore
86-
)
87+
bad_snippet_packages = await storage_engine.search(language=snippet_language, packages=snippet_packages) # type: ignore
8788
logger.info(f"Found {len(bad_snippet_packages)} bad packages in code snippets.")
8889

8990
# Remove code snippets from the user messages and search for bad packages

src/codegate/pipeline/extract_snippets/extract_snippets.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import re
3+
from pygments.lexers import guess_lexer
34
from typing import List, Optional
45

56
import structlog
@@ -85,7 +86,6 @@ def extract_snippets(message: str) -> List[CodeSnippet]:
8586

8687
# Find all code block matches
8788
for match in CODE_BLOCK_PATTERN.finditer(message):
88-
print("i try to extract snippet")
8989
matched_language = match.group("language") if match.group("language") else None
9090
filename = match.group("filename") if match.group("filename") else None
9191
content = match.group("content")
@@ -95,24 +95,23 @@ def extract_snippets(message: str) -> List[CodeSnippet]:
9595
# format ` ```python ` in output snippets
9696
if filename and not matched_language and "." not in filename:
9797
lang = filename
98-
print("lang is")
99-
print(lang)
10098
filename = None
10199
else:
102100
# Determine language from the message, either by the short
103101
# language identifier or by the filename
104102
lang = None
105103
if matched_language:
106-
print("i have a matched language")
107104
lang = ecosystem_from_message(matched_language.strip())
108105
if lang is None and filename:
109-
print("I try to get from filename")
110106
filename = filename.strip()
111107
# Determine language from the filename
112108
lang = ecosystem_from_filepath(filename)
109+
if lang is None:
110+
# try to guess it from the code
111+
lexer = guess_lexer(content)
112+
if lexer and lexer.name:
113+
lang = lexer.name.lower()
113114

114-
print("language is")
115-
print(lang)
116115
snippets.append(CodeSnippet(filepath=filename, code=content, language=lang))
117116

118117
return snippets

src/codegate/utils/package_extractor.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,21 +74,16 @@ class PackageExtractor:
7474

7575
@staticmethod
7676
def extract_packages(code: str, language_name: str) -> list[str]:
77-
print("packages are")
78-
print(code)
79-
print(language_name)
8077
if (code is None) or (language_name is None):
8178
return []
8279

8380
language_name = language_name.lower()
8481

8582
if language_name not in PackageExtractor.__languages.keys():
86-
print("no langauge")
8783
return []
8884

8985
language = PackageExtractor.__languages[language_name]
9086
parser = PackageExtractor.__parsers[language_name]
91-
print("here")
9287

9388
# Create tree
9489
tree = parser.parse(bytes(code, "utf8"))

0 commit comments

Comments
 (0)