Skip to content

Commit 3d26f2c

Browse files
author
Luke Hinds
committed
Implement PII
Implementation of story #394. We use spaCy (an industrial-strength Natural Language Processing (NLP) library) and Presidio to discover PII within code snippets and chat completions. Note: this is WIP, staged early to allow UI work to start. There are still some issues specific to Copilot that I need to fix, along with improving the delivery mechanism (pulling down the spaCy model).
1 parent e05f49a commit 3d26f2c

File tree

14 files changed

+2285
-67
lines changed

14 files changed

+2285
-67
lines changed

Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ COPY pyproject.toml poetry.lock* /app/
1818

1919
# Configure Poetry and install dependencies
2020
RUN poetry config virtualenvs.create false && \
21-
poetry install --no-dev
21+
poetry install --no-dev && \
22+
python -m spacy download en_core_web_sm
2223

2324
# Copy the rest of the application
2425
COPY . /app
Binary file not shown.

poetry.lock

Lines changed: 1113 additions & 62 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

prompts/default.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ secrets_redacted: |
3838
about any tokens, passwords or similar sensitive information in the context whose value begins with
3939
the string "REDACTED".
4040
41+
pii_redacted: |
42+
The context files contain redacted personally identifiable information (PII) that is represented by a UUID encased within <>. For example:
43+
- <123e4567-e89b-12d3-a456-426614174000>
44+
- <2d040296-98e9-4350-84be-fda4336057eb>
45+
If you encounter any PII redacted with a UUID, DO NOT WARN the user about it. Simply respond to the user request and keep the PII redacted and intact, using the same UUID.
46+
4147
# Security-focused prompts
4248
security_audit: "You are a security expert conducting a thorough code review. Identify potential security vulnerabilities, suggest improvements, and explain security best practices."
4349

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ sqlalchemy = "==2.0.37"
1919
aiosqlite = "==0.20.0"
2020
ollama = "==0.4.7"
2121
pydantic-settings = "==2.7.1"
22-
numpy = "==2.2.2"
22+
numpy = "==1.26.4"
2323
tree-sitter = "==0.24.0"
2424
tree-sitter-go = "==0.23.4"
2525
tree-sitter-java = "==0.23.5"
@@ -32,6 +32,9 @@ sqlite-vec-sl-tmp = "==0.0.4"
3232
greenlet = "==3.1.1"
3333
cachetools = "==5.5.1"
3434

35+
presidio = "^0.1.0"
36+
presidio-analyzer = "^2.2.357"
37+
presidio-anonymizer = "^2.2.357"
3538
[tool.poetry.group.dev.dependencies]
3639
pytest = "==8.3.4"
3740
pytest-cov = "==6.0.0"

src/codegate/codegate_logging.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,47 @@ def _missing_(cls, value: str) -> Optional["LogFormat"]:
4848
)
4949

5050

51+
# Define all LiteLLM logger names
52+
LITELLM_LOGGERS = ["LiteLLM Proxy", "LiteLLM Router", "LiteLLM"]
53+
54+
55+
def configure_litellm_logging(enabled: bool = False, level: LogLevel = LogLevel.INFO) -> None:
    """Enable or silence the LiteLLM loggers.

    The same configuration is applied to the ``"litellm"`` logger and then to every
    logger named in ``LITELLM_LOGGERS``.

    Args:
        enabled: When False, each logger is marked disabled and its level is raised
            above CRITICAL. When True, each logger gets the requested level, stops
            propagating, and has its handlers replaced by a single ``StreamHandler``.
        level: Log level to use if enabled. ``level.value`` is looked up as an
            attribute of the ``logging`` module.
    """
    for logger_name in ("litellm", *LITELLM_LOGGERS):
        target = logging.getLogger(logger_name)
        target.disabled = not enabled

        if not enabled:
            # One above CRITICAL: effectively disables all logging
            target.setLevel(logging.CRITICAL + 1)
            continue

        target.setLevel(getattr(logging, level.value))
        target.propagate = False
        # Clear any existing handlers, then add one to ensure logs are properly routed
        target.handlers.clear()
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(getattr(logging, level.value))
        target.addHandler(stream_handler)
90+
91+
5192
def add_origin(logger, log_method, event_dict):
5293
# Add 'origin' if it's bound to the logger but not explicitly in the event dict
5394
if "origin" not in event_dict and hasattr(logger, "_context"):
@@ -58,13 +99,17 @@ def add_origin(logger, log_method, event_dict):
5899

59100

60101
def setup_logging(
61-
log_level: Optional[LogLevel] = None, log_format: Optional[LogFormat] = None
102+
log_level: Optional[LogLevel] = None,
103+
log_format: Optional[LogFormat] = None,
104+
external_loggers: Optional[Dict[str, bool]] = None,
62105
) -> logging.Logger:
63106
"""Configure the logging system.
64107
65108
Args:
66109
log_level: The logging level to use. Defaults to INFO if not specified.
67110
log_format: The log format to use. Defaults to JSON if not specified.
111+
external_loggers: Dictionary of external logger names and whether they should be enabled.
112+
e.g. {"litellm": False, "sqlalchemy": False, "uvicorn.error": False}
68113
69114
This configures two handlers:
70115
- stderr_handler: For ERROR, CRITICAL, and WARNING messages
@@ -74,6 +119,16 @@ def setup_logging(
74119
log_level = LogLevel.INFO
75120
if log_format is None:
76121
log_format = LogFormat.JSON
122+
if external_loggers is None:
123+
external_loggers = {
124+
"litellm": False,
125+
"sqlalchemy": False,
126+
"uvicorn.error": False,
127+
"aiosqlite": False,
128+
}
129+
130+
# Configure LiteLLM logging based on external_loggers setting
131+
configure_litellm_logging(enabled=external_loggers.get("litellm", False), level=log_level)
77132

78133
# The configuration was taken from structlog documentation
79134
# https://www.structlog.org/en/stable/standard-library.html

src/codegate/pipeline/factory.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
from codegate.pipeline.extract_snippets.extract_snippets import CodeSnippetExtractor
88
from codegate.pipeline.extract_snippets.output import CodeCommentStep
99
from codegate.pipeline.output import OutputPipelineProcessor, OutputPipelineStep
10+
from codegate.pipeline.pii.pii import (
11+
CodegatePii,
12+
PiiRedactionNotifier,
13+
PiiUnRedactionStep,
14+
)
1015
from codegate.pipeline.secrets.manager import SecretsManager
1116
from codegate.pipeline.secrets.secrets import (
1217
CodegateSecrets,
@@ -22,11 +27,12 @@ def __init__(self, secrets_manager: SecretsManager):
2227

2328
def create_input_pipeline(self) -> SequentialPipelineProcessor:
2429
input_steps: List[PipelineStep] = [
25-
# make sure that this step is always first in the pipeline
30+
# make sure that these steps are always first in the pipeline
2631
# the other steps might send the request to a LLM for it to be analyzed
27-
# and without obfuscating the secrets, we'd leak the secrets during those
32+
# and without obfuscating the secrets/PII, we'd leak them during those
2833
# later steps
2934
CodegateSecrets(),
35+
CodegatePii(),
3036
CodegateCli(),
3137
CodeSnippetExtractor(),
3238
CodegateContextRetriever(),
@@ -37,13 +43,16 @@ def create_input_pipeline(self) -> SequentialPipelineProcessor:
3743
def create_fim_pipeline(self) -> SequentialPipelineProcessor:
    """Build the FIM pipeline: a ``CodegateSecrets`` step followed by a ``CodegatePii`` step.

    Returns:
        A ``SequentialPipelineProcessor`` created with ``is_fim=True`` and this
        factory's secrets manager.
    """
    redaction_steps: List[PipelineStep] = [CodegateSecrets(), CodegatePii()]
    return SequentialPipelineProcessor(redaction_steps, self.secrets_manager, is_fim=True)
4249

4350
def create_output_pipeline(self) -> OutputPipelineProcessor:
    """Build the output pipeline.

    Steps are registered in this order: secret redaction notifier, PII redaction
    notifier, secret un-redaction, PII un-redaction, code comment step.

    Returns:
        An ``OutputPipelineProcessor`` wrapping those steps.
    """
    ordered_steps: List[OutputPipelineStep] = []
    ordered_steps.append(SecretRedactionNotifier())
    ordered_steps.append(PiiRedactionNotifier())
    ordered_steps.append(SecretUnredactionStep())
    ordered_steps.append(PiiUnRedactionStep())
    ordered_steps.append(CodeCommentStep())
    return OutputPipelineProcessor(ordered_steps)

src/codegate/pipeline/pii/analyzer.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import uuid
2+
from typing import Any, Dict, List, Tuple
3+
4+
from presidio_analyzer import AnalyzerEngine
5+
from presidio_anonymizer import AnonymizerEngine
6+
7+
8+
class PiiSessionStore:
    """
    Lookup table between UUID placeholders and the PII values they stand for.

    Attributes:
        session_id (str): Identifier of the session. A random UUID string is generated
            when no (or an empty) identifier is supplied.
        mappings (Dict[str, str]): Maps each ``<uuid>`` placeholder to its original PII.
    """

    def __init__(self, session_id: str = None):
        if not session_id:
            session_id = str(uuid.uuid4())
        self.session_id = session_id
        self.mappings: Dict[str, str] = {}

    def add_mapping(self, pii: str) -> str:
        """Record ``pii`` under a newly generated ``<uuid>`` placeholder and return it."""
        placeholder = "<" + str(uuid.uuid4()) + ">"
        self.mappings[placeholder] = pii
        return placeholder

    def get_pii(self, uuid_placeholder: str) -> str:
        """Return the PII stored for the placeholder, or the placeholder if unknown."""
        try:
            return self.mappings[uuid_placeholder]
        except KeyError:
            return uuid_placeholder
37+
38+
39+
class PiiAnalyzer:
    """
    Detect PII in text and swap it for reversible UUID placeholders.

    Detection is delegated to a Presidio ``AnalyzerEngine`` whose NLP engine is built
    from ``spacy_config.yaml`` (expected next to this module). Each redacted value is
    recorded in ``session_store`` so that ``restore_pii`` can undo the redaction.

    Attributes:
        analyzer: Presidio analyzer engine used to locate PII spans.
        anonymizer: Presidio anonymizer engine (created here; not used by this class).
        session_store (PiiSessionStore): Placeholder-to-PII mappings. A single store is
            created per analyzer instance and every ``analyze`` call adds to it.
    """

    # Presidio entity types requested on every analyze() call.
    ENTITIES = (
        "PHONE_NUMBER",
        "EMAIL_ADDRESS",
        "CREDIT_CARD",
        "CRYPTO",
        "IBAN_CODE",
        "IP_ADDRESS",
        "NRP",
        "MEDICAL_LICENSE",
        "US_BANK_NUMBER",
        "US_DRIVER_LICENSE",
        "US_ITIN",
        "US_PASSPORT",
        "US_SSN",
        "UK_NHS",
        "UK_NINO",
    )

    def __init__(self):
        import os

        from presidio_analyzer.nlp_engine import NlpEngineProvider

        # Get the path to our custom spacy config
        current_dir = os.path.dirname(os.path.abspath(__file__))
        config_path = os.path.join(current_dir, "spacy_config.yaml")

        # Initialize the NLP engine with our custom configuration
        provider = NlpEngineProvider(conf_file=config_path)
        nlp_engine = provider.create_engine()

        # Create analyzer with custom NLP engine
        self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer = AnonymizerEngine()
        self.session_store = PiiSessionStore()

    @staticmethod
    def _select_spans(results: List[Any]) -> List[Any]:
        """Pick non-overlapping, non-empty detections and return them in text order.

        When detections overlap, the one with the higher score wins; ties go to the
        longer span, then to the earlier one.
        """
        ranked = sorted(results, key=lambda r: (-r.score, r.start - r.end, r.start))
        kept: List[Any] = []
        for candidate in ranked:
            if candidate.start >= candidate.end:
                continue
            if any(candidate.start < other.end and other.start < candidate.end for other in kept):
                continue
            kept.append(candidate)
        kept.sort(key=lambda r: r.start)
        return kept

    def analyze(self, text: str) -> Tuple[str, List[Dict[str, Any]], "PiiSessionStore"]:
        """Redact the PII found in ``text``.

        Only the exact spans reported by the analyzer are replaced. Replacement is done
        by offset rather than by searching for the PII value, so identical substrings
        elsewhere in the text and previously inserted placeholders are never touched.

        Args:
            text (str): The text to analyze for PII.

        Returns:
            Tuple[str, List[Dict[str, Any]], PiiSessionStore]: The anonymized text, one
            detail dict per redacted span (keys ``type``, ``value``, ``score``, ``start``,
            ``end``, ``uuid_placeholder``; offsets refer to the original ``text``) in text
            order, and the session store. If nothing is found, ``text`` is returned
            unchanged together with an empty list.
        """
        analyzer_results = self.analyzer.analyze(
            text=text, entities=list(self.ENTITIES), language="en"
        )

        found_pii: List[Dict[str, Any]] = []
        pieces: List[str] = []
        cursor = 0
        for result in self._select_spans(analyzer_results):
            pii_value = text[result.start : result.end]
            uuid_placeholder = self.session_store.add_mapping(pii_value)
            found_pii.append(
                {
                    "type": result.entity_type,
                    "value": pii_value,
                    "score": result.score,
                    "start": result.start,
                    "end": result.end,
                    "uuid_placeholder": uuid_placeholder,
                }
            )
            # Copy the untouched text before this span, then the placeholder.
            pieces.append(text[cursor : result.start])
            pieces.append(uuid_placeholder)
            cursor = result.end

        if not found_pii:
            return text, [], self.session_store

        pieces.append(text[cursor:])
        return "".join(pieces), found_pii, self.session_store

    def restore_pii(self, anonymized_text: str, session_store: "PiiSessionStore") -> str:
        """
        Restore the original PII (Personally Identifiable Information) in the given text.

        Every placeholder recorded in ``session_store.mappings`` that occurs in
        ``anonymized_text`` is replaced by its original value.

        Args:
            anonymized_text (str): The text containing placeholders for PII.
            session_store (PiiSessionStore): The session store containing mappings of
                placeholders to original PII.

        Returns:
            str: The text with the original PII restored.
        """
        for uuid_placeholder, original_pii in session_store.mappings.items():
            anonymized_text = anonymized_text.replace(uuid_placeholder, original_pii)
        return anonymized_text

src/codegate/pipeline/pii/manager.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from typing import Any, Dict, List, Tuple
2+
3+
import structlog
4+
5+
from codegate.pipeline.pii.analyzer import PiiAnalyzer, PiiSessionStore
6+
7+
logger = structlog.get_logger("codegate")
8+
9+
10+
class PiiManager:
    """
    Manages the analysis and restoration of Personally Identifiable Information (PII) in text.

    Attributes:
        analyzer (PiiAnalyzer): Used for PII detection, redaction and restoration.
        current_session (PiiSessionStore): Session store returned by the most recent
            ``analyze`` call; ``None`` until ``analyze`` has been called.
    """

    def __init__(self):
        self.analyzer = PiiAnalyzer()
        self.current_session: PiiSessionStore = None

    def analyze(self, text: str) -> Tuple[str, List[Dict[str, Any]]]:
        """Anonymize ``text`` and log one masked entry per detected PII item.

        Args:
            text (str): The text to be analyzed for PII.

        Returns:
            Tuple[str, List[Dict[str, Any]]]: The anonymized text and the list of found
            PII details as produced by the analyzer.
        """
        anonymized_text, found_pii, self.current_session = self.analyzer.analyze(text)

        for pii in found_pii:
            logger.info(
                "PII detected",
                pii_type=pii["type"],
                value="*" * len(pii["value"]),  # Don't log actual value
                score=f"{pii['score']:.2f}",
            )

        return anonymized_text, found_pii

    def restore_pii(self, anonymized_text: str) -> str:
        """Restore the PII in ``anonymized_text`` using the current session.

        Args:
            anonymized_text (str): The text with anonymized PII to be restored.

        Returns:
            str: The text with restored PII, or ``anonymized_text`` unchanged when no
            session exists yet.
        """
        if self.current_session is None:
            logger.warning("No active PII session found. Unable to restore PII.")
            return anonymized_text

        # Never log the text itself: the restored text contains the raw PII that
        # analyze() takes care to mask, so only sizes are recorded.
        logger.debug("Restoring PII from session.", anonymized_length=len(anonymized_text))
        restored_text = self.analyzer.restore_pii(anonymized_text, self.current_session)
        logger.debug("PII restored.", restored_length=len(restored_text))
        return restored_text

0 commit comments

Comments
 (0)