Add RAG chunks in query response

Anxhela21 · Anxhela21 · commit ef94176582cd · 2025-09-16T12:41:51.000-04:00
Signed-off-by: Anxhela Coba <acoba@redhat.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -45,11 +45,26 @@ dependencies = [
     "email-validator>=2.2.0",
     "openai==1.99.9",
     # Used by database interface
-    "sqlalchemy>=2.0.42",
+    "sqlalchemy>=2.0.41",
     # Used by Llama Stack version checker
     "semver<4.0.0",
     # Used by authorization resolvers
     "jsonpath-ng>=1.6.1",
+    "opentelemetry-sdk>=1.34.0",
+    "opentelemetry-exporter-otlp>=1.34.0",
+    "opentelemetry-instrumentation>=0.55b0",
+    "aiosqlite>=0.21.0",
+    "litellm>=1.72.1",
+    "blobfile>=3.0.0",
+    "datasets>=3.6.0",
+    "faiss-cpu>=1.11.0",
+    "mcp>=1.9.4",
+    "autoevals>=0.0.129",
+    "psutil>=7.0.0",
+    "torch>=2.7.1",
+    "peft>=0.15.2",
+    "trl>=0.18.2",
+    "sentence-transformers>=5.1.0",
 ]
 
 
diff --git a/run.yaml b/run.yaml
@@ -60,6 +60,9 @@ providers:
     provider_id: meta-reference
     provider_type: inline::meta-reference
   inference:
+    - provider_id: sentence-transformers # Can be any embedding provider
+      provider_type: inline::sentence-transformers
+      config: {}
     - provider_id: openai
       provider_type: remote::openai
       config:
@@ -99,14 +102,17 @@ providers:
     - provider_id: model-context-protocol
       provider_type: remote::model-context-protocol
       config: {}
+    - provider_id: rag-runtime
+      provider_type: inline::rag-runtime
+      config: {}
   vector_io:
   - config:
       kvstore:
-        db_path: .llama/distributions/ollama/faiss_store.db
+        db_path: /path/to/your/vector/store.db
         namespace: null
         type: sqlite
-    provider_id: faiss
-    provider_type: inline::faiss
+    provider_id: my_vector_db
+    provider_type: inline::faiss # Or preferred vector DB
 scoring_fns: []
 server:
   auth: null
@@ -117,10 +123,23 @@ server:
   tls_certfile: null
   tls_keyfile: null
 shields: []
-vector_dbs: []
-
+vector_dbs:
+  - vector_db_id: my_knowledge_base
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    provider_id: my_vector_db
 models:
+  - metadata:
+      embedding_dimension: 768 # Depends on chosen model
+    model_id: sentence-transformers/all-mpnet-base-v2 # Example model
+    provider_id: sentence-transformers
+    provider_model_id: path/to/model
+    model_type: embedding
   - model_id: gpt-4-turbo
     provider_id: openai
     model_type: llm
     provider_model_id: gpt-4-turbo
+
+tool_groups:
+  - toolgroup_id: builtin::rag
+    provider_id: rag-runtime
diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
@@ -30,7 +30,7 @@
 from models.config import Action
 from models.database.conversations import UserConversation
 from models.requests import QueryRequest, Attachment
-from models.responses import QueryResponse, UnauthorizedResponse, ForbiddenResponse
+from models.responses import QueryResponse, UnauthorizedResponse, ForbiddenResponse, RAGChunk, ReferencedDocument, ToolCall
 from utils.endpoints import (
     check_configuration_loaded,
     get_agent,
@@ -243,17 +243,61 @@ async def query_endpoint_handler(
                 attachments=query_request.attachments or [],
             )
 
+        logger.info("Persisting conversation details...")
         persist_user_conversation_details(
             user_id=user_id,
             conversation_id=conversation_id,
             model=model_id,
             provider_id=provider_id,
         )
 
-        return QueryResponse(
+        # Convert tool calls and RAG chunks to response format
+        logger.info("Processing tool calls...")
+        tool_calls = [
+            ToolCall(
+                tool_name=tc.name,
+                arguments=tc.args if isinstance(tc.args, dict) else {"query": str(tc.args)},
+                result={"response": tc.response} if tc.response else None
+            )
+            for tc in summary.tool_calls
+        ]
+
+        
+        logger.info("Processing RAG chunks...")
+        rag_chunks = [
+            RAGChunk(
+                content=chunk.content,
+                source=chunk.source,
+                score=chunk.score
+            )
+            for chunk in summary.rag_chunks
+        ]
+        
+        # Extract referenced documents from RAG chunks
+        logger.info("Extracting referenced documents...")
+        referenced_docs = []
+        doc_sources = set()
+        for chunk in summary.rag_chunks:
+            if chunk.source and chunk.source not in doc_sources:
+                doc_sources.add(chunk.source)
+                referenced_docs.append(
+                    ReferencedDocument(
+                        url=chunk.source if chunk.source.startswith("http") else None,
+                        title=chunk.source,
+                        chunk_count=sum(1 for c in summary.rag_chunks if c.source == chunk.source)
+                    )
+                )
+
+        logger.info("Building final response...")
+        response = QueryResponse(
             conversation_id=conversation_id,
             response=summary.llm_response,
+            rag_chunks=rag_chunks if rag_chunks else None,
+            referenced_documents=referenced_docs if referenced_docs else None,
+            tool_calls=tool_calls if tool_calls else None,
         )
+        logger.info("Query processing completed successfully!")
+        return response
 
     # connection to Llama Stack server
     except APIConnectionError as e:
diff --git a/src/models/responses.py b/src/models/responses.py
@@ -1,6 +1,6 @@
 """Models for REST API responses."""
 
-from typing import Any, Optional
+from typing import Any, Optional, List
 
 from pydantic import BaseModel, Field
 
@@ -34,23 +34,45 @@ class ModelsResponse(BaseModel):
     )
 
 
-# TODO(lucasagomes): a lot of fields to add to QueryResponse. For now
-# we are keeping it simple. The missing fields are:
-# - referenced_documents: The optional URLs and titles for the documents used
-#   to generate the response.
-# - truncated: Set to True if conversation history was truncated to be within context window.
-# - input_tokens: Number of tokens sent to LLM
-# - output_tokens: Number of tokens received from LLM
-# - available_quotas: Quota available as measured by all configured quota limiters
-# - tool_calls: List of tool requests.
-# - tool_results: List of tool results.
-# See LLMResponse in ols-service for more details.
+class RAGChunk(BaseModel):  # REST response model for one retrieved chunk
+    """Model representing a RAG chunk used in the response."""
+    
+    content: str = Field(description="The content of the chunk")  # required; no default
+    source: Optional[str] = Field(None, description="Source document or URL")  # None when unknown
+    score: Optional[float] = Field(None, description="Relevance score")  # None when absent; scale is not defined here
+
+
+class ReferencedDocument(BaseModel):  # one entry per distinct chunk source in the query endpoint
+    """Model representing a document referenced in the response."""
+    
+    url: Optional[str] = Field(None, description="URL of the document")  # query endpoint sets it only for sources starting with "http"
+    title: Optional[str] = Field(None, description="Title of the document")  # query endpoint passes the raw chunk source here
+    chunk_count: Optional[int] = Field(None, description="Number of chunks from this document")  # chunks sharing the same source
+
+
+class ToolCall(BaseModel):  # REST response model for one tool invocation
+    """Model representing a tool call made during response generation."""
+    
+    tool_name: str = Field(description="Name of the tool called")  # required; no default
+    arguments: dict[str, Any] = Field(description="Arguments passed to the tool")  # query endpoint wraps non-dict args as {"query": str(args)}
+    result: Optional[dict[str, Any]] = Field(None, description="Result from the tool")  # query endpoint sends {"response": ...} or None when empty
+
+
 class QueryResponse(BaseModel):
     """Model representing LLM response to a query.
 
     Attributes:
         conversation_id: The optional conversation ID (UUID).
         response: The response.
+        rag_chunks: List of RAG chunks used to generate the response.
+        referenced_documents: List of documents referenced in the response.
+        tool_calls: List of tool calls made during response generation.
+        TODO: truncated: Whether conversation history was truncated.
+        TODO: input_tokens: Number of tokens sent to LLM.
+        TODO: output_tokens: Number of tokens received from LLM.
+        TODO: available_quotas: Quota available as measured by all configured quota limiters
+        TODO: tool_results: List of tool results.
+
     """
 
     conversation_id: Optional[str] = Field(
@@ -66,13 +88,48 @@ class QueryResponse(BaseModel):
         ],
     )
 
+    rag_chunks: Optional[List[RAGChunk]] = Field(
+        None,
+        description="List of RAG chunks used to generate the response",
+    )
+
+    referenced_documents: Optional[List[ReferencedDocument]] = Field(
+        None,
+        description="List of documents referenced in the response",
+    )
+
+    tool_calls: Optional[List[ToolCall]] = Field(
+        None,
+        description="List of tool calls made during response generation",
+    )
     # provides examples for /docs endpoint
     model_config = {
         "json_schema_extra": {
             "examples": [
                 {
                     "conversation_id": "123e4567-e89b-12d3-a456-426614174000",
                     "response": "Operator Lifecycle Manager (OLM) helps users install...",
+                    "rag_chunks": [
+                        {
+                            "content": "OLM is a component of the Operator Framework toolkit...",
+                            "source": "kubernetes-docs/operators.md",
+                            "score": 0.95
+                        }
+                    ],
+                    "referenced_documents": [
+                        {
+                            "url": "https://kubernetes.io/docs/concepts/extend-kubernetes/operator/",
+                            "title": "Operator Pattern",
+                            "chunk_count": 2
+                        }
+                    ],
+                    "tool_calls": [
+                        {
+                            "tool_name": "knowledge_search",
+                            "arguments": {"query": "operator lifecycle manager"},
+                            "result": {"chunks_found": 5}
+                        }
+                    ],
                 }
             ]
         }
diff --git a/src/utils/types.py b/src/utils/types.py
@@ -1,7 +1,7 @@
 """Common types for the project."""
 
-from typing import Any, Optional
-
+from typing import Any, Optional, List
+import json
 from llama_stack_client.lib.agents.event_logger import interleaved_content_as_str
 from llama_stack_client.lib.agents.tool_parser import ToolParser
 from llama_stack_client.types.shared.completion_message import CompletionMessage
@@ -56,23 +56,86 @@ class ToolCallSummary(BaseModel):
     response: str | None
 
 
+class RAGChunkData(BaseModel):  # internal counterpart of the response-level chunk model
+    """RAG chunk data extracted from tool responses."""
+    
+    content: str  # chunk text; the extractor falls back to "" or the raw response text
+    source: Optional[str] = None  # set to "knowledge_search" when the response is not JSON
+    score: Optional[float] = None  # left as None when the tool response carries no score
+
+
 class TurnSummary(BaseModel):
     """Summary of a turn in llama stack."""
 
     llm_response: str
     tool_calls: list[ToolCallSummary]
+    rag_chunks: List[RAGChunkData] = []  # appended to by _extract_rag_chunks_from_response
 
     def append_tool_calls_from_llama(self, tec: ToolExecutionStep) -> None:
         """Append the tool calls from a llama tool execution step."""
         calls_by_id = {tc.call_id: tc for tc in tec.tool_calls}
         responses_by_id = {tc.call_id: tc for tc in tec.tool_responses}
         for call_id, tc in calls_by_id.items():
             resp = responses_by_id.get(call_id)
+            response_content = interleaved_content_as_str(resp.content) if resp else None  # reused for the summary and for chunk extraction
+            
             self.tool_calls.append(
                 ToolCallSummary(
                     id=call_id,
                     name=tc.tool_name,
                     args=tc.arguments,
-                    response=interleaved_content_as_str(resp.content) if resp else None,
+                    response=response_content,
                 )
             )
+            
+            # Only knowledge_search calls with a non-empty response are parsed for RAG chunks
+            if tc.tool_name == "knowledge_search" and resp and response_content:
+                self._extract_rag_chunks_from_response(response_content)
+    
+    def _extract_rag_chunks_from_response(self, response_content: str) -> None:
+        """Extract RAG chunks from tool response content."""
+        try:
+            # Accepted JSON shapes: {"chunks": [...]} or a top-level list of dicts
+            # Any other JSON value (scalar, dict without "chunks") yields no chunks
+            try:
+                data = json.loads(response_content)
+                if isinstance(data, dict) and "chunks" in data:
+                    for chunk in data["chunks"]:  # NOTE(review): non-dict items are not filtered here; .get() then raises into the outer except
+                        self.rag_chunks.append(
+                            RAGChunkData(
+                                content=chunk.get("content", ""),  # missing "content" becomes an empty string
+                                source=chunk.get("source"),
+                                score=chunk.get("score")
+                            )
+                        )
+                elif isinstance(data, list):
+                    # Top-level list: non-dict items are skipped silently
+                    for chunk in data:
+                        if isinstance(chunk, dict):
+                            self.rag_chunks.append(
+                                RAGChunkData(
+                                    content=chunk.get("content", str(chunk)),  # missing "content" falls back to str() of the whole dict
+                                    source=chunk.get("source"),
+                                    score=chunk.get("score")
+                                )
+                            )
+            except json.JSONDecodeError:
+                # Not JSON: keep the whole non-blank response as a single chunk
+                if response_content.strip():
+                    self.rag_chunks.append(
+                        RAGChunkData(
+                            content=response_content,
+                            source="knowledge_search",
+                            score=None
+                        )
+                    )
+        except Exception:  # NOTE(review): broad catch; chunks appended before the failure are kept
+            # Fallback: also record the whole non-blank response as a single chunk
+            if response_content.strip():
+                self.rag_chunks.append(
+                    RAGChunkData(
+                        content=response_content,
+                        source="knowledge_search",
+                        score=None
+                    )
+                )
diff --git a/uv.lock b/uv.lock