
Commit 5a51e0c

Moved token recording to DB
1 parent 6b8c3b1 commit 5a51e0c

File tree

8 files changed: +300 -97 lines changed
Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+"""add token usage columns
+
+Revision ID: 0c3539f66339
+Revises: 0f9b8edc8e46
+Create Date: 2025-01-28 09:15:54.767311+00:00
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "0c3539f66339"
+down_revision: Union[str, None] = "0f9b8edc8e46"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Begin transaction
+    op.execute("BEGIN TRANSACTION;")
+
+    # We add the columns to the outputs table.
+    # Add the columns with default values to avoid issues with the existing data.
+    # The prices of the tokens may change in the future,
+    # so we need to store the cost of the tokens at the time of the request.
+    op.execute("ALTER TABLE outputs ADD COLUMN input_tokens INT DEFAULT NULL;")
+    op.execute("ALTER TABLE outputs ADD COLUMN output_tokens INT DEFAULT NULL;")
+    op.execute("ALTER TABLE outputs ADD COLUMN input_cost FLOAT DEFAULT NULL;")
+    op.execute("ALTER TABLE outputs ADD COLUMN output_cost FLOAT DEFAULT NULL;")
+
+    # Finish transaction
+    op.execute("COMMIT;")
+
+
+def downgrade() -> None:
+    # Begin transaction
+    op.execute("BEGIN TRANSACTION;")
+
+    op.execute("ALTER TABLE outputs DROP COLUMN input_tokens;")
+    op.execute("ALTER TABLE outputs DROP COLUMN output_tokens;")
+    op.execute("ALTER TABLE outputs DROP COLUMN input_cost;")
+    op.execute("ALTER TABLE outputs DROP COLUMN output_cost;")
+
+    # Finish transaction
+    op.execute("COMMIT;")

src/codegate/api/v1.py

Lines changed: 16 additions & 13 deletions

@@ -479,16 +479,19 @@ def version_check():
 )
 async def get_workspace_token_usage(workspace_name: str) -> v1_models.TokenUsageAggregate:
     """Get the token usage of a workspace."""
-    # TODO: This is a dummy implementation. In the future, we should have a proper
-    # implementation that fetches the token usage from the database.
-    return v1_models.TokenUsageAggregate(
-        used_tokens=50,
-        tokens_by_model=[
-            v1_models.TokenUsageByModel(
-                provider_type="openai", model="gpt-4o-mini", used_tokens=20
-            ),
-            v1_models.TokenUsageByModel(
-                provider_type="anthropic", model="claude-3-5-sonnet-20241022", used_tokens=30
-            ),
-        ],
-    )
+    try:
+        ws = await wscrud.get_workspace_by_name(workspace_name)
+    except crud.WorkspaceDoesNotExistError:
+        raise HTTPException(status_code=404, detail="Workspace does not exist")
+    except Exception:
+        logger.exception("Error while getting workspace")
+        raise HTTPException(status_code=500, detail="Internal server error")
+
+    try:
+        prompts_outputs = await dbreader.get_prompts_with_output(ws.id)
+        ws_token_usage = await v1_processing.parse_workspace_token_usage(prompts_outputs)
+        return ws_token_usage
+    except Exception:
+        logger.exception("Error while getting messages")
+        raise HTTPException(status_code=500, detail="Internal server error")
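With the hard-coded stub gone, the endpoint returns 404 for unknown workspaces and real aggregated usage otherwise. A hypothetical client call: the route path and port are not visible in this hunk, so both are assumptions based on the handler name.

import requests

BASE = "http://localhost:8989/api/v1"  # assumed CodeGate API address

resp = requests.get(f"{BASE}/workspaces/default/token-usage")  # assumed route
if resp.status_code == 404:
    print("Workspace does not exist")
else:
    resp.raise_for_status()
    # TokenUsageAggregate: per-model breakdown plus overall token_usage totals
    print(resp.json())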

src/codegate/api/v1_models.py

Lines changed: 15 additions & 47 deletions

@@ -3,15 +3,10 @@
 from typing import Any, Dict, List, Optional, Union

 import pydantic
-import requests
-from cachetools import TTLCache

 from codegate.db import models as db_models
 from codegate.pipeline.base import CodeSnippet

-# 1 day cache. Not keep all the models in the cache. Just the ones we have used recently.
-model_cost_cache = TTLCache(maxsize=2000, ttl=1 * 24 * 60 * 60)
-

 class Workspace(pydantic.BaseModel):
     name: str
@@ -118,46 +113,8 @@ class ProviderType(str, Enum):
     openai = "openai"
     anthropic = "anthropic"
     vllm = "vllm"
-
-
-class TokenUsage(pydantic.BaseModel):
-    input_tokens: int = 0
-    output_tokens: int = 0
-    input_cost: float = 0
-    output_cost: float = 0
-
-    @classmethod
-    def from_dict(cls, usage_dict: Dict) -> "TokenUsage":
-        return cls(
-            input_tokens=usage_dict.get("prompt_tokens", 0) or usage_dict.get("input_tokens", 0),
-            output_tokens=usage_dict.get("completion_tokens", 0)
-            or usage_dict.get("output_tokens", 0),
-            input_cost=0,
-            output_cost=0,
-        )
-
-    def __add__(self, other: "TokenUsage") -> "TokenUsage":
-        return TokenUsage(
-            input_tokens=self.input_tokens + other.input_tokens,
-            output_tokens=self.output_tokens + other.output_tokens,
-            input_cost=self.input_cost + other.input_cost,
-            output_cost=self.output_cost + other.output_cost,
-        )
-
-    def update_token_cost(self, model: str) -> None:
-        if not model_cost_cache:
-            model_cost = requests.get(
-                "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
-            )
-            model_cost_cache.update(model_cost.json())
-        model_cost = model_cost_cache.get(model, {})
-        input_cost_per_token = model_cost.get("input_cost_per_token", 0)
-        output_cost_per_token = model_cost.get("output_cost_per_token", 0)
-        self.input_cost = self.input_tokens * input_cost_per_token
-        self.output_cost = self.output_tokens * output_cost_per_token
-
-    def update_costs_based_on_model(self, model: str):
-        pass
+    llamacpp = "llamacpp"
+    ollama = "ollama"


@@ -167,7 +124,7 @@ class TokenUsageByModel(pydantic.BaseModel):

     provider_type: ProviderType
     model: str
-    token_usage: TokenUsage
+    token_usage: db_models.TokenUsage


@@ -177,9 +134,20 @@ class TokenUsageAggregate(pydantic.BaseModel):
     """

     tokens_by_model: Dict[str, TokenUsageByModel]
-    token_usage: TokenUsage
+    token_usage: db_models.TokenUsage

     def add_model_token_usage(self, model_token_usage: TokenUsageByModel) -> None:
+        # Copilot doesn't have a model name and we cannot obtain the tokens used. Skip it.
+        if model_token_usage.model == "":
+            return
+
+        # Skip if the model has not used any tokens.
+        if (
+            model_token_usage.token_usage.input_tokens == 0
+            and model_token_usage.token_usage.output_tokens == 0
+        ):
+            return
+
         if model_token_usage.model in self.tokens_by_model:
             self.tokens_by_model[
                 model_token_usage.model
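These hunks retype token_usage as db_models.TokenUsage, and v1_processing.py below calls TokenUsage.from_db, but src/codegate/db/models.py (also part of this commit) is not rendered on this page. A minimal sketch of the shape the rendered hunks appear to rely on; the field names come from the diffs above, while from_db and __add__ are inferred and may differ from the real definition:

from typing import Optional

import pydantic


class TokenUsage(pydantic.BaseModel):
    input_tokens: int = 0
    output_tokens: int = 0
    input_cost: float = 0
    output_cost: float = 0

    @classmethod
    def from_db(
        cls,
        input_tokens: Optional[int],
        output_tokens: Optional[int],
        input_cost: Optional[float],
        output_cost: Optional[float],
    ) -> "TokenUsage":
        # The new columns are nullable, so coalesce NULLs to zero.
        return cls(
            input_tokens=input_tokens or 0,
            output_tokens=output_tokens or 0,
            input_cost=input_cost or 0,
            output_cost=output_cost or 0,
        )

    def __add__(self, other: "TokenUsage") -> "TokenUsage":
        # Speculative: likely used when summing per-row usage into the aggregate.
        return TokenUsage(
            input_tokens=self.input_tokens + other.input_tokens,
            output_tokens=self.output_tokens + other.output_tokens,
            input_cost=self.input_cost + other.input_cost,
            output_cost=self.output_cost + other.output_cost,
        )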

src/codegate/api/v1_processing.py

Lines changed: 50 additions & 28 deletions

@@ -14,12 +14,11 @@
     PartialQuestionAnswer,
     PartialQuestions,
     QuestionAnswer,
-    TokenUsage,
     TokenUsageAggregate,
     TokenUsageByModel,
 )
 from codegate.db.connection import alert_queue
-from codegate.db.models import Alert, GetPromptWithOutputsRow
+from codegate.db.models import Alert, GetPromptWithOutputsRow, TokenUsage

 logger = structlog.get_logger("codegate")

@@ -103,55 +102,54 @@ async def parse_request(request_str: str) -> Tuple[Optional[List[str]], str]:
     return messages, model


-async def parse_output(output_str: str) -> Tuple[Optional[str], TokenUsage]:
+async def parse_output(output_str: str) -> Optional[str]:
     """
     Parse the output string from the pipeline and return the message.
     """
     try:
         if output_str is None:
-            return None, TokenUsage()
+            return None

         output = json.loads(output_str)
     except Exception as e:
         logger.warning(f"Error parsing output: {output_str}. {e}")
-        return None, TokenUsage()
+        return None

-    def _parse_single_output(single_output: dict) -> Tuple[str, TokenUsage]:
+    def _parse_single_output(single_output: dict) -> str:
         single_output_message = ""
         for choice in single_output.get("choices", []):
             if not isinstance(choice, dict):
                 continue
             content_dict = choice.get("delta", {}) or choice.get("message", {})
             single_output_message += content_dict.get("content", "")
-        return single_output_message, TokenUsage.from_dict(single_output.get("usage", {}))
+        return single_output_message

     full_output_message = ""
-    full_token_usage = TokenUsage()
     if isinstance(output, list):
         for output_chunk in output:
             output_message = ""
-            token_usage = TokenUsage()
             if isinstance(output_chunk, dict):
-                output_message, token_usage = _parse_single_output(output_chunk)
+                output_message = _parse_single_output(output_chunk)
             elif isinstance(output_chunk, str):
                 try:
                     output_decoded = json.loads(output_chunk)
-                    output_message, token_usage = _parse_single_output(output_decoded)
+                    output_message = _parse_single_output(output_decoded)
                 except Exception:
                     logger.error(f"Error reading chunk: {output_chunk}")
             else:
                 logger.warning(
                     f"Could not handle output: {output_chunk}", out_type=type(output_chunk)
                 )
             full_output_message += output_message
-            full_token_usage += token_usage
     elif isinstance(output, dict):
-        full_output_message, full_token_usage = _parse_single_output(output)
+        full_output_message = _parse_single_output(output)

-    return full_output_message, full_token_usage
+    return full_output_message


-async def _get_question_answer(row: GetPromptWithOutputsRow) -> Optional[PartialQuestionAnswer]:
+async def _get_partial_question_answer(
+    row: GetPromptWithOutputsRow,
+) -> Optional[PartialQuestionAnswer]:
     """
     Parse a row from the get_prompt_with_outputs query and return a PartialConversation
@@ -162,7 +160,7 @@ async def _get_question_answer(row: GetPromptWithOutputsRow) -> Optional[Partial
         output_task = tg.create_task(parse_output(row.output))

     request_user_msgs, model = request_task.result()
-    output_msg_str, token_usage = output_task.result()
+    output_msg_str = output_task.result()

     # If we couldn't parse the request, return None
     if not request_user_msgs:
@@ -184,8 +182,13 @@ async def _get_question_answer(row: GetPromptWithOutputsRow) -> Optional[Partial
     else:
         output_message = None

+    token_usage = TokenUsage.from_db(
+        input_cost=row.input_cost,
+        input_tokens=row.input_tokens,
+        output_tokens=row.output_tokens,
+        output_cost=row.output_cost,
+    )
     # Use the model to update the token cost
-    token_usage.update_token_cost(model)
     provider = row.provider
     # TODO: This should come from the database. For now, we are manually changing copilot to openai
     # Change copilot provider to openai
@@ -297,7 +300,8 @@ def _get_question_answer_from_partial(
     partial_question_answer: PartialQuestionAnswer,
 ) -> QuestionAnswer:
     """
-    Get a QuestionAnswer object from a PartialQuestionAnswer object.
+    Get a QuestionAnswer object from a PartialQuestionAnswer object. PartialQuestionAnswer
+    contains a list of messages as question. QuestionAnswer contains a single message as question.
     """
     # Get the last user message as the question
     question = ChatMessage(
@@ -315,11 +319,8 @@ async def match_conversations(
     """
     Match partial conversations to form a complete conversation.
     """
-    valid_partial_qas = [
-        partial_qas for partial_qas in partial_question_answers if partial_qas is not None
-    ]
     grouped_partial_questions = _group_partial_messages(
-        [partial_qs_a.partial_questions for partial_qs_a in valid_partial_qas]
+        [partial_qs_a.partial_questions for partial_qs_a in partial_question_answers]
     )

     # Create the conversation objects
@@ -333,7 +334,7 @@
         # Partial questions don't contain the answer, so we need to find the corresponding
         # valid partial question answer
         selected_partial_qa = None
-        for partial_qa in valid_partial_qas:
+        for partial_qa in partial_question_answers:
             if partial_question.message_id == partial_qa.partial_questions.message_id:
                 selected_partial_qa = partial_qa
                 break
@@ -367,17 +368,25 @@
     return conversations, map_q_id_to_conversation


+async def _process_prompt_output_to_partial_qa(
+    prompts_outputs: List[GetPromptWithOutputsRow],
+) -> List[PartialQuestionAnswer]:
+    """
+    Process the prompts and outputs to PartialQuestionAnswer objects.
+    """
+    # Parse the prompts and outputs in parallel
+    async with asyncio.TaskGroup() as tg:
+        tasks = [tg.create_task(_get_partial_question_answer(row)) for row in prompts_outputs]
+    return [task.result() for task in tasks if task.result() is not None]
+
+
 async def parse_messages_in_conversations(
     prompts_outputs: List[GetPromptWithOutputsRow],
 ) -> Tuple[List[Conversation], Dict[str, Conversation]]:
     """
     Get all the messages from the database and return them as a list of conversations.
     """
-
-    # Parse the prompts and outputs in parallel
-    async with asyncio.TaskGroup() as tg:
-        tasks = [tg.create_task(_get_question_answer(row)) for row in prompts_outputs]
-        partial_question_answers = [task.result() for task in tasks]
+    partial_question_answers = await _process_prompt_output_to_partial_qa(prompts_outputs)

     conversations, map_q_id_to_conversation = await match_conversations(partial_question_answers)
     return conversations, map_q_id_to_conversation
@@ -430,3 +439,16 @@ async def parse_get_alert_conversation(
         for row in alerts
     ]
     return [task.result() for task in tasks if task.result() is not None]
+
+
+async def parse_workspace_token_usage(
+    prompts_outputs: List[GetPromptWithOutputsRow],
+) -> TokenUsageAggregate:
+    """
+    Parse the token usage from the workspace.
+    """
+    partial_question_answers = await _process_prompt_output_to_partial_qa(prompts_outputs)
+    token_usage_agg = TokenUsageAggregate(tokens_by_model={}, token_usage=TokenUsage())
+    for p_qa in partial_question_answers:
+        token_usage_agg.add_model_token_usage(p_qa.model_token_usage)
+    return token_usage_agg
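The refactor moves the TaskGroup into _process_prompt_output_to_partial_qa and drops None results there, which is why match_conversations no longer needs its valid_partial_qas filtering pass. A self-contained sketch of that pattern (requires Python 3.11+ for asyncio.TaskGroup; the names are illustrative, not from the commit):

import asyncio


async def parse(n: int) -> int | None:
    await asyncio.sleep(0)  # stand-in for the real parse work
    return n if n % 2 else None  # None mimics a row that failed to parse


async def main() -> None:
    # Tasks run concurrently; the TaskGroup waits for all of them on exit.
    async with asyncio.TaskGroup() as tg:
        tasks = [tg.create_task(parse(n)) for n in range(5)]
    # Filter failed rows once, here, instead of in every downstream consumer.
    print([t.result() for t in tasks if t.result() is not None])  # [1, 3]


asyncio.run(main())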
