Merge branch 'master' into grafana-dashboard-ts

Sheeproid · Sheeproid · commit 94241cb07a51 · 2025-10-27T10:18:24.000+02:00
diff --git a/README.md b/README.md
@@ -7,6 +7,8 @@ HolmesGPT is an AI agent for investigating problems in your cloud, finding the r
 
 Find more about HolmesGPT's maintainers and adopters [here](./ADOPTERS.md).
 
+📚 **[Read the full documentation at holmesgpt.dev](https://holmesgpt.dev/)** for installation guides, tutorials, API reference, and more.
+
   <p align="center">
     <a href="#how-it-works"><strong>How it Works</strong></a> |
     <a href="#installation"><strong>Installation</strong></a> |
diff --git a/holmes/core/supabase_dal.py b/holmes/core/supabase_dal.py
@@ -6,6 +6,7 @@
 import os
 import threading
 from datetime import datetime, timedelta
+from enum import Enum
 from typing import Dict, List, Optional, Tuple
 from uuid import uuid4
 
@@ -53,6 +54,11 @@
 ENRICHMENT_BLACKLIST_SET = set(ENRICHMENT_BLACKLIST)
 
 
+class FindingType(str, Enum):
+    ISSUE = "issue"
+    CONFIGURATION_CHANGE = "configuration_change"
+
+
 class RobustaToken(BaseModel):
     store_url: str
     api_key: str
@@ -237,14 +243,15 @@ def get_resource_recommendation(
             logging.exception("Supabase error while retrieving efficiency data")
             return None
 
-    def get_configuration_changes_metadata(
+    def get_issues_metadata(
         self,
         start_datetime: str,
         end_datetime: str,
         limit: int = 100,
         workload: Optional[str] = None,
         ns: Optional[str] = None,
         cluster: Optional[str] = None,
+        finding_type: FindingType = FindingType.CONFIGURATION_CHANGE,
     ) -> Optional[List[Dict]]:
         if not self.enabled:
             return []
@@ -265,12 +272,12 @@ def get_configuration_changes_metadata(
                 )
                 .eq("account_id", self.account_id)
                 .eq("cluster", cluster)
-                .eq("finding_type", "configuration_change")
                 .gte("creation_date", start_datetime)
                 .lte("creation_date", end_datetime)
                 .limit(limit)
             )
 
+            query = query.eq("finding_type", finding_type.value)
             if workload:
                 query.eq("subject_name", workload)
             if ns:
diff --git a/holmes/main.py b/holmes/main.py
@@ -1,18 +1,15 @@
 # ruff: noqa: E402
 import os
-import sys
 
 from holmes.utils.cert_utils import add_custom_certificate
-from holmes.utils.colors import USER_COLOR
 
 ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "")
 if add_custom_certificate(ADDITIONAL_CERTIFICATE):
     print("added custom certificate")
 
 # DO NOT ADD ANY IMPORTS OR CODE ABOVE THIS LINE
 # IMPORTING ABOVE MIGHT INITIALIZE AN HTTPS CLIENT THAT DOESN'T TRUST THE CUSTOM CERTIFICATE
-
-
+import sys
 import json
 import logging
 import socket
@@ -44,6 +41,7 @@
 from holmes.utils.console.logging import init_logging
 from holmes.utils.console.result import handle_result
 from holmes.utils.file_utils import write_json_file
+from holmes.utils.colors import USER_COLOR
 
 app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
 investigate_app = typer.Typer(
diff --git a/holmes/plugins/toolsets/robusta/robusta.py b/holmes/plugins/toolsets/robusta/robusta.py
@@ -4,7 +4,7 @@
 
 from typing import Optional, Dict, Any, List
 from holmes.common.env_vars import load_bool
-from holmes.core.supabase_dal import SupabaseDal
+from holmes.core.supabase_dal import SupabaseDal, FindingType
 from holmes.core.tools import (
     StaticPrerequisite,
     Tool,
@@ -168,7 +168,7 @@ def __init__(
                 required=True,
             ),
             END_TIME: ToolParameter(
-                description="The starting time boundary for the search period. String in RFC3339 format.",
+                description="The ending time boundary for the search period. String in RFC3339 format.",
                 type="string",
                 required=True,
             ),
@@ -188,7 +188,7 @@ def __init__(
                         required=False,
                     ),
                     "workload": ToolParameter(
-                        description="The kubernetes workload name for filtering configuration changes. Deployment name or Pod name for example.",
+                        description="Kubernetes resource name to filter configuration changes (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
                         type="string",
                         required=False,
                     ),
@@ -203,10 +203,13 @@ def __init__(
         self._dal = dal
 
     def _fetch_change_history(
-        self, params: Dict, cluster: Optional[str] = None
+        self,
+        params: Dict,
+        cluster: Optional[str] = None,
+        finding_type: FindingType = FindingType.CONFIGURATION_CHANGE,
     ) -> Optional[List[Dict]]:
         if self._dal and self._dal.enabled:
-            return self._dal.get_configuration_changes_metadata(
+            return self._dal.get_issues_metadata(
                 start_datetime=params["start_datetime"],
                 end_datetime=params["end_datetime"],
                 limit=min(
@@ -216,6 +219,7 @@ def _fetch_change_history(
                 ns=params.get("namespace"),
                 workload=params.get("workload"),
                 cluster=cluster,
+                finding_type=finding_type,
             )
         return None
 
@@ -231,7 +235,7 @@ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolRes
             else:
                 return StructuredToolResult(
                     status=StructuredToolResultStatus.NO_DATA,
-                    data=f"Could not find changes for {params}",
+                    data=f"{self.name} found no data. {params}",
                     params=params,
                 )
         except Exception as e:
@@ -254,7 +258,7 @@ def __init__(self, dal: Optional[SupabaseDal]):
             name="fetch_configuration_changes_metadata",
             description=(
                 "Fetch configuration changes metadata in a given time range. "
-                "By default, fetch all cluster changes. Can be filtered on a given namespace or a specific workload. "
+                "By default, fetch all cluster changes. Can be filtered on a given namespace or a specific kubernetes resource. "
                 "Use fetch_finding_by_id to get detailed change of one specific configuration change."
             ),
         )
@@ -285,6 +289,26 @@ def get_parameterized_one_liner(self, params: Dict) -> str:
         return f"Robusta: Search External Change History {params}"
 
 
+class FetchResourceIssuesMetadata(FetchConfigurationChangesMetadataBase):
+    def __init__(self, dal: Optional[SupabaseDal]):
+        super().__init__(
+            dal=dal,
+            name="fetch_resource_issues_metadata",
+            description=(
+                "Fetch issues and alert metadata in a given time range. "
+                "Must be filtered on a given namespace and specific kubernetes resource such as pod, deployment, job, etc. "
+                "Use fetch_finding_by_id to get further information on a specific issue or alert."
+            ),
+            add_cluster_filter=True,
+        )
+
+    def _fetch_resource_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+        return super()._fetch_change_history(params, finding_type=FindingType.ISSUE)
+
+    def get_parameterized_one_liner(self, params: Dict) -> str:
+        return f"Robusta: fetch resource issues metadata {params}"
+
+
 class RobustaToolset(Toolset):
     def __init__(self, dal: Optional[SupabaseDal]):
         dal_prereq = StaticPrerequisite(
@@ -300,6 +324,7 @@ def __init__(self, dal: Optional[SupabaseDal]):
             FetchRobustaFinding(dal),
             FetchConfigurationChangesMetadata(dal),
             FetchResourceRecommendation(dal),
+            FetchResourceIssuesMetadata(dal),
         ]
 
         if PULL_EXTERNAL_FINDINGS:
diff --git a/holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 b/holmes/plugins/toolsets/robusta/robusta_instructions.jinja2
@@ -3,6 +3,8 @@
 * If a change seems important to the investigation, Use fetch_finding_by_id with the configuration change ID to get full details of the change.
 * You must ALWAYS call fetch_configuration_changes_metadata when investigating an alert
 * Never respond without calling fetch_configuration_changes_metadata
+* When investigating a resource (pod, deployment, or job), if no relevant information is available from the live cluster at the time of investigation, call the fetch_resource_issues_metadata function to retrieve its historical alert data.
+* You can use fetch_resource_issues_metadata to get issues context for a specific kubernetes resource. Start with a 4-hour window and expand to a 24-hour window if nothing comes up.
 * When investigating an alert, look at historical configuration changes that happen 4 hours before the alert started
 * If you found a change that caused the alert, you MUST write: 'The issue was introduced by ...' with a short description of the change, and the date of it.
 For example:
diff --git a/server.py b/server.py
@@ -1,11 +1,6 @@
 # ruff: noqa: E402
-import json
 import os
-from typing import List, Optional
 
-import litellm
-import sentry_sdk
-from holmes import get_version, is_official_release
 from holmes.utils.cert_utils import add_custom_certificate
 
 ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "")
@@ -20,7 +15,12 @@
 import uvicorn
 import colorlog
 import time
+import json
+from typing import List, Optional
 
+import litellm
+import sentry_sdk
+from holmes import get_version, is_official_release
 from litellm.exceptions import AuthenticationError
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import StreamingResponse
diff --git a/tests/llm/fixtures/test_ask_holmes/93_calling_datadog/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/93_calling_datadog/toolsets.yaml
@@ -1,15 +1,9 @@
 toolsets:
-  prometheus/metrics:
-    enabled: False
   kubernetes/kube-lineage-extras:
     enabled: true
   kubernetes/logs:
     enabled: False
-  kubernetes/core:
-    enabled: true
   datadog/logs:
     enabled: True
   datadog/metrics:
     enabled: True
-  datadog/traces:
-    enabled: True
diff --git a/tests/test_app_imports.py b/tests/test_app_imports.py
@@ -0,0 +1,37 @@
+import os
+import pytest
+
+EXPECTED_LINES = [
+    "# ruff: noqa: E402\n",
+    "import os\n",
+    "\n",
+    "from holmes.utils.cert_utils import add_custom_certificate\n",
+    "\n",
+    'ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "")\n',
+    "if add_custom_certificate(ADDITIONAL_CERTIFICATE):\n",
+    '    print("added custom certificate")\n',
+    "\n",
+    "# DO NOT ADD ANY IMPORTS OR CODE ABOVE THIS LINE\n",
+    "# IMPORTING ABOVE MIGHT INITIALIZE AN HTTPS CLIENT THAT DOESN'T TRUST THE CUSTOM CERTIFICATE\n",
+]
+
+
+@pytest.mark.parametrize(
+    "file_path,file_name",
+    [
+        ("holmes/main.py", "main.py"),
+        ("server.py", "server.py"),
+        ("experimental/ag-ui/server-agui.py", "server-agui.py"),
+    ],
+)
+def test_app_files_have_correct_initial_lines(file_path, file_name):
+    """Test that app files start with the required certificate handling code."""
+    full_path = os.path.join(os.path.dirname(__file__), "..", file_path)
+
+    with open(full_path, "r") as f:
+        lines = f.readlines()
+
+    for i, expected_line in enumerate(EXPECTED_LINES):
+        assert (
+            lines[i] == expected_line
+        ), f"Line {i + 1} should be: {expected_line.strip()!r}, but got: {lines[i].strip()!r}. This test makes sure the import order in the {file_name} file is correct. If you see this, go to the {file_name} file and move your imports to lower lines."