Commit 425f0f3

cubic-dev-ai comments

1 parent 3721276 commit 425f0f3

File tree

7 files changed: +86 additions, -162 deletions

README.md

Lines changed: 8 additions & 2 deletions
@@ -55,7 +55,13 @@ Activate the virtual environment:
 source .venv/bin/activate
 ```
 
-#### Step 3: Configure Environment
+#### Step 3: Install Python Packages
+Install the required Python packages using uv:
+```bash
+uv pip install -r requirements.txt
+```
+
+#### Step 4: Configure Environment
 1. Create a copy of the example environment file:
    - Windows (Command Prompt):
    ```bash
@@ -67,7 +73,7 @@ cp .env.example .env
 ```
 2. Open `.env` in your preferred text editor and add your API keys and other settings
 
-#### Step 4: Enjoy the web-ui
+#### Step 5: Enjoy the web-ui
 1. **Run the WebUI:**
    ```bash
    python webui.py --ip 127.0.0.1 --port 7788

src/browser/browser_compat.py

Lines changed: 11 additions & 0 deletions
@@ -45,6 +45,17 @@ def __init__(self,
         self.save_recording_path = save_recording_path
         self.save_downloads_path = save_downloads_path
         self._extra = kwargs
+
+    def model_dump(self) -> Dict[str, Any]:
+        """Compatibility method for pydantic model_dump"""
+        return {
+            'window_width': self.window_width,
+            'window_height': self.window_height,
+            'trace_path': self.trace_path,
+            'save_recording_path': self.save_recording_path,
+            'save_downloads_path': self.save_downloads_path,
+            **self._extra
+        }
 
 
 class BrowserState:
     """Compatibility shim for BrowserState"""

src/browser/custom_browser.py

Lines changed: 16 additions & 5 deletions
@@ -5,9 +5,19 @@
 from browser_use.browser.profile import BrowserProfile
 import logging
 
-# Chrome args and utils imports need to be updated for browser_use 0.6.0
-# These modules have been restructured in the new version
+# Updated imports for browser_use 0.6.0
+from browser_use.browser.profile import (
+    CHROME_DEFAULT_ARGS,
+    CHROME_HEADLESS_ARGS,
+    CHROME_DOCKER_ARGS,
+    CHROME_DISABLE_SECURITY_ARGS,
+    CHROME_DETERMINISTIC_RENDERING_ARGS,
+    get_display_size,
+    get_window_adjustments,
+)
+from browser_use.config import CONFIG
 from browser_use.utils import time_execution_async
+from .browser_compat import BrowserContextConfig
 import socket
 
 from .custom_context import CustomBrowserContext
@@ -44,13 +54,14 @@ async def _setup_builtin_browser(self, playwright_instance) -> object:
             screen_size = {'width': 1920, 'height': 1080}
             offset_x, offset_y = 0, 0
         else:
-            screen_size = get_screen_resolution()
+            display_size = get_display_size()
+            screen_size = {'width': display_size.width, 'height': display_size.height} if display_size else {'width': 1920, 'height': 1080}
             offset_x, offset_y = get_window_adjustments()
 
         chrome_args = {
             f'--remote-debugging-port={self.config.chrome_remote_debugging_port}',
-            *CHROME_ARGS,
-            *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
+            *CHROME_DEFAULT_ARGS,
+            *(CHROME_DOCKER_ARGS if CONFIG.IN_DOCKER else []),
             *(CHROME_HEADLESS_ARGS if self.config.headless else []),
             *(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
             *(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
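
The second hunk renames `CHROME_ARGS` to `CHROME_DEFAULT_ARGS` and moves the Docker check behind `CONFIG.IN_DOCKER`. A standalone sketch of the same composition pattern, using only the imports shown in the first hunk (the port and headless flag are example values, normally read from `self.config`):

```python
# Sketch of composing Chrome launch args with the browser_use 0.6.0 names,
# mirroring the hunk above. Set membership de-duplicates repeated flags.
from browser_use.browser.profile import (
    CHROME_DEFAULT_ARGS,
    CHROME_DOCKER_ARGS,
    CHROME_HEADLESS_ARGS,
)
from browser_use.config import CONFIG

remote_debugging_port = 9222  # example; normally self.config.chrome_remote_debugging_port
headless = True               # example; normally self.config.headless

chrome_args = {
    f'--remote-debugging-port={remote_debugging_port}',
    *CHROME_DEFAULT_ARGS,
    *(CHROME_DOCKER_ARGS if CONFIG.IN_DOCKER else []),
    *(CHROME_HEADLESS_ARGS if headless else []),
}
print(sorted(chrome_args))
```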

src/utils/llm_provider.py

Lines changed: 16 additions & 4 deletions
@@ -100,14 +100,16 @@ def get_llm_model(provider: str, **kwargs) -> BaseChatModel:
         )
 
     # For providers not directly supported by browser-use, use OpenAI-compatible API
-    elif provider in ["grok", "alibaba", "moonshot", "unbound", "siliconflow", "modelscope"]:
+    elif provider in ["grok", "alibaba", "moonshot", "unbound", "siliconflow", "modelscope", "mistral", "ibm"]:
         base_url_map = {
             "grok": os.getenv("GROK_ENDPOINT", "https://api.x.ai/v1"),
             "alibaba": os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
             "moonshot": os.getenv("MOONSHOT_ENDPOINT"),
             "unbound": os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
             "siliconflow": os.getenv("SILICONFLOW_ENDPOINT", ""),
-            "modelscope": os.getenv("MODELSCOPE_ENDPOINT", "")
+            "modelscope": os.getenv("MODELSCOPE_ENDPOINT", ""),
+            "mistral": os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1"),
+            "ibm": os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com")
         }
 
         model_defaults = {
@@ -116,19 +118,29 @@ def get_llm_model(provider: str, **kwargs) -> BaseChatModel:
             "moonshot": "moonshot-v1-32k-vision-preview",
             "unbound": "gpt-4o-mini",
             "siliconflow": "Qwen/QwQ-32B",
-            "modelscope": "Qwen/QwQ-32B"
+            "modelscope": "Qwen/QwQ-32B",
+            "mistral": "pixtral-large-latest",
+            "ibm": "ibm/granite-vision-3.1-2b-preview"
         }
 
         base_url = kwargs.get("base_url") or base_url_map[provider]
         if not base_url:
             raise ValueError(f"{provider} endpoint is required")
+
+        # Special handling for IBM which may require project_id in headers
+        extra_headers = {}
+        if provider == "ibm":
+            project_id = kwargs.get("project_id") or os.getenv("IBM_PROJECT_ID")
+            if project_id:
+                extra_headers["X-Project-ID"] = project_id
 
         return ChatOpenAI(
             model=kwargs.get("model_name", model_defaults[provider]),
             temperature=kwargs.get("temperature", 0.2),
             base_url=base_url,
             api_key=api_key,
+            extra_headers=extra_headers if extra_headers else None,
        )
 
     else:
-        raise ValueError(f"Unsupported provider: {provider}. Supported providers: anthropic, openai, google, groq, ollama, azure_openai, deepseek, grok, alibaba, moonshot, unbound, siliconflow, modelscope")
+        raise ValueError(f"Unsupported provider: {provider}. Supported providers: anthropic, openai, google, groq, ollama, azure_openai, deepseek, grok, alibaba, moonshot, unbound, siliconflow, modelscope, mistral, ibm")

src/webui/components/deep_research_agent_tab.py

Lines changed: 23 additions & 16 deletions
@@ -256,20 +256,34 @@ def get_setting(tab: str, key: str, default: Any = None):
 
         # --- 7. Task Finalization ---
         logger.info("Agent task processing finished. Awaiting final result...")
-        final_result_dict = await agent_task  # Get result or raise exception
-        logger.info(f"Agent run completed. Result keys: {final_result_dict.keys() if final_result_dict else 'None'}")
+        final_result_path = await agent_task  # Get result path or raise exception
+        logger.info(f"Agent run completed. Result path: {final_result_path}")
 
-        # Try to get task ID from result if not known before
-        if not running_task_id and final_result_dict and 'task_id' in final_result_dict:
-            running_task_id = final_result_dict['task_id']
+        # Try to get task ID from agent's current state if not known before
+        if not running_task_id and webui_manager.dr_agent.current_task_id:
+            running_task_id = webui_manager.dr_agent.current_task_id
             webui_manager.dr_task_id = running_task_id
             task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
             report_file_path = os.path.join(task_specific_dir, "report.md")
-            logger.info(f"Task ID confirmed from result: {running_task_id}")
+            logger.info(f"Task ID confirmed from agent state: {running_task_id}")
 
         final_ui_update = {}
-        if report_file_path and os.path.exists(report_file_path):
-            logger.info(f"Loading final report from: {report_file_path}")
+
+        # Use the returned report path directly
+        if final_result_path and os.path.exists(final_result_path):
+            logger.info(f"Loading final report from returned path: {final_result_path}")
+            report_content = _read_file_safe(final_result_path)
+            if report_content:
+                final_ui_update[markdown_display_comp] = gr.update(value=report_content)
+                final_ui_update[markdown_download_comp] = gr.File(value=final_result_path,
+                                                                  label=f"Report ({running_task_id or 'research'}.md)",
+                                                                  interactive=True)
+            else:
+                final_ui_update[markdown_display_comp] = gr.update(
+                    value="# Research Complete\n\n*Error reading final report file.*")
+        elif report_file_path and os.path.exists(report_file_path):
+            # Fallback to expected report path if direct path doesn't work
+            logger.info(f"Loading final report from expected path: {report_file_path}")
             report_content = _read_file_safe(report_file_path)
             if report_content:
                 final_ui_update[markdown_display_comp] = gr.update(value=report_content)
@@ -279,15 +293,8 @@ def get_setting(tab: str, key: str, default: Any = None):
             else:
                 final_ui_update[markdown_display_comp] = gr.update(
                     value="# Research Complete\n\n*Error reading final report file.*")
-        elif final_result_dict and 'report' in final_result_dict:
-            logger.info("Using report content directly from agent result.")
-            # If agent directly returns report content
-            final_ui_update[markdown_display_comp] = gr.update(value=final_result_dict['report'])
-            # Cannot offer download if only content is available
-            final_ui_update[markdown_download_comp] = gr.update(value=None, label="Download Research Report",
-                                                                interactive=False)
         else:
-            logger.warning("Final report file not found and not in result dict.")
+            logger.warning("Final report file not found at returned path or expected location.")
             final_ui_update[markdown_display_comp] = gr.update(value="# Research Complete\n\n*Final report not found.*")
 
         yield final_ui_update
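
The finalization logic now resolves the report in a fixed order: the path returned by the agent, then the conventional `<save_dir>/<task_id>/report.md` location, then a "not found" message. A condensed sketch of that resolution order, with `_read_file_safe` assumed to return the file text or `None` on error, as its use above implies:

```python
# Condensed sketch of the report-resolution order implemented above.
import os
from typing import Callable, Optional, Tuple

def resolve_report(
    final_result_path: Optional[str],
    report_file_path: Optional[str],
    read_file_safe: Callable[[str], Optional[str]],
) -> Tuple[Optional[str], Optional[str]]:
    """Return (content, path): agent-returned path first, expected path second."""
    if final_result_path and os.path.exists(final_result_path):
        return read_file_safe(final_result_path), final_result_path
    if report_file_path and os.path.exists(report_file_path):
        return read_file_safe(report_file_path), report_file_path
    return None, None  # caller renders "Final report not found."
```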

tests/test_controller.py

Lines changed: 7 additions & 133 deletions
@@ -1,136 +1,10 @@
-import asyncio
-import pdb
-import sys
-import time
+# Test file for controller functionality
 
-sys.path.append(".")
 
-from dotenv import load_dotenv
+# MCP functionality has been removed from the application.
+# Controller tests related to MCP have been removed since the functionality no longer exists.
+# This file is kept for potential future controller tests.
 
-load_dotenv()
-
-
-async def test_mcp_client():
-    # MCP functionality removed - test disabled
-    print("MCP functionality has been removed from the application")
-    return
-
-    test_server_config = {
-        "mcpServers": {
-            # "markitdown": {
-            #     "command": "docker",
-            #     "args": [
-            #         "run",
-            #         "--rm",
-            #         "-i",
-            #         "markitdown-mcp:latest"
-            #     ]
-            # },
-            "desktop-commander": {
-                "command": "npx",
-                "args": [
-                    "-y",
-                    "@wonderwhy-er/desktop-commander"
-                ]
-            },
-            # "filesystem": {
-            #     "command": "npx",
-            #     "args": [
-            #         "-y",
-            #         "@modelcontextprotocol/server-filesystem",
-            #         "/Users/xxx/ai_workspace",
-            #     ]
-            # },
-        }
-    }
-
-    mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config)
-
-    for tool in mcp_tools:
-        tool_param_model = create_tool_param_model(tool)
-        print(tool.name)
-        print(tool.description)
-        print(tool_param_model.model_json_schema())
-    pdb.set_trace()
-
-
-async def test_controller_with_mcp():
-    # MCP functionality removed - test disabled
-    print("MCP functionality has been removed from the application")
-    return
-    import os
-    from src.controller.custom_controller import CustomController
-    from browser_use.controller.registry.views import ActionModel
-
-    mcp_server_config = {
-        "mcpServers": {
-            # "markitdown": {
-            #     "command": "docker",
-            #     "args": [
-            #         "run",
-            #         "--rm",
-            #         "-i",
-            #         "markitdown-mcp:latest"
-            #     ]
-            # },
-            "desktop-commander": {
-                "command": "npx",
-                "args": [
-                    "-y",
-                    "@wonderwhy-er/desktop-commander"
-                ]
-            },
-            # "filesystem": {
-            #     "command": "npx",
-            #     "args": [
-            #         "-y",
-            #         "@modelcontextprotocol/server-filesystem",
-            #         "/Users/xxx/ai_workspace",
-            #     ]
-            # },
-        }
-    }
-
-    controller = CustomController()
-    await controller.setup_mcp_client(mcp_server_config)
-    action_name = "mcp.desktop-commander.execute_command"
-    action_info = controller.registry.registry.actions[action_name]
-    param_model = action_info.param_model
-    print(param_model.model_json_schema())
-    params = {"command": f"python ./tmp/test.py"
-              }
-    validated_params = param_model(**params)
-    ActionModel_ = controller.registry.create_action_model()
-    # Create ActionModel instance with the validated parameters
-    action_model = ActionModel_(**{action_name: validated_params})
-    result = await controller.act(action_model)
-    result = result.extracted_content
-    print(result)
-    if result and "Command is still running. Use read_output to get more output." in result and "PID" in \
-            result.split("\n")[0]:
-        pid = int(result.split("\n")[0].split("PID")[-1].strip())
-        action_name = "mcp.desktop-commander.read_output"
-        action_info = controller.registry.registry.actions[action_name]
-        param_model = action_info.param_model
-        print(param_model.model_json_schema())
-        params = {"pid": pid}
-        validated_params = param_model(**params)
-        action_model = ActionModel_(**{action_name: validated_params})
-        output_result = ""
-        while True:
-            time.sleep(1)
-            result = await controller.act(action_model)
-            result = result.extracted_content
-            if result:
-                pdb.set_trace()
-                output_result = result
-                break
-        print(output_result)
-        pdb.set_trace()
-    await controller.close_mcp_client()
-    pdb.set_trace()
-
-
-if __name__ == '__main__':
-    # asyncio.run(test_mcp_client())
-    asyncio.run(test_controller_with_mcp())
+def test_placeholder():
+    """Placeholder test to maintain test file structure."""
+    pass

tests/test_llm_api.py

Lines changed: 5 additions & 2 deletions
@@ -68,10 +68,13 @@ def test_llm(config, query, image_path=None, system_message=None):
     messages = []
     if system_message:
         messages.append(SystemMessage(content=system_message))
-    messages.append(UserMessage(content=query))
+
+    # Use create_message_content to handle both text and image content
+    user_content = create_message_content(query, image_path) if image_path else query
+    messages.append(UserMessage(content=user_content))
 
     # Call the LLM
-    ai_msg = llm.ainvoke(messages)
+    ai_msg = llm.invoke(messages)
 
     # Handle different response types
    if hasattr(ai_msg, "reasoning_content"):
