26 changes: 25 additions & 1 deletion libs/oci/README.md
@@ -62,7 +62,7 @@ embeddings.embed_query("What is the meaning of life?")
```

### 4. Use Structured Output
`ChatOCIGenAI` supports structured output.

<sub>**Note:** The default method is `function_calling`. If the default method returns `None` (e.g. for Gemini models), try `json_schema` or `json_mode`, as in the sketch below.</sub>

@@ -79,6 +79,30 @@ structured_llm = llm.with_structured_output(Joke)
structured_llm.invoke("Tell me a joke about programming")
```
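
A minimal sketch of switching methods, reusing the `llm` and `Joke` objects from the example above and the `method` argument described in the note (accepted values: `function_calling`, `json_schema`, `json_mode`):

```python
# Fall back to JSON-schema based structured output when function calling returns None.
structured_llm = llm.with_structured_output(Joke, method="json_schema")
structured_llm.invoke("Tell me a joke about programming")
```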

### 5. Use Parallel Tool Calling
Enable parallel tool calling so the model can request multiple tool calls in a single response, improving performance for multi-tool workflows.

```python
from langchain_oci import ChatOCIGenAI

# Option 1: Set at class level for all tool bindings
llm = ChatOCIGenAI(
    model_id="meta.llama-4-maverick-17b-128e-instruct-fp8",  # Llama 4+, xAI Grok, OpenAI, Mistral
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="MY_COMPARTMENT_ID",
    parallel_tool_calls=True  # Enable parallel tool calling
)

# Option 2: Set per-binding
llm = ChatOCIGenAI(model_id="xai.grok-4-fast")  # Example with Grok
llm_with_tools = llm.bind_tools(
    [get_weather, calculate_tip, get_population],
    parallel_tool_calls=True  # Tools can execute simultaneously
)
```

<sub>**Note:** Parallel tool calling is supported for models using GenericChatRequest (Meta Llama 4+, xAI Grok, OpenAI, Mistral). Llama 3.x and Cohere models will raise an error if this parameter is used.</sub>
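
When the model does issue multiple calls at once, they arrive together on the response message. A minimal sketch of reading them, assuming the hypothetical `llm_with_tools`, `get_weather`, `calculate_tip`, and `get_population` objects from Option 2 above:

```python
# With parallel_tool_calls=True the model may return several tool calls in one response.
response = llm_with_tools.invoke(
    "What's the weather in Chicago and the population of Illinois?"
)
for tool_call in response.tool_calls:
    print(tool_call["name"], tool_call["args"])
```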


## OCI Data Science Model Deployment Examples

141 changes: 117 additions & 24 deletions libs/oci/langchain_oci/chat_models/oci_generative_ai.py
@@ -247,8 +247,13 @@ def chat_generation_info(self, response: Any) -> Dict[str, Any]:
}

# Include token usage if available
if hasattr(response.data.chat_response, "usage") and response.data.chat_response.usage:
generation_info["total_tokens"] = response.data.chat_response.usage.total_tokens
if (
hasattr(response.data.chat_response, "usage")
and response.data.chat_response.usage
):
generation_info["total_tokens"] = (
response.data.chat_response.usage.total_tokens
)

# Include tool calls if available
if self.chat_tool_calls(response):
@@ -342,6 +347,14 @@ def messages_to_oci_params(

This includes conversion of chat history and tool call results.
"""
# Cohere models don't support parallel tool calls
if kwargs.get("is_parallel_tool_calls"):
raise ValueError(
"Parallel tool calls are not supported for Cohere models. "
"This feature is only available for models using GenericChatRequest "
"(Meta, Llama, xAI Grok, OpenAI, Mistral)."
)

is_force_single_step = kwargs.get("is_force_single_step", False)
oci_chat_history = []

@@ -622,9 +635,14 @@ def chat_generation_info(self, response: Any) -> Dict[str, Any]:
}

# Include token usage if available
if hasattr(response.data.chat_response, "usage") and response.data.chat_response.usage:
generation_info["total_tokens"] = response.data.chat_response.usage.total_tokens

if (
hasattr(response.data.chat_response, "usage")
and response.data.chat_response.usage
):
generation_info["total_tokens"] = (
response.data.chat_response.usage.total_tokens
)

if self.chat_tool_calls(response):
generation_info["tool_calls"] = self.format_response_tool_calls(
self.chat_tool_calls(response)
@@ -770,8 +788,7 @@ def messages_to_oci_params(
# continue calling tools even after receiving results.

def _should_allow_more_tool_calls(
messages: List[BaseMessage],
max_tool_calls: int
messages: List[BaseMessage], max_tool_calls: int
) -> bool:
"""
Determine if the model should be allowed to call more tools.
@@ -787,10 +804,7 @@ def _should_allow_more_tool_calls(
max_tool_calls: Maximum number of tool calls before forcing stop
"""
# Count total tool calls made so far
tool_call_count = sum(
1 for msg in messages
if isinstance(msg, ToolMessage)
)
tool_call_count = sum(1 for msg in messages if isinstance(msg, ToolMessage))

# Safety limit: prevent runaway tool calling
if tool_call_count >= max_tool_calls:
@@ -799,12 +813,12 @@ def _should_allow_more_tool_calls(
# Detect infinite loop: same tool called with same arguments in succession
recent_calls = []
for msg in reversed(messages):
if hasattr(msg, 'tool_calls') and msg.tool_calls:
if hasattr(msg, "tool_calls") and msg.tool_calls:
for tc in msg.tool_calls:
# Create signature: (tool_name, sorted_args)
try:
args_str = json.dumps(tc.get('args', {}), sort_keys=True)
signature = (tc.get('name', ''), args_str)
args_str = json.dumps(tc.get("args", {}), sort_keys=True)
signature = (tc.get("name", ""), args_str)

# Check if this exact call was made in last 2 calls
if signature in recent_calls[-2:]:
@@ -829,6 +843,10 @@ def _should_allow_more_tool_calls(
result["tool_choice"] = self.oci_tool_choice_none()
# else: Allow model to decide (default behavior)

# Add parallel tool calls support (GenericChatRequest models)
if "is_parallel_tool_calls" in kwargs:
result["is_parallel_tool_calls"] = kwargs["is_parallel_tool_calls"]

return result

def _process_message_content(
Expand Down Expand Up @@ -1142,9 +1160,7 @@ def _prepare_request(
) from ex

oci_params = self._provider.messages_to_oci_params(
messages,
max_sequential_tool_calls=self.max_sequential_tool_calls,
**kwargs
messages, max_sequential_tool_calls=self.max_sequential_tool_calls, **kwargs
)

oci_params["is_stream"] = stream
@@ -1154,12 +1170,17 @@ def _prepare_request(
_model_kwargs[self._provider.stop_sequence_key] = stop

# Warn if using max_tokens with OpenAI models
if self.model_id and self.model_id.startswith("openai.") and "max_tokens" in _model_kwargs:
if (
self.model_id
and self.model_id.startswith("openai.")
and "max_tokens" in _model_kwargs
):
import warnings

warnings.warn(
f"OpenAI models require 'max_completion_tokens' instead of 'max_tokens'.",
UserWarning,
stacklevel=2
stacklevel=2,
)

chat_params = {**_model_kwargs, **kwargs, **oci_params}
@@ -1179,13 +1200,57 @@ def _prepare_request(

return request

def _supports_parallel_tool_calls(self, model_id: str) -> bool:
"""Check if the model supports parallel tool calling.

Parallel tool calling is supported for:
- Llama 4+ only (tested and verified)
- Other GenericChatRequest models (xAI Grok, OpenAI, Mistral)

Not supported for:
- All Llama 3.x versions (3.0, 3.1, 3.2, 3.3)
- Cohere models

Args:
model_id: The model identifier (e.g., "meta.llama-4-maverick-17b-128e-instruct-fp8")

Returns:
bool: True if model supports parallel tool calling, False otherwise
"""
import re

# Extract provider from model_id (e.g., "meta" from "meta.llama-4-maverick-17b-128e-instruct-fp8")
provider = model_id.split(".")[0].lower()

# Cohere models don't support parallel tool calling
if provider == "cohere":
return False

# For Meta/Llama models, check version
if provider == "meta" and "llama" in model_id.lower():
# Extract version number (e.g., "4" from "meta.llama-4-maverick-17b-128e-instruct-fp8")
version_match = re.search(r"llama-(\d+)", model_id.lower())
if version_match:
major = int(version_match.group(1))

# Only Llama 4+ supports parallel tool calling
# Llama 3.x (including 3.3) does NOT support it based on testing
if major >= 4:
return True

return False

# Other GenericChatRequest models (xAI Grok, OpenAI, Mistral) support it
return True

def bind_tools(
self,
tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable, BaseTool]],
*,
tool_choice: Optional[
Union[dict, str, Literal["auto", "none", "required", "any"], bool]
] = None,
parallel_tool_calls: Optional[bool] = None,
**kwargs: Any,
) -> Runnable[LanguageModelInput, BaseMessage]:
"""Bind tool-like objects to this chat model.
@@ -1206,6 +1271,12 @@ def bind_tools(
{"type": "function", "function": {"name": <<tool_name>>}}:
calls <<tool_name>> tool.
- False or None: no effect, default Meta behavior.
parallel_tool_calls: Whether to enable parallel function calling.
If True, the model can call multiple tools simultaneously.
If False, tools are called sequentially.
If None (default), uses the class-level parallel_tool_calls setting.
Supported for Meta Llama 4+ and other models using GenericChatRequest
(xAI Grok, OpenAI, Mistral). Not supported for Llama 3.x or Cohere models.
kwargs: Any additional parameters are passed directly to
:meth:`~langchain_oci.chat_models.oci_generative_ai.ChatOCIGenAI.bind`.
"""
@@ -1215,6 +1286,28 @@ def bind_tools(
if tool_choice is not None:
kwargs["tool_choice"] = self._provider.process_tool_choice(tool_choice)

# Add parallel tool calls support
# Use bind-time parameter if provided, else fall back to class default
use_parallel = (
parallel_tool_calls
if parallel_tool_calls is not None
else self.parallel_tool_calls
)
if use_parallel:
# Validate model supports parallel tool calling
if not self._supports_parallel_tool_calls(self.model_id):
if "llama" in self.model_id.lower():
raise ValueError(
f"Parallel tool calls are not supported for {self.model_id}. "
"This feature is only available for Llama 4+ models. "
"Llama 3.x models (including 3.3) do not support parallel tool calling."
)
else:
raise ValueError(
f"Parallel tool calls are not supported for {self.model_id}."
)
kwargs["is_parallel_tool_calls"] = True

return super().bind(tools=formatted_tools, **kwargs)

def with_structured_output(
@@ -1244,7 +1337,7 @@ def with_structured_output(
used. Note that if using "json_mode" then you must include instructions
for formatting the output into the desired schema into the model call.
If "json_schema" then it allows the user to pass a json schema (or pydantic)
to the model for structured output.
include_raw:
If False then only the parsed structured output is returned. If
an error occurs during model output parsing it will be raised. If True
@@ -1300,18 +1393,18 @@ def with_structured_output(
if is_pydantic_schema
else schema
)

response_json_schema = self._provider.oci_response_json_schema(
name=json_schema_dict.get("title", "response"),
description=json_schema_dict.get("description", ""),
schema=json_schema_dict,
is_strict=True
is_strict=True,
)

response_format_obj = self._provider.oci_json_schema_response_format(
json_schema=response_json_schema
)

llm = self.bind(response_format=response_format_obj)
if is_pydantic_schema:
output_parser = PydanticOutputParser(pydantic_object=schema)
7 changes: 7 additions & 0 deletions libs/oci/langchain_oci/llms/oci_generative_ai.py
@@ -120,6 +120,13 @@ class OCIGenAIBase(BaseModel, ABC):
"""Maximum tool calls before forcing final answer.
Prevents infinite loops while allowing multi-step orchestration."""

parallel_tool_calls: bool = False
"""Whether to enable parallel function calling during tool use.
If True, the model can call multiple tools simultaneously.
Supported for Meta Llama 4+ and other models using GenericChatRequest (xAI Grok, OpenAI, Mistral).
Not supported for Llama 3.x or Cohere models.
Default: False for backward compatibility."""

model_config = ConfigDict(
extra="forbid", arbitrary_types_allowed=True, protected_namespaces=()
)