Skip to content

Commit bef085f

Browse files
authored
add support for multimodal content in chat completions (#49)
* add support to multimodal in chat completions Signed-off-by: Juanma Barea <[email protected]> * fix lint errors Signed-off-by: Juanma Barea <[email protected]> * add image_url as a block Signed-off-by: Juanma Barea <[email protected]> * fix lint errors Signed-off-by: Juanma Barea <[email protected]> * add text separator to allow better tokens number calculation Signed-off-by: Juanma Barea <[email protected]> --------- Signed-off-by: Juanma Barea <[email protected]>
1 parent b72af25 commit bef085f

File tree

3 files changed

+65
-5
lines changed

3 files changed

+65
-5
lines changed

pkg/llm-d-inference-sim/defs.go

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ limitations under the License.
1919
package llmdinferencesim
2020

2121
import (
22+
"encoding/json"
23+
"errors"
2224
"fmt"
2325
"strings"
2426
"sync"
@@ -175,7 +177,65 @@ type message struct {
175177
// Role is the message Role, optional values are 'user', 'assistant', ...
176178
Role string `json:"role,omitempty"`
177179
// Content defines text of this message
178-
Content string `json:"content,omitempty"`
180+
Content content `json:"content,omitempty"`
181+
}
182+
183+
type content struct {
184+
Raw string
185+
Structured []contentBlock
186+
}
187+
188+
type contentBlock struct {
189+
Type string `json:"type"`
190+
Text string `json:"text,omitempty"`
191+
ImageURL ImageBlock `json:"image_url,omitempty"`
192+
}
193+
194+
type ImageBlock struct {
195+
Url string `json:"url,omitempty"`
196+
}
197+
198+
// UnmarshalJSON allow use both format
199+
func (mc *content) UnmarshalJSON(data []byte) error {
200+
// Raw format
201+
var str string
202+
if err := json.Unmarshal(data, &str); err == nil {
203+
mc.Raw = str
204+
return nil
205+
}
206+
207+
// Block format
208+
var blocks []contentBlock
209+
if err := json.Unmarshal(data, &blocks); err == nil {
210+
mc.Structured = blocks
211+
return nil
212+
}
213+
214+
return errors.New("content format not supported")
215+
}
216+
217+
func (mc content) MarshalJSON() ([]byte, error) {
218+
if mc.Raw != "" {
219+
return json.Marshal(mc.Raw)
220+
}
221+
if mc.Structured != nil {
222+
return json.Marshal(mc.Structured)
223+
}
224+
return json.Marshal("")
225+
}
226+
227+
func (mc content) PlainText() string {
228+
if mc.Raw != "" {
229+
return mc.Raw
230+
}
231+
var sb strings.Builder
232+
for _, block := range mc.Structured {
233+
if block.Type == "text" {
234+
sb.WriteString(block.Text)
235+
sb.WriteString(" ")
236+
}
237+
}
238+
return sb.String()
179239
}
180240

181241
// chatCompletionRequest defines structure of /chat/completion request
@@ -200,7 +260,7 @@ type chatCompletionRequest struct {
200260
// getNumberOfPromptTokens returns the number of whitespace-separated
// tokens (words) across the text content of all messages in the request.
func (c *chatCompletionRequest) getNumberOfPromptTokens() int {
	// Count fields per message instead of concatenating every message
	// into one growing string first: string += in a loop is quadratic,
	// and the joined string itself is never needed. PlainText already
	// terminates each text block with a space, so per-message counting
	// yields the same total as splitting the concatenation.
	tokens := 0
	for _, msg := range c.Messages {
		tokens += len(strings.Fields(msg.Content.PlainText()))
	}
	return tokens
}
@@ -328,7 +388,7 @@ func (req textCompletionRequest) createResponseText(mode string) (string, string
328388
func (req *chatCompletionRequest) getLastUserMsg() string {
329389
for i := len(req.Messages) - 1; i >= 0; i-- {
330390
if req.Messages[i].Role == roleUser {
331-
return req.Messages[i].Content
391+
return req.Messages[i].Content.PlainText()
332392
}
333393
}
334394

pkg/llm-d-inference-sim/simulator.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respText
446446
baseResp.Object = chatCompletionObject
447447
return &chatCompletionResponse{
448448
baseCompletionResponse: baseResp,
449-
Choices: []chatRespChoice{{Message: message{Role: roleAssistant, Content: respText}, baseResponseChoice: baseChoice}},
449+
Choices: []chatRespChoice{{Message: message{Role: roleAssistant, Content: content{Raw: respText}}, baseResponseChoice: baseChoice}},
450450
}
451451
}
452452

pkg/llm-d-inference-sim/streaming.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ func (s *VllmSimulator) createCompletionChunk(isChatCompletion bool, creationTim
156156
chunk.Choices[0].Delta.Role = role
157157
}
158158
if len(token) > 0 {
159-
chunk.Choices[0].Delta.Content = token
159+
chunk.Choices[0].Delta.Content.Raw = token
160160
}
161161

162162
return &chunk

0 commit comments

Comments
 (0)