Commit efe2e77

Merge branch 'main' into bench-dataset
2 parents 8440831 + b79a78d

File tree: 12 files changed, +413 −10 lines

.github/workflows/docker-publish.yml

Lines changed: 45 additions & 3 deletions
```diff
@@ -1,4 +1,4 @@
-name: Create and publish Docker image for extproc
+name: Create and publish Docker images
 
 on:
   workflow_dispatch:
@@ -18,7 +18,7 @@ on:
     branches: [ "main" ]
 
 jobs:
-  build_and_push:
+  build_and_push_extproc:
    runs-on: ubuntu-latest
    permissions:
      contents: read
@@ -43,7 +43,7 @@ jobs:
      - name: Set lowercase repository owner
        run: echo "REPOSITORY_OWNER_LOWER=$(echo $GITHUB_REPOSITORY_OWNER | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV
 
-      - name: Build and push Docker image
+      - name: Build and push extproc Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
@@ -52,3 +52,45 @@ jobs:
          tags: |
            ${{ inputs.is_nightly == true && format('ghcr.io/{0}/semantic-router/extproc:nightly-{1}', env.REPOSITORY_OWNER_LOWER, steps.date.outputs.date_tag) || format('ghcr.io/{0}/semantic-router/extproc:{1}', env.REPOSITORY_OWNER_LOWER, github.sha) }}
            ${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/extproc:latest', env.REPOSITORY_OWNER_LOWER) || '' }}
+
+  build_and_push_llm_katan:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Generate date tag for nightly builds
+        id: date
+        if: inputs.is_nightly == true
+        run: echo "date_tag=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
+
+      - name: Set lowercase repository owner
+        run: echo "REPOSITORY_OWNER_LOWER=$(echo $GITHUB_REPOSITORY_OWNER | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV
+
+      - name: Extract version from pyproject.toml
+        id: version
+        run: |
+          VERSION=$(grep '^version = ' e2e-tests/llm-katan/pyproject.toml | sed 's/version = "\(.*\)"/\1/')
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+
+      - name: Build and push llm-katan Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./e2e-tests/llm-katan
+          file: ./e2e-tests/llm-katan/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }} # Only push on merge to main, not on PRs
+          tags: |
+            ${{ inputs.is_nightly == true && format('ghcr.io/{0}/semantic-router/llm-katan:nightly-{1}', env.REPOSITORY_OWNER_LOWER, steps.date.outputs.date_tag) || format('ghcr.io/{0}/semantic-router/llm-katan:{1}', env.REPOSITORY_OWNER_LOWER, github.sha) }}
+            ${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/llm-katan:latest', env.REPOSITORY_OWNER_LOWER) || '' }}
+            ${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/llm-katan:v{1}', env.REPOSITORY_OWNER_LOWER, steps.version.outputs.version) || '' }}
```
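The version-extraction step above is plain shell and can be checked locally before relying on it in CI; a minimal sketch, assuming `e2e-tests/llm-katan/pyproject.toml` carries a `version = "..."` line as the workflow expects (the owner name below is illustrative):

```bash
# Dry run of the "Extract version from pyproject.toml" step.
VERSION=$(grep '^version = ' e2e-tests/llm-katan/pyproject.toml | sed 's/version = "\(.*\)"/\1/')
echo "would tag: ghcr.io/<owner>/semantic-router/llm-katan:v${VERSION}"
```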

.github/workflows/docker-release.yml

Lines changed: 44 additions & 3 deletions
```diff
@@ -1,12 +1,12 @@
-name: Create and publish Docker release image
+name: Create and publish Docker release images
 
 on:
   push:
     tags:
      - 'v*' # Triggers on version tags like v1.0.0, v2.1.3, etc.
 
 jobs:
-  build_and_push:
+  build_and_push_extproc:
    runs-on: ubuntu-latest
    permissions:
      contents: read
@@ -30,7 +30,7 @@ jobs:
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Build and push Docker image
+      - name: Build and push extproc Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
@@ -39,3 +39,44 @@ jobs:
          tags: |
            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/extproc:${{ steps.extract_tag.outputs.tag }}
            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/extproc:latest
+
+  build_and_push_llm_katan:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Extract tag name
+        id: extract_tag
+        run: echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
+
+      - name: Set lowercase repository owner
+        run: echo "REPOSITORY_OWNER_LOWER=$(echo $GITHUB_REPOSITORY_OWNER | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract version from pyproject.toml
+        id: version
+        run: |
+          VERSION=$(grep '^version = ' e2e-tests/llm-katan/pyproject.toml | sed 's/version = "\(.*\)"/\1/')
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+
+      - name: Build and push llm-katan Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./e2e-tests/llm-katan
+          file: ./e2e-tests/llm-katan/Dockerfile
+          push: true
+          tags: |
+            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/llm-katan:${{ steps.extract_tag.outputs.tag }}
+            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/llm-katan:v${{ steps.version.outputs.version }}
+            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/llm-katan:latest
```
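This workflow fires on `v*` tags only; a minimal sketch of cutting a release that would trigger it (the tag name and owner are illustrative):

```bash
# Pushing a version tag triggers docker-release.yml for both images.
git tag v0.1.8
git push origin v0.1.8
# Expected llm-katan tags afterwards:
#   ghcr.io/<owner>/semantic-router/llm-katan:v0.1.8
#   ghcr.io/<owner>/semantic-router/llm-katan:latest
```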

Makefile

Lines changed: 1 addition & 0 deletions
```diff
@@ -14,6 +14,7 @@ _run:
 		-f tools/make/milvus.mk \
 		-f tools/make/models.mk \
 		-f tools/make/pre-commit.mk \
+		-f tools/make/docker.mk \
 		-f tools/make/kube.mk \
 		$(MAKECMDGOALS)
 
```
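With `tools/make/docker.mk` in the include chain, any target it defines is dispatched through the same `_run` rule as the existing makefiles; a sketch (the target name below is hypothetical, check `tools/make/docker.mk` for the real ones):

```bash
# Hypothetical target name; the point is that docker.mk targets now flow
# through the top-level Makefile's _run dispatch like every other *.mk file.
make docker-build-llm-katan
```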

docker-compose.yml

Lines changed: 15 additions & 0 deletions
```diff
@@ -93,6 +93,21 @@ services:
     networks:
       - semantic-network
 
+  # LLM Katan service for testing
+  llm-katan:
+    build:
+      context: ./e2e-tests/llm-katan
+      dockerfile: Dockerfile
+    container_name: llm-katan
+    profiles: ["testing", "llm-katan"]
+    ports:
+      - "8002:8000"
+    environment:
+      - HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN:-}
+    networks:
+      - semantic-network
+    command: ["llm-katan", "--model", "Qwen/Qwen3-0.6B", "--host", "0.0.0.0", "--port", "8000"]
+
 networks:
   semantic-network:
     driver: bridge
```
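Because the service is gated behind Compose profiles, it stays down for a plain `docker compose up`; a minimal usage sketch:

```bash
# Start only llm-katan via its profile (host port 8002 maps to container port 8000).
HUGGINGFACE_HUB_TOKEN=<your-token> docker compose --profile llm-katan up llm-katan

# Or bring it up together with everything under the "testing" profile.
docker compose --profile testing up
```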

e2e-tests/llm-katan/Dockerfile

Lines changed: 42 additions & 0 deletions
```diff
@@ -0,0 +1,42 @@
+# LLM Katan Dockerfile
+# Lightweight LLM Server for Testing
+FROM python:3.11-slim
+
+LABEL maintainer="vLLM Semantic Router Team"
+LABEL description="LLM Katan - Lightweight LLM Server for Testing"
+LABEL version="0.1.8"
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first for better layer caching
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the llm_katan package
+COPY llm_katan/ ./llm_katan/
+COPY pyproject.toml ./
+COPY README.md ./
+
+# Install the package in development mode
+RUN pip install -e .
+
+# Create a non-root user for security
+RUN useradd --create-home --shell /bin/bash llmkatan
+USER llmkatan
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Expose the default port
+EXPOSE 8000
+
+# Default command - can be overridden
+CMD ["llm-katan", "--model", "Qwen/Qwen3-0.6B", "--host", "0.0.0.0", "--port", "8000"]
```

e2e-tests/llm-katan/README.md

Lines changed: 14 additions & 0 deletions
````diff
@@ -20,10 +20,24 @@ designed for testing and development with real tiny models.
 
 ### Installation
 
+#### Option 1: PyPI
+
 ```bash
 pip install llm-katan
 ```
 
+#### Option 2: Docker
+
+```bash
+# Pull and run the latest Docker image
+docker pull ghcr.io/vllm-project/semantic-router/llm-katan:latest
+docker run -p 8000:8000 ghcr.io/vllm-project/semantic-router/llm-katan:latest
+
+# Or with custom model
+docker run -p 8000:8000 ghcr.io/vllm-project/semantic-router/llm-katan:latest \
+  llm-katan --served-model-name "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+```
+
 ### Setup
 
 #### HuggingFace Token (Required)
````

src/semantic-router/pkg/extproc/metrics_integration_test.go

Lines changed: 38 additions & 0 deletions
```diff
@@ -125,4 +125,42 @@ var _ = Describe("Metrics recording", func() {
 		Expect(afterPrompt).To(BeNumerically(">", beforePrompt))
 		Expect(afterCompletion).To(BeNumerically(">", beforeCompletion))
 	})
+
+	It("records TTFT on first streamed body chunk for SSE responses", func() {
+		ctx := &RequestContext{
+			RequestModel:        "model-stream",
+			ProcessingStartTime: time.Now().Add(-120 * time.Millisecond),
+			Headers:             map[string]string{"accept": "text/event-stream"},
+		}
+
+		// Simulate header phase: SSE content-type indicates streaming
+		respHeaders := &ext_proc.ProcessingRequest_ResponseHeaders{
+			ResponseHeaders: &ext_proc.HttpHeaders{
+				Headers: &core.HeaderMap{Headers: []*core.HeaderValue{{Key: "content-type", Value: "text/event-stream"}}},
+			},
+		}
+
+		before := getHistogramSampleCount("llm_model_ttft_seconds", ctx.RequestModel)
+
+		// Handle response headers (should NOT record TTFT for streaming)
+		response1, err := router.handleResponseHeaders(respHeaders, ctx)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(response1.GetResponseHeaders()).NotTo(BeNil())
+		Expect(ctx.IsStreamingResponse).To(BeTrue())
+		Expect(ctx.TTFTRecorded).To(BeFalse())
+
+		// Now simulate the first streamed body chunk
+		respBody := &ext_proc.ProcessingRequest_ResponseBody{
+			ResponseBody: &ext_proc.HttpBody{Body: []byte("data: chunk-1\n")},
+		}
+
+		response2, err := router.handleResponseBody(respBody, ctx)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(response2.GetResponseBody()).NotTo(BeNil())
+
+		after := getHistogramSampleCount("llm_model_ttft_seconds", ctx.RequestModel)
+		Expect(after).To(BeNumerically(">", before))
+		Expect(ctx.TTFTRecorded).To(BeTrue())
+		Expect(ctx.TTFTSeconds).To(BeNumerically(">", 0))
+	})
 })
```
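The new spec can be run on its own with Ginkgo's focus filter; a sketch, assuming the Go module root is `src/semantic-router` and the package uses a standard Ginkgo v2 suite:

```bash
# Run only the new TTFT spec (adjust paths if the module layout differs).
cd src/semantic-router
go test ./pkg/extproc/ -ginkgo.focus="records TTFT on first streamed body chunk"
```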

src/semantic-router/pkg/extproc/request_handler.go

Lines changed: 16 additions & 1 deletion
```diff
@@ -164,6 +164,10 @@ type RequestContext struct {
 	StartTime           time.Time
 	ProcessingStartTime time.Time
 
+	// Streaming detection
+	ExpectStreamingResponse bool // set from request Accept header
+	IsStreamingResponse     bool // set from response Content-Type
+
 	// TTFT tracking
 	TTFTRecorded bool
 	TTFTSeconds  float64
@@ -192,7 +196,14 @@ func (r *OpenAIRouter) handleRequestHeaders(v *ext_proc.ProcessingRequest_Reques
 		}
 	}
 
-	// Allow the request to continue
+	// Detect if the client expects a streaming response (SSE)
+	if accept, ok := ctx.Headers["accept"]; ok {
+		if strings.Contains(strings.ToLower(accept), "text/event-stream") {
+			ctx.ExpectStreamingResponse = true
+		}
+	}
+
+	// Prepare base response
 	response := &ext_proc.ProcessingResponse{
 		Response: &ext_proc.ProcessingResponse_RequestHeaders{
 			RequestHeaders: &ext_proc.HeadersResponse{
@@ -204,6 +215,10 @@ @@
 		},
 	}
 
+	// If streaming is expected, we rely on Envoy config to set response_body_mode: STREAMED for SSE.
+	// Some Envoy/control-plane versions may not support per-message ModeOverride; avoid compile-time coupling here.
+	// The Accept header is still recorded on context for downstream logic.
+
 	return response, nil
 }
 
```