Commit efe2e77

Merge branch 'main' into bench-dataset
2 parents 8440831 + b79a78d

File tree: 12 files changed, +413 −10 lines

.github/workflows/docker-publish.yml

Lines changed: 45 additions & 3 deletions
```diff
@@ -1,4 +1,4 @@
-name: Create and publish Docker image for extproc
+name: Create and publish Docker images
 
 on:
   workflow_dispatch:
@@ -18,7 +18,7 @@ on:
     branches: [ "main" ]
 
 jobs:
-  build_and_push:
+  build_and_push_extproc:
    runs-on: ubuntu-latest
    permissions:
      contents: read
@@ -43,7 +43,7 @@ jobs:
      - name: Set lowercase repository owner
        run: echo "REPOSITORY_OWNER_LOWER=$(echo $GITHUB_REPOSITORY_OWNER | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV
 
-      - name: Build and push Docker image
+      - name: Build and push extproc Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
@@ -52,3 +52,45 @@ jobs:
          tags: |
            ${{ inputs.is_nightly == true && format('ghcr.io/{0}/semantic-router/extproc:nightly-{1}', env.REPOSITORY_OWNER_LOWER, steps.date.outputs.date_tag) || format('ghcr.io/{0}/semantic-router/extproc:{1}', env.REPOSITORY_OWNER_LOWER, github.sha) }}
            ${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/extproc:latest', env.REPOSITORY_OWNER_LOWER) || '' }}
+
+  build_and_push_llm_katan:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Generate date tag for nightly builds
+        id: date
+        if: inputs.is_nightly == true
+        run: echo "date_tag=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
+
+      - name: Set lowercase repository owner
+        run: echo "REPOSITORY_OWNER_LOWER=$(echo $GITHUB_REPOSITORY_OWNER | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV
+
+      - name: Extract version from pyproject.toml
+        id: version
+        run: |
+          VERSION=$(grep '^version = ' e2e-tests/llm-katan/pyproject.toml | sed 's/version = "\(.*\)"/\1/')
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+
+      - name: Build and push llm-katan Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./e2e-tests/llm-katan
+          file: ./e2e-tests/llm-katan/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }} # Only push on merge to main, not on PRs
+          tags: |
+            ${{ inputs.is_nightly == true && format('ghcr.io/{0}/semantic-router/llm-katan:nightly-{1}', env.REPOSITORY_OWNER_LOWER, steps.date.outputs.date_tag) || format('ghcr.io/{0}/semantic-router/llm-katan:{1}', env.REPOSITORY_OWNER_LOWER, github.sha) }}
+            ${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/llm-katan:latest', env.REPOSITORY_OWNER_LOWER) || '' }}
+            ${{ inputs.is_nightly != true && format('ghcr.io/{0}/semantic-router/llm-katan:v{1}', env.REPOSITORY_OWNER_LOWER, steps.version.outputs.version) || '' }}
```
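The version-extraction step above is plain shell and can be checked locally before relying on it in CI; a minimal sketch, assuming `e2e-tests/llm-katan/pyproject.toml` carries a `version = "..."` line as the workflow expects (the owner name below is illustrative):

```bash
# Dry run of the "Extract version from pyproject.toml" step.
VERSION=$(grep '^version = ' e2e-tests/llm-katan/pyproject.toml | sed 's/version = "\(.*\)"/\1/')
echo "would tag: ghcr.io/<owner>/semantic-router/llm-katan:v${VERSION}"
```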

.github/workflows/docker-release.yml

Lines changed: 44 additions & 3 deletions
```diff
@@ -1,12 +1,12 @@
-name: Create and publish Docker release image
+name: Create and publish Docker release images
 
 on:
   push:
     tags:
      - 'v*' # Triggers on version tags like v1.0.0, v2.1.3, etc.
 
 jobs:
-  build_and_push:
+  build_and_push_extproc:
    runs-on: ubuntu-latest
    permissions:
      contents: read
@@ -30,7 +30,7 @@ jobs:
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Build and push Docker image
+      - name: Build and push extproc Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
@@ -39,3 +39,44 @@ jobs:
          tags: |
            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/extproc:${{ steps.extract_tag.outputs.tag }}
            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/extproc:latest
+
+  build_and_push_llm_katan:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v4
+
+      - name: Extract tag name
+        id: extract_tag
+        run: echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
+
+      - name: Set lowercase repository owner
+        run: echo "REPOSITORY_OWNER_LOWER=$(echo $GITHUB_REPOSITORY_OWNER | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract version from pyproject.toml
+        id: version
+        run: |
+          VERSION=$(grep '^version = ' e2e-tests/llm-katan/pyproject.toml | sed 's/version = "\(.*\)"/\1/')
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+
+      - name: Build and push llm-katan Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./e2e-tests/llm-katan
+          file: ./e2e-tests/llm-katan/Dockerfile
+          push: true
+          tags: |
+            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/llm-katan:${{ steps.extract_tag.outputs.tag }}
+            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/llm-katan:v${{ steps.version.outputs.version }}
+            ghcr.io/${{ env.REPOSITORY_OWNER_LOWER }}/semantic-router/llm-katan:latest
```
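This workflow fires on `v*` tags only; a minimal sketch of cutting a release that would trigger it (the tag name and owner are illustrative):

```bash
# Pushing a version tag triggers docker-release.yml for both images.
git tag v0.1.8
git push origin v0.1.8
# Expected llm-katan tags afterwards:
#   ghcr.io/<owner>/semantic-router/llm-katan:v0.1.8
#   ghcr.io/<owner>/semantic-router/llm-katan:latest
```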

Makefile

Lines changed: 1 addition & 0 deletions
```diff
@@ -14,6 +14,7 @@ _run:
 		-f tools/make/milvus.mk \
 		-f tools/make/models.mk \
 		-f tools/make/pre-commit.mk \
+		-f tools/make/docker.mk \
 		-f tools/make/kube.mk \
 		$(MAKECMDGOALS)
 
```
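With `tools/make/docker.mk` in the include chain, any target it defines is dispatched through the same `_run` rule as the existing makefiles; a sketch (the target name below is hypothetical, check `tools/make/docker.mk` for the real ones):

```bash
# Hypothetical target name; the point is that docker.mk targets now flow
# through the top-level Makefile's _run dispatch like every other *.mk file.
make docker-build-llm-katan
```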

docker-compose.yml

Lines changed: 15 additions & 0 deletions
```diff
@@ -93,6 +93,21 @@ services:
     networks:
       - semantic-network
 
+  # LLM Katan service for testing
+  llm-katan:
+    build:
+      context: ./e2e-tests/llm-katan
+      dockerfile: Dockerfile
+    container_name: llm-katan
+    profiles: ["testing", "llm-katan"]
+    ports:
+      - "8002:8000"
+    environment:
+      - HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN:-}
+    networks:
+      - semantic-network
+    command: ["llm-katan", "--model", "Qwen/Qwen3-0.6B", "--host", "0.0.0.0", "--port", "8000"]
+
 networks:
   semantic-network:
     driver: bridge
```
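Because the service is gated behind Compose profiles, it stays down for a plain `docker compose up`; a minimal usage sketch:

```bash
# Start only llm-katan via its profile (host port 8002 maps to container port 8000).
HUGGINGFACE_HUB_TOKEN=<your-token> docker compose --profile llm-katan up llm-katan

# Or bring it up together with everything under the "testing" profile.
docker compose --profile testing up
```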

e2e-tests/llm-katan/Dockerfile

Lines changed: 42 additions & 0 deletions
```diff
@@ -0,0 +1,42 @@
+# LLM Katan Dockerfile
+# Lightweight LLM Server for Testing
+FROM python:3.11-slim
+
+LABEL maintainer="vLLM Semantic Router Team"
+LABEL description="LLM Katan - Lightweight LLM Server for Testing"
+LABEL version="0.1.8"
+
+# Set working directory
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first for better layer caching
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the llm_katan package
+COPY llm_katan/ ./llm_katan/
+COPY pyproject.toml ./
+COPY README.md ./
+
+# Install the package in development mode
+RUN pip install -e .
+
+# Create a non-root user for security
+RUN useradd --create-home --shell /bin/bash llmkatan
+USER llmkatan
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONDONTWRITEBYTECODE=1
+
+# Expose the default port
+EXPOSE 8000
+
+# Default command - can be overridden
+CMD ["llm-katan", "--model", "Qwen/Qwen3-0.6B", "--host", "0.0.0.0", "--port", "8000"]
```

e2e-tests/llm-katan/README.md

Lines changed: 14 additions & 0 deletions
````diff
@@ -20,10 +20,24 @@ designed for testing and development with real tiny models.
 
 ### Installation
 
+#### Option 1: PyPI
+
 ```bash
 pip install llm-katan
 ```
 
+#### Option 2: Docker
+
+```bash
+# Pull and run the latest Docker image
+docker pull ghcr.io/vllm-project/semantic-router/llm-katan:latest
+docker run -p 8000:8000 ghcr.io/vllm-project/semantic-router/llm-katan:latest
+
+# Or with custom model
+docker run -p 8000:8000 ghcr.io/vllm-project/semantic-router/llm-katan:latest \
+  llm-katan --served-model-name "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+```
+
 ### Setup
 
 #### HuggingFace Token (Required)
````

src/semantic-router/pkg/extproc/metrics_integration_test.go

Lines changed: 38 additions & 0 deletions
```diff
@@ -125,4 +125,42 @@ var _ = Describe("Metrics recording", func() {
 		Expect(afterPrompt).To(BeNumerically(">", beforePrompt))
 		Expect(afterCompletion).To(BeNumerically(">", beforeCompletion))
 	})
+
+	It("records TTFT on first streamed body chunk for SSE responses", func() {
+		ctx := &RequestContext{
+			RequestModel:        "model-stream",
+			ProcessingStartTime: time.Now().Add(-120 * time.Millisecond),
+			Headers:             map[string]string{"accept": "text/event-stream"},
+		}
+
+		// Simulate header phase: SSE content-type indicates streaming
+		respHeaders := &ext_proc.ProcessingRequest_ResponseHeaders{
+			ResponseHeaders: &ext_proc.HttpHeaders{
+				Headers: &core.HeaderMap{Headers: []*core.HeaderValue{{Key: "content-type", Value: "text/event-stream"}}},
+			},
+		}
+
+		before := getHistogramSampleCount("llm_model_ttft_seconds", ctx.RequestModel)
+
+		// Handle response headers (should NOT record TTFT for streaming)
+		response1, err := router.handleResponseHeaders(respHeaders, ctx)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(response1.GetResponseHeaders()).NotTo(BeNil())
+		Expect(ctx.IsStreamingResponse).To(BeTrue())
+		Expect(ctx.TTFTRecorded).To(BeFalse())
+
+		// Now simulate the first streamed body chunk
+		respBody := &ext_proc.ProcessingRequest_ResponseBody{
+			ResponseBody: &ext_proc.HttpBody{Body: []byte("data: chunk-1\n")},
+		}
+
+		response2, err := router.handleResponseBody(respBody, ctx)
+		Expect(err).NotTo(HaveOccurred())
+		Expect(response2.GetResponseBody()).NotTo(BeNil())
+
+		after := getHistogramSampleCount("llm_model_ttft_seconds", ctx.RequestModel)
+		Expect(after).To(BeNumerically(">", before))
+		Expect(ctx.TTFTRecorded).To(BeTrue())
+		Expect(ctx.TTFTSeconds).To(BeNumerically(">", 0))
+	})
 })
```
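The new spec can be run on its own with Ginkgo's focus filter; a sketch, assuming the Go module root is `src/semantic-router` and the package uses a standard Ginkgo v2 suite:

```bash
# Run only the new TTFT spec (adjust paths if the module layout differs).
cd src/semantic-router
go test ./pkg/extproc/ -ginkgo.focus="records TTFT on first streamed body chunk"
```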

src/semantic-router/pkg/extproc/request_handler.go

Lines changed: 16 additions & 1 deletion
```diff
@@ -164,6 +164,10 @@ type RequestContext struct {
 	StartTime           time.Time
 	ProcessingStartTime time.Time
 
+	// Streaming detection
+	ExpectStreamingResponse bool // set from request Accept header
+	IsStreamingResponse     bool // set from response Content-Type
+
 	// TTFT tracking
 	TTFTRecorded bool
 	TTFTSeconds  float64
@@ -192,7 +196,14 @@ func (r *OpenAIRouter) handleRequestHeaders(v *ext_proc.ProcessingRequest_Reques
 		}
 	}
 
-	// Allow the request to continue
+	// Detect if the client expects a streaming response (SSE)
+	if accept, ok := ctx.Headers["accept"]; ok {
+		if strings.Contains(strings.ToLower(accept), "text/event-stream") {
+			ctx.ExpectStreamingResponse = true
+		}
+	}
+
+	// Prepare base response
 	response := &ext_proc.ProcessingResponse{
 		Response: &ext_proc.ProcessingResponse_RequestHeaders{
 			RequestHeaders: &ext_proc.HeadersResponse{
@@ -204,6 +215,10 @@ @@
 		},
 	}
 
+	// If streaming is expected, we rely on Envoy config to set response_body_mode: STREAMED for SSE.
+	// Some Envoy/control-plane versions may not support per-message ModeOverride; avoid compile-time coupling here.
+	// The Accept header is still recorded on context for downstream logic.
+
 	return response, nil
 }
 
```