diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature
index d02d3ace680ec..13c743d1bc641 100644
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -9,7 +9,7 @@ Feature: llama.cpp server
     And 42 as server seed
     And 2 slots
     And 1024 as batch size
-    And 1024 KV cache size
+    And 2048 KV cache size
     And embeddings extraction
     Then the server is starting
     Then the server is healthy
@@ -87,9 +87,8 @@ Feature: llama.cpp server
     Then the server is idle
     Then all embeddings are generated

-  @wip
   Scenario: All embeddings should be the same
-    Given 20 fixed prompts
+    Given 10 fixed prompts
     And a model bert-bge-small
     Given concurrent OAI embedding requests
     Then the server is busy
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index afb63b489f652..58749e3b30ed6 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -292,9 +292,10 @@ def step_impl(context, n_ga_w):
 def step_prompt_passkey(context):
     context.prompt_passkey = context.text

+
 @step(u'{n_prompts:d} fixed prompts')
 def step_fixed_prompts(context, n_prompts):
-    context.prompts.extend([str(0)*1024 for i in range(n_prompts)])
+    context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)])
     context.n_prompts = n_prompts


@@ -818,7 +819,8 @@ async def request_oai_embeddings(input,
                                     "input": input,
                                     "model": model,
                                 },
-                                headers=headers) as response:
+                                headers=headers,
+                                timeout=3600) as response:
         assert response.status == 200, f"received status code not expected: {response.status}"
         assert response.headers['Access-Control-Allow-Origin'] == origin
         assert response.headers['Content-Type'] == "application/json; charset=utf-8"