Adds vLLM as Option for Local App #693
Changes from 6 commits
```diff
@@ -42,8 +42,29 @@ export type LocalApp = {
 	}
 );
 
-function isGgufModel(model: ModelData) {
-	return model.tags.includes("gguf");
+function isGgufModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "gguf";
+}
+
+function isAwqModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "awq";
+}
+
+function isGptqModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "gptq";
+}
+
+function isAqlmModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "aqlm";
+}
+
+function isMarlinModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "marlin";
+}
+
+function isFullModel(model: ModelData): boolean {
+	// Assuming a full model is identified by not having a quant_method
+	return !model.config?.quantization_config?.quant_method;
 }
 
 const snippetLlamacpp = (model: ModelData): string[] => {
```
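The helpers above key off the `quant_method` field under `config.quantization_config` instead of repo tags. A minimal sketch of how they classify a model, using a stripped-down stand-in for `ModelData` (only the fields the helpers actually read; the model ids are examples, not part of this PR):

```ts
// Stand-in type for illustration only; the real ModelData has many more fields.
type ModelDataSketch = {
	id: string;
	config?: { quantization_config?: { quant_method?: string } };
};

const awqModel: ModelDataSketch = {
	id: "some-org/some-model-AWQ", // hypothetical id
	config: { quantization_config: { quant_method: "awq" } },
};

const fullModel: ModelDataSketch = {
	id: "some-org/some-full-precision-model", // hypothetical id
	config: {},
};

// With the helpers above:
//   isAwqModel(awqModel)   -> true   (quant_method === "awq")
//   isFullModel(awqModel)  -> false  (a quant_method is present)
//   isFullModel(fullModel) -> true   (no quantization_config, so no quant_method)
```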
```diff
@@ -63,6 +84,38 @@ LLAMA_CURL=1 make
 	];
 };
 
+const snippetVllm = (model: ModelData): string[] => {
+	return [
+		`
+## Deploy with Docker on Linux (Docker must be installed); for a gated model, first request access in the Hugging Face model repo:
+docker run --runtime nvidia --gpus all \\
+	--name my_vllm_container \\
+	-v ~/.cache/huggingface:/root/.cache/huggingface \\
+	--env "HUGGING_FACE_HUB_TOKEN=<secret>" \\
+	-p 8000:8000 \\
+	--ipc=host \\
+	vllm/vllm-openai:latest \\
+	--model ${model.id}
+`,
+		`
+## Load and run the model
+docker exec -it my_vllm_container bash -c "python -m vllm.entrypoints.openai.api_server --model ${model.id} --dtype auto --api-key token-abc123"
+`,
+		`
+## Call the server using curl
+curl -X POST "http://localhost:8000/v1/chat/completions" \\
+	-H "Content-Type: application/json" \\
+	-H "Authorization: Bearer token-abc123" \\
+	--data '{
+		"model": "'${model.id}'",
+		"messages": [
+			{"role": "user", "content": "Hello!"}
+		]
+	}'
+`,
+	];
+};
+
 /**
  * Add your new local app here.
  *
```
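`snippetVllm` returns three blocks that the model page can surface as copy-paste commands: a `docker run` that starts the vLLM OpenAI-compatible container, a `docker exec` that launches the API server inside it, and a `curl` call against `/v1/chat/completions`. A rough usage sketch; `snippetVllm` is module-private in this diff and the model id below is a placeholder, so this only illustrates the shape of the result:

```ts
// Illustration only: snippetVllm is not exported here, and the id is a placeholder.
const demoModel = { id: "some-org/some-model" } as ModelData;

const [dockerRun, dockerExec, curlCall] = snippetVllm(demoModel);
// dockerRun  -> "docker run --runtime nvidia --gpus all ... vllm/vllm-openai:latest --model some-org/some-model"
// dockerExec -> "docker exec -it my_vllm_container ... --model some-org/some-model --dtype auto --api-key token-abc123"
// curlCall   -> "curl -X POST http://localhost:8000/v1/chat/completions ..."
```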
```diff
@@ -82,6 +135,13 @@ export const LOCAL_APPS = {
 		displayOnModelPage: isGgufModel,
 		snippet: snippetLlamacpp,
 	},
+	vllm: {
+		prettyLabel: "vLLM",
+		docsUrl: "https://docs.vllm.ai",
+		mainTask: "text-generation",
+		displayOnModelPage: (model: ModelData) => isAwqModel(model) || isGptqModel(model) || isAqlmModel(model) || isMarlinModel(model) || isFullModel(model),
+		snippet: snippetVllm,
+	},
 	lmstudio: {
 		prettyLabel: "LM Studio",
 		docsUrl: "https://lmstudio.ai",
```
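With this entry registered, the vLLM option shows up for AWQ, GPTQ, AQLM, and Marlin checkpoints as well as for unquantized ("full") models, while GGUF checkpoints keep showing llama.cpp. A small sketch of the combined predicate with hypothetical inputs:

```ts
// The predicate registered as displayOnModelPage for the vllm entry, pulled out
// for illustration; the inputs in the comments are hypothetical.
const showsVllm = (model: ModelData): boolean =>
	isAwqModel(model) || isGptqModel(model) || isAqlmModel(model) || isMarlinModel(model) || isFullModel(model);

// { config: { quantization_config: { quant_method: "gptq" } } } -> true   (vLLM shown)
// { config: {} }                                                -> true   (treated as a full model)
// { config: { quantization_config: { quant_method: "gguf" } } } -> false  (llama.cpp case, not vLLM)
```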