Commit b15ff77

EliMCosta authored

Adds vLLM as Option for Local App (#693)

Adds vLLM as an option for "Local apps" on Hugging Face.

Co-authored-by: Julien Chaumond <[email protected]>
Co-authored-by: Bertrand Chevrier <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
Co-authored-by: Pedro Cuenca <[email protected]>

1 parent be261ff · commit b15ff77

2 files changed · +79 −2 lines changed

packages/tasks/src/local-apps.ts

Lines changed: 75 additions & 2 deletions
```diff
@@ -58,11 +58,30 @@ export type LocalApp = {
 	}
 );
 
-// eslint-disable-next-line @typescript-eslint/no-unused-vars
-function isGgufModel(model: ModelData) {
+function isGgufModel(model: ModelData): boolean {
 	return model.tags.includes("gguf");
 }
 
+function isAwqModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "awq";
+}
+
+function isGptqModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "gptq";
+}
+
+function isAqlmModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "aqlm";
+}
+
+function isMarlinModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "marlin";
+}
+
+function isTransformersModel(model: ModelData): boolean {
+	return model.tags.includes("transformers");
+}
+
 function isLlamaCppGgufModel(model: ModelData) {
 	return !!model.gguf?.context_length;
 }
```
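For intuition (this sketch is not part of the commit), here is how the new predicates would evaluate against a mock `ModelData` entry; the model id, tags, and config values below are invented for illustration:

```ts
// Hypothetical ModelData entry for an AWQ-quantized model (values made up).
// Cast via `unknown` because the mock omits other ModelData fields.
const mockModel = {
	id: "some-org/some-model-AWQ",
	tags: ["transformers"],
	config: {
		quantization_config: { quant_method: "awq" },
	},
} as unknown as ModelData;

isAwqModel(mockModel); // true  — quant_method === "awq"
isGptqModel(mockModel); // false — quant_method !== "gptq"
isGgufModel(mockModel); // false — no "gguf" tag
isTransformersModel(mockModel); // true  — "transformers" tag present
```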
```diff
@@ -127,6 +146,47 @@ const snippetLocalAI = (model: ModelData, filepath?: string): LocalAppSnippet[]
 	];
 };
 
+const snippetVllm = (model: ModelData): LocalAppSnippet[] => {
+	const runCommand = [
+		"",
+		"# Call the server using curl:",
+		`curl -X POST "http://localhost:8000/v1/chat/completions" \\`,
+		`	-H "Content-Type: application/json" \\`,
+		`	--data '{`,
+		`		"model": "${model.id}",`,
+		`		"messages": [`,
+		`			{"role": "user", "content": "Hello!"}`,
+		`		]`,
+		`	}'`,
+	];
+	return [
+		{
+			title: "Install from pip",
+			setup: ["# Install vLLM from pip:", "pip install vllm"].join("\n"),
+			content: ["# Load and run the model:", `vllm serve "${model.id}"`, ...runCommand].join("\n"),
+		},
+		{
+			title: "Use Docker images",
+			setup: [
+				"# Deploy with docker on Linux:",
+				`docker run --runtime nvidia --gpus all \\`,
+				`	--name my_vllm_container \\`,
+				`	-v ~/.cache/huggingface:/root/.cache/huggingface \\`,
+				`	--env "HUGGING_FACE_HUB_TOKEN=<secret>" \\`,
+				`	-p 8000:8000 \\`,
+				`	--ipc=host \\`,
+				`	vllm/vllm-openai:latest \\`,
+				`	--model ${model.id}`,
+			].join("\n"),
+			content: [
+				"# Load and run the model:",
+				`docker exec -it my_vllm_container bash -c "vllm serve ${model.id}"`,
+				...runCommand,
+			].join("\n"),
+		},
+	];
+};
+
 /**
  * Add your new local app here.
  *
```
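As a sanity check (not part of the commit), this is roughly what the "Install from pip" snippet renders to for a made-up model id; the expected shell output is shown in comments:

```ts
// Hypothetical usage: render the pip snippet for an invented model id.
const snippets = snippetVllm({ id: "my-org/my-model" } as unknown as ModelData);

console.log(snippets[0].content);
// # Load and run the model:
// vllm serve "my-org/my-model"
//
// # Call the server using curl:
// curl -X POST "http://localhost:8000/v1/chat/completions" \
// 	-H "Content-Type: application/json" \
// 	--data '{
// 		"model": "my-org/my-model",
// 		"messages": [
// 			{"role": "user", "content": "Hello!"}
// 		]
// 	}'
```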
```diff
@@ -146,6 +206,19 @@ export const LOCAL_APPS = {
 		displayOnModelPage: isLlamaCppGgufModel,
 		snippet: snippetLlamacpp,
 	},
+	vllm: {
+		prettyLabel: "vLLM",
+		docsUrl: "https://docs.vllm.ai",
+		mainTask: "text-generation",
+		displayOnModelPage: (model: ModelData) =>
+			isAwqModel(model) ||
+			isGptqModel(model) ||
+			isAqlmModel(model) ||
+			isMarlinModel(model) ||
+			isGgufModel(model) ||
+			isTransformersModel(model),
+		snippet: snippetVllm,
+	},
 	lmstudio: {
 		prettyLabel: "LM Studio",
 		docsUrl: "https://lmstudio.ai",
```
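To illustrate how `displayOnModelPage` is consumed (a hypothetical helper, not in the commit; it assumes each registry entry may carry such a predicate):

```ts
// Hypothetical: list the keys of local apps whose display predicate
// accepts a given model, using the LOCAL_APPS registry above.
function localAppsForModel(model: ModelData): string[] {
	return Object.entries(LOCAL_APPS)
		.filter(([, app]) => app.displayOnModelPage?.(model) ?? false)
		.map(([key]) => key);
}
```

With the new entry, any model carrying a supported `quant_method`, a `gguf` tag, or a `transformers` tag now surfaces vLLM on its model page.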

packages/tasks/src/model-data.ts

Lines changed: 4 additions & 0 deletions
```diff
@@ -38,6 +38,10 @@ export interface ModelData {
 		bits?: number;
 		load_in_4bit?: boolean;
 		load_in_8bit?: boolean;
+		/**
+		 * awq, gptq, aqlm, marlin, … Used by vLLM
+		 */
+		quant_method?: string;
 	};
 	tokenizer_config?: TokenizerConfig;
 	adapter_transformers?: {
```
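For context (an illustrative sketch, not from the commit): a model whose `config.json` declares a quantization method would surface `quant_method` on `ModelData` like this, which is what the predicates in local-apps.ts read:

```ts
// Illustrative only: a ModelData fragment for an AWQ-quantized model.
// The id and values are made up; the field shapes follow the interface above.
const example: Partial<ModelData> = {
	id: "some-org/some-awq-model",
	config: {
		quantization_config: {
			bits: 4,
			quant_method: "awq", // matched by isAwqModel() in local-apps.ts
		},
	},
};
```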
