
Adds vLLM as Option for Local App #693


Merged: 13 commits merged on Sep 4, 2024
77 changes: 75 additions & 2 deletions packages/tasks/src/local-apps.ts
@@ -58,11 +58,30 @@ export type LocalApp = {
}
);

function isGgufModel(model: ModelData): boolean {
return model.tags.includes("gguf");
}

function isAwqModel(model: ModelData): boolean {
return model.config?.quantization_config?.quant_method === "awq";
}

function isGptqModel(model: ModelData): boolean {
return model.config?.quantization_config?.quant_method === "gptq";
}

function isAqlmModel(model: ModelData): boolean {
return model.config?.quantization_config?.quant_method === "aqlm";
}

function isMarlinModel(model: ModelData): boolean {
return model.config?.quantization_config?.quant_method === "marlin";
}

function isTransformersModel(model: ModelData): boolean {
return model.tags.includes("transformers");
}

function isLlamaCppGgufModel(model: ModelData) {
return !!model.gguf?.context_length;
}
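
For reference, these new predicates only inspect `config.quantization_config.quant_method` and the model's `tags`. A minimal sketch of the kind of model metadata they would match (the id and field values below are purely illustrative, not taken from this PR):

```ts
// Hypothetical model metadata (illustrative values only).
const exampleModel = {
	id: "someorg/some-model-AWQ",
	tags: ["transformers", "safetensors", "awq"],
	config: {
		quantization_config: { quant_method: "awq" },
	},
};

// Evaluated against the predicates above (conceptually):
//   isAwqModel(exampleModel)          -> true  (quant_method === "awq")
//   isGptqModel(exampleModel)         -> false
//   isTransformersModel(exampleModel) -> true  ("transformers" is in tags)
```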
@@ -127,6 +146,47 @@ const snippetLocalAI = (model: ModelData, filepath?: string): LocalAppSnippet[]
];
};

const snippetVllm = (model: ModelData): LocalAppSnippet[] => {
const runCommand = [
"",
"# Call the server using curl:",
`curl -X POST "http://localhost:8000/v1/chat/completions" \\`,
` -H "Content-Type: application/json" \\`,
` --data '{`,
` "model": "${model.id}",`,
` "messages": [`,
` {"role": "user", "content": "Hello!"}`,
` ]`,
` }'`,
];
return [
{
title: "Install from pip",
setup: ["# Install vLLM from pip:", "pip install vllm"].join("\n"),
content: ["# Load and run the model:", `vllm serve "${model.id}"`, ...runCommand].join("\n"),
},
{
title: "Use Docker images",
setup: [
"# Deploy with docker on Linux:",
`docker run --runtime nvidia --gpus all \\`,
` --name my_vllm_container \\`,
` -v ~/.cache/huggingface:/root/.cache/huggingface \\`,
` --env "HUGGING_FACE_HUB_TOKEN=<secret>" \\`,
` -p 8000:8000 \\`,
` --ipc=host \\`,
` vllm/vllm-openai:latest \\`,
` --model ${model.id}`,
].join("\n"),
content: [
"# Load and run the model:",
`docker exec -it my_vllm_container bash -c "vllm serve ${model.id}"`,
...runCommand,
].join("\n"),
},
];
};
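
For a hypothetical model id such as `someorg/some-model`, the "Install from pip" snippet above would render to roughly the following commands (model id and whitespace are illustrative):

```sh
# Install vLLM from pip:
pip install vllm

# Load and run the model:
vllm serve "someorg/some-model"

# Call the server using curl:
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "someorg/some-model",
    "messages": [
      {"role": "user", "content": "Hello!"}
    ]
  }'
```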

/**
* Add your new local app here.
*
@@ -146,6 +206,19 @@ export const LOCAL_APPS = {
displayOnModelPage: isLlamaCppGgufModel,
snippet: snippetLlamacpp,
},
vllm: {
prettyLabel: "vLLM",
docsUrl: "https://docs.vllm.ai",
mainTask: "text-generation",
displayOnModelPage: (model: ModelData) =>
isAwqModel(model) ||
isGptqModel(model) ||
isAqlmModel(model) ||
isMarlinModel(model) ||
isGgufModel(model) ||
isTransformersModel(model),
snippet: snippetVllm,
},
lmstudio: {
prettyLabel: "LM Studio",
docsUrl: "https://lmstudio.ai",
4 changes: 4 additions & 0 deletions packages/tasks/src/model-data.ts
@@ -38,6 +38,10 @@ export interface ModelData {
bits?: number;
load_in_4bit?: boolean;
load_in_8bit?: boolean;
/**
* awq, gptq, aqlm, marlin, … Used by vLLM
*/
quant_method?: string;
};
tokenizer_config?: TokenizerConfig;
adapter_transformers?: {
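
For context, a minimal sketch (with hypothetical values) of the parsed model config shape this new field extends, matching what the vLLM predicates in `local-apps.ts` check:

```ts
// Illustrative only: a parsed config for a GPTQ-quantized model.
// The vLLM entry in local-apps.ts reads config.quantization_config.quant_method.
const exampleConfig = {
	quantization_config: {
		quant_method: "gptq",
		bits: 4,
	},
};
```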