From 7dbd31dc2aa70fc90d8fe2306ce8bcb163c014a2 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Tue, 21 May 2024 18:17:44 -0300
Subject: [PATCH 01/10] Update local-apps.ts

Adds vLLM as an option for "Local apps" on Hugging Face
---
 packages/tasks/src/local-apps.ts | 42 ++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts
index 3ba761f683..f284605ab5 100644
--- a/packages/tasks/src/local-apps.ts
+++ b/packages/tasks/src/local-apps.ts
@@ -63,6 +63,41 @@ LLAMA_CURL=1 make
 	];
 };
 
+const snippetVllm = (model: ModelData): string[] => {
+	return [
+		`
+## Deploy with docker (needs Docker installed) a gated model (please, request access in Hugginface's model repo):
+docker run --runtime nvidia --gpus all \
+ --name my_vllm_container \
+ -v ~/.cache/huggingface:/root/.cache/huggingface \
+ --env "HUGGING_FACE_HUB_TOKEN=" \
+ -p 8000:8000 \
+ --ipc=host \
+ vllm/vllm-openai:latest \
+ --model mistralai/Mistral-7B-Instruct-v0.1
+`,
+`
+## Load and run the model
+docker exec -it my_vllm_container bash -c "python -m vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.1 --dtype auto --api-key token-abc123"
+`,
+`
+## Call the server using the official OpenAI Python client library, or any other HTTP client
+from openai import OpenAI
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="token-abc123",
+)
+completion = client.chat.completions.create(
+    model="mistralai/Mistral-7B-Instruct-v0.1",
+    messages=[
+        {"role": "user", "content": "Hello!"}
+    ]
+)
+print(completion.choices[0].message)
+`,
+	];
+};
+
 /**
  * Add your new local app here.
  *
@@ -82,6 +117,13 @@ export const LOCAL_APPS = {
 		displayOnModelPage: isGgufModel,
 		snippet: snippetLlamacpp,
 	},
+	"vllm": {
+		prettyLabel: "vLLM",
+		docsUrl: "https://docs.vllm.ai",
+		mainTask: "text-generation",
+		displayOnModelPage: isGptqModel && isAwqModel,
+		snippet: snippetVllm,
+	},
 	lmstudio: {
 		prettyLabel: "LM Studio",
 		docsUrl: "https://lmstudio.ai",

From 32560d826efc9f7eceb863f1a6266c36f1650d11 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Wed, 22 May 2024 11:14:59 -0300
Subject: [PATCH 02/10] Update packages/tasks/src/local-apps.ts

Co-authored-by: Julien Chaumond
---
 packages/tasks/src/local-apps.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts
index f284605ab5..efc80e8f1d 100644
--- a/packages/tasks/src/local-apps.ts
+++ b/packages/tasks/src/local-apps.ts
@@ -74,7 +74,7 @@ docker run --runtime nvidia --gpus all \
  -p 8000:8000 \
  --ipc=host \
  vllm/vllm-openai:latest \
- --model mistralai/Mistral-7B-Instruct-v0.1
+ --model ${model.id}
 `,
 `
 ## Load and run the model

From f97747a614954655dd6d32fbeb5f7a1b12671a02 Mon Sep 17 00:00:00 2001
From: Eli Costa <87460497+EliMCosta@users.noreply.github.com>
Date: Wed, 22 May 2024 11:15:12 -0300
Subject: [PATCH 03/10] Update packages/tasks/src/local-apps.ts

Co-authored-by: Julien Chaumond
---
 packages/tasks/src/local-apps.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts
index efc80e8f1d..a00293cbee 100644
--- a/packages/tasks/src/local-apps.ts
+++ b/packages/tasks/src/local-apps.ts
@@ -78,7 +78,7 @@ docker run --runtime nvidia --gpus all \
 `,
 `
 ## Load and run the model
-docker exec -it my_vllm_container bash -c "python -m 
vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.1 --dtype auto --api-key token-abc123" +docker exec -it my_vllm_container bash -c "python -m vllm.entrypoints.openai.api_server --model ${model.id} --dtype auto --api-key token-abc123" `, ` ## Call the server using the official OpenAI Python client library, or any other HTTP client From 82233de60369bd9876db9c41e11c0d372942ebb5 Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Wed, 22 May 2024 11:25:17 -0300 Subject: [PATCH 04/10] Update local-apps.ts fix dynamic model ids --- packages/tasks/src/local-apps.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts index a00293cbee..907d0d4fa5 100644 --- a/packages/tasks/src/local-apps.ts +++ b/packages/tasks/src/local-apps.ts @@ -88,7 +88,7 @@ client = OpenAI( api_key="token-abc123", ) completion = client.chat.completions.create( - model="mistralai/Mistral-7B-Instruct-v0.1", + model=${model.id}, messages=[ {"role": "user", "content": "Hello!"} ] From 2123430a737a92938d41d22b5fc9229b384a12bc Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 22 May 2024 19:40:55 +0200 Subject: [PATCH 05/10] Validation for `config.quantization_config.quant_method` --- packages/tasks/src/local-apps.ts | 6 +++--- packages/tasks/src/model-data.ts | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts index 907d0d4fa5..894b646008 100644 --- a/packages/tasks/src/local-apps.ts +++ b/packages/tasks/src/local-apps.ts @@ -76,11 +76,11 @@ docker run --runtime nvidia --gpus all \ vllm/vllm-openai:latest \ --model ${model.id} `, -` + ` ## Load and run the model docker exec -it my_vllm_container bash -c "python -m vllm.entrypoints.openai.api_server --model ${model.id} --dtype auto --api-key token-abc123" `, -` + ` ## Call the server using the official OpenAI Python client library, or any other HTTP client from openai import OpenAI client = OpenAI( @@ -117,7 +117,7 @@ export const LOCAL_APPS = { displayOnModelPage: isGgufModel, snippet: snippetLlamacpp, }, - "vllm": { + vllm: { prettyLabel: "vLLM", docsUrl: "https://docs.vllm.ai", mainTask: "text-generation", diff --git a/packages/tasks/src/model-data.ts b/packages/tasks/src/model-data.ts index 6ee40c15cf..3f9871b81a 100644 --- a/packages/tasks/src/model-data.ts +++ b/packages/tasks/src/model-data.ts @@ -52,6 +52,10 @@ export interface ModelData { bits?: number; load_in_4bit?: boolean; load_in_8bit?: boolean; + /** + * awq, gptq, aqlm, marlin, … Used by vLLM + */ + quant_method?: string; }; tokenizer_config?: TokenizerConfig; adapter_transformers?: { From 43de0e9d71804d330f6a4c17c3ac0366df93435d Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Wed, 22 May 2024 19:27:24 -0300 Subject: [PATCH 06/10] Update local-apps.ts Adds functions in order to check types of quantization --- packages/tasks/src/local-apps.ts | 52 +++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts index 894b646008..39e5e26e6c 100644 --- a/packages/tasks/src/local-apps.ts +++ b/packages/tasks/src/local-apps.ts @@ -42,8 +42,29 @@ export type LocalApp = { } ); -function isGgufModel(model: ModelData) { - return model.tags.includes("gguf"); +function isGgufModel(model: ModelData): boolean { + return 
model.config?.quantization_config?.quant_method === "gguf"; +} + +function isAwqModel(model: ModelData): boolean { + return model.config?.quantization_config?.quant_method === "awq"; +} + +function isGptqModel(model: ModelData): boolean { + return model.config?.quantization_config?.quant_method === "gptq"; +} + +function isAqlmModel(model: ModelData): boolean { + return model.config?.quantization_config?.quant_method === "aqlm"; +} + +function isMarlinModel(model: ModelData): boolean { + return model.config?.quantization_config?.quant_method === "marlin"; +} + +function isFullModel(model: ModelData): boolean { + // Assuming a full model is identified by not having a quant_method + return !model.config?.quantization_config?.quant_method; } const snippetLlamacpp = (model: ModelData): string[] => { @@ -66,7 +87,7 @@ LLAMA_CURL=1 make const snippetVllm = (model: ModelData): string[] => { return [ ` -## Deploy with docker (needs Docker installed) a gated model (please, request access in Hugginface's model repo): +## Deploy with linux and docker (needs Docker installed) a gated model (please, request access in Hugginface's model repo): docker run --runtime nvidia --gpus all \ --name my_vllm_container \ -v ~/.cache/huggingface:/root/.cache/huggingface \ @@ -81,19 +102,16 @@ docker run --runtime nvidia --gpus all \ docker exec -it my_vllm_container bash -c "python -m vllm.entrypoints.openai.api_server --model ${model.id} --dtype auto --api-key token-abc123" `, ` -## Call the server using the official OpenAI Python client library, or any other HTTP client -from openai import OpenAI -client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="token-abc123", -) -completion = client.chat.completions.create( - model=${model.id}, - messages=[ - {"role": "user", "content": "Hello!"} - ] -) -print(completion.choices[0].message) +## Call the server using curl +curl -X POST "http://localhost:8000/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer token-abc123" \ + --data '{ + "model": "'${model.id}'", + "messages": [ + {"role": "user", "content": "Hello!"} + ] + }' `, ]; }; @@ -121,7 +139,7 @@ export const LOCAL_APPS = { prettyLabel: "vLLM", docsUrl: "https://docs.vllm.ai", mainTask: "text-generation", - displayOnModelPage: isGptqModel && isAwqModel, + displayOnModelPage: (model: ModelData) => isAwqModel(model) || isGptqModel(model) || isAqlmModel(model) || isMarlinModel(model) || isFullModel(model), snippet: snippetVllm, }, lmstudio: { From 855711051836f1c623bb9cca9e6764ef17860f57 Mon Sep 17 00:00:00 2001 From: Eli Costa <87460497+EliMCosta@users.noreply.github.com> Date: Thu, 30 May 2024 07:41:06 -0300 Subject: [PATCH 07/10] Update packages/tasks/src/local-apps.ts Co-authored-by: Bertrand CHEVRIER --- packages/tasks/src/local-apps.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts index 39e5e26e6c..4490474d94 100644 --- a/packages/tasks/src/local-apps.ts +++ b/packages/tasks/src/local-apps.ts @@ -43,7 +43,7 @@ export type LocalApp = { ); function isGgufModel(model: ModelData): boolean { - return model.config?.quantization_config?.quant_method === "gguf"; + return model.config?.quantization_config?.quant_method === "gguf" || model.tags.includes("gguf"); } function isAwqModel(model: ModelData): boolean { From 17ad1824f706a695fd218977b092d69574a5da72 Mon Sep 17 00:00:00 2001 From: Bertrand Chevrier Date: Tue, 3 Sep 2024 14:25:24 +0200 Subject: [PATCH 08/10] fix: udpate snippets 
--- packages/tasks/src/local-apps.ts | 33 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts index 28c0ba4f35..575b432eea 100644 --- a/packages/tasks/src/local-apps.ts +++ b/packages/tasks/src/local-apps.ts @@ -147,18 +147,28 @@ const snippetLocalAI = (model: ModelData, filepath?: string): LocalAppSnippet[] }; const snippetVllm = (model: ModelData): LocalAppSnippet[] => { + const runCommand = [ + "", + "# Call the server using curl:", + `curl -X POST "http://localhost:8000/v1/chat/completions" \\ `, + ` -H "Content-Type: application/json" \\ `, + ` --data '{`, + ` "model": "${model.id}"`, + ` "messages": [`, + ` {"role": "user", "content": "Hello!"}`, + ` ]`, + ` }'`, + ]; return [ { title: "Install from pip", - setup: ["# Install vLLM from pip", "pip install vllm"].join("\n"), - content: ["# Load and run the model:", `python -m vllm.entrypoints.openai.api_server --model "${model.id}"`].join( - "\n" - ), + setup: ["# Install vLLM from pip:", "pip install vllm"].join("\n"), + content: ["# Load and run the model:", `vllm serve --model "${model.id}"`, ...runCommand].join("\n"), }, { title: "Use Docker images", setup: [ - "# Deploy with linux and docker (needs Docker installed) a gated model (please, request access in Hugginface's model repo): ", + "# Deploy with docker on Linux:", `docker run --runtime nvidia --gpus all \\`, ` --name my_vllm_container \\`, ` -v ~/.cache/huggingface:/root/.cache/huggingface \\`, @@ -170,17 +180,8 @@ const snippetVllm = (model: ModelData): LocalAppSnippet[] => { ].join("\n"), content: [ "# Load and run the model:", - `docker exec -it my_vllm_container bash -c "python -m vllm.entrypoints.openai.api_server --model ${model.id} --dtype auto --api-key token-abc123"`, - "# Call the server using curl", - "curl -X POST 'http://localhost:8000/v1/chat/completions' \\ ", - ` -H "Content-Type: application/json" \\ `, - ` -H "Authorization: Bearer token-abc123" \\ `, - ` --data '{`, - ` "model": "${model.id}"`, - ` "messages": [`, - ` {"role": "user", "content": "Hello!"}`, - ` ]`, - ` }'`, + `docker exec -it my_vllm_container bash -c "vllm serve ${model.id}"`, + ...runCommand, ].join("\n"), }, ]; From 2bb1bc1735c8665edbb5b7f7570a785369de34c1 Mon Sep 17 00:00:00 2001 From: Bertrand CHEVRIER Date: Wed, 4 Sep 2024 10:56:43 +0200 Subject: [PATCH 09/10] Update packages/tasks/src/local-apps.ts Co-authored-by: Michael Goin --- packages/tasks/src/local-apps.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts index 575b432eea..8a4709d06a 100644 --- a/packages/tasks/src/local-apps.ts +++ b/packages/tasks/src/local-apps.ts @@ -163,7 +163,7 @@ const snippetVllm = (model: ModelData): LocalAppSnippet[] => { { title: "Install from pip", setup: ["# Install vLLM from pip:", "pip install vllm"].join("\n"), - content: ["# Load and run the model:", `vllm serve --model "${model.id}"`, ...runCommand].join("\n"), + content: ["# Load and run the model:", `vllm serve "${model.id}"`, ...runCommand].join("\n"), }, { title: "Use Docker images", From d63b7cb37eab869bd3fb587360809be77b3db47c Mon Sep 17 00:00:00 2001 From: Bertrand Chevrier Date: Wed, 4 Sep 2024 11:10:34 +0200 Subject: [PATCH 10/10] fix: rely only on gguf tag --- packages/tasks/src/local-apps.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/tasks/src/local-apps.ts b/packages/tasks/src/local-apps.ts index 
8a4709d06a..168ed255b0 100644 --- a/packages/tasks/src/local-apps.ts +++ b/packages/tasks/src/local-apps.ts @@ -59,7 +59,7 @@ export type LocalApp = { ); function isGgufModel(model: ModelData): boolean { - return model.config?.quantization_config?.quant_method === "gguf" || model.tags.includes("gguf"); + return model.tags.includes("gguf"); } function isAwqModel(model: ModelData): boolean {
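
Taken together, patches 01-10 leave `packages/tasks/src/local-apps.ts` with tag-based GGUF detection, `quant_method`-based checks for the other quantization formats, and a two-variant vLLM snippet (pip and Docker). The sketch below consolidates that end state as a small, self-contained TypeScript program; it is an approximation rather than the merged code: `ModelDataLike`, `LocalAppSnippetLike`, `hasQuantMethod`, `showVllm`, and the demo model id are simplified stand-ins introduced here for illustration, and only the pip variant of the snippet is shown.

```ts
// Simplified stand-ins for the real ModelData / LocalAppSnippet types; the actual
// definitions live in model-data.ts and local-apps.ts and carry many more fields.
interface ModelDataLike {
	id: string;
	tags: string[];
	config?: { quantization_config?: { quant_method?: string } };
}

interface LocalAppSnippetLike {
	title: string;
	setup: string;
	content: string;
}

// quant_method-based checks (patches 05-07): awq / gptq / aqlm / marlin.
const hasQuantMethod = (model: ModelDataLike, method: string): boolean =>
	model.config?.quantization_config?.quant_method === method;

// A "full" (unquantized) model is identified by the absence of quant_method (patch 06).
const isFullModel = (model: ModelDataLike): boolean =>
	!model.config?.quantization_config?.quant_method;

// displayOnModelPage condition for the vLLM entry after patch 06.
const showVllm = (model: ModelDataLike): boolean =>
	["awq", "gptq", "aqlm", "marlin"].some((m) => hasQuantMethod(model, m)) || isFullModel(model);

// The pip-based snippet in roughly its final shape (patches 08-09): `vllm serve` plus a curl call.
const snippetVllm = (model: ModelDataLike): LocalAppSnippetLike[] => {
	const runCommand = [
		"",
		"# Call the server using curl:",
		`curl -X POST "http://localhost:8000/v1/chat/completions" \\`,
		`	-H "Content-Type: application/json" \\`,
		`	--data '{`,
		`		"model": "${model.id}",`, // note: the comma after "model" keeps the JSON body valid
		`		"messages": [`,
		`			{"role": "user", "content": "Hello!"}`,
		`		]`,
		`	}'`,
	];
	return [
		{
			title: "Install from pip",
			setup: ["# Install vLLM from pip:", "pip install vllm"].join("\n"),
			content: ["# Load and run the model:", `vllm serve "${model.id}"`, ...runCommand].join("\n"),
		},
	];
};

// Example: render the pip snippet for a hypothetical AWQ checkpoint.
const demo: ModelDataLike = {
	id: "TheBloke/Mistral-7B-Instruct-v0.1-AWQ",
	tags: [],
	config: { quantization_config: { quant_method: "awq" } },
};
if (showVllm(demo)) {
	console.log(snippetVllm(demo)[0].content);
}
```

The split predicate mirrors the series' final design choice: GGUF checkpoints stay routed to the GGUF-native local apps via the `gguf` tag (patch 10), while vLLM is surfaced for AWQ, GPTQ, AQLM, and Marlin checkpoints as well as for unquantized models.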