Commit b15ff77

EliMCosta authored

Adds vLLM as Option for Local App (#693)

Adds vLLM as an option for "Local apps" on Hugging Face.

Co-authored-by: Julien Chaumond <[email protected]>
Co-authored-by: Bertrand Chevrier <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
Co-authored-by: Pedro Cuenca <[email protected]>

1 parent be261ff · commit b15ff77

2 files changed · +79 −2 lines changed

packages/tasks/src/local-apps.ts

Lines changed: 75 additions & 2 deletions
```diff
@@ -58,11 +58,30 @@ export type LocalApp = {
 	}
 );
 
-// eslint-disable-next-line @typescript-eslint/no-unused-vars
-function isGgufModel(model: ModelData) {
+function isGgufModel(model: ModelData): boolean {
 	return model.tags.includes("gguf");
 }
 
+function isAwqModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "awq";
+}
+
+function isGptqModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "gptq";
+}
+
+function isAqlmModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "aqlm";
+}
+
+function isMarlinModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "marlin";
+}
+
+function isTransformersModel(model: ModelData): boolean {
+	return model.tags.includes("transformers");
+}
+
 function isLlamaCppGgufModel(model: ModelData) {
 	return !!model.gguf?.context_length;
 }
```
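For intuition (this sketch is not part of the commit), here is how the new predicates would evaluate against a mock `ModelData` entry; the model id, tags, and config values below are invented for illustration:

```ts
// Hypothetical ModelData entry for an AWQ-quantized model (values made up).
// Cast via `unknown` because the mock omits other ModelData fields.
const mockModel = {
	id: "some-org/some-model-AWQ",
	tags: ["transformers"],
	config: {
		quantization_config: { quant_method: "awq" },
	},
} as unknown as ModelData;

isAwqModel(mockModel); // true  — quant_method === "awq"
isGptqModel(mockModel); // false — quant_method !== "gptq"
isGgufModel(mockModel); // false — no "gguf" tag
isTransformersModel(mockModel); // true  — "transformers" tag present
```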
```diff
@@ -127,6 +146,47 @@ const snippetLocalAI = (model: ModelData, filepath?: string): LocalAppSnippet[]
 	];
 };
 
+const snippetVllm = (model: ModelData): LocalAppSnippet[] => {
+	const runCommand = [
+		"",
+		"# Call the server using curl:",
+		`curl -X POST "http://localhost:8000/v1/chat/completions" \\`,
+		`	-H "Content-Type: application/json" \\`,
+		`	--data '{`,
+		`		"model": "${model.id}",`,
+		`		"messages": [`,
+		`			{"role": "user", "content": "Hello!"}`,
+		`		]`,
+		`	}'`,
+	];
+	return [
+		{
+			title: "Install from pip",
+			setup: ["# Install vLLM from pip:", "pip install vllm"].join("\n"),
+			content: ["# Load and run the model:", `vllm serve "${model.id}"`, ...runCommand].join("\n"),
+		},
+		{
+			title: "Use Docker images",
+			setup: [
+				"# Deploy with docker on Linux:",
+				`docker run --runtime nvidia --gpus all \\`,
+				`	--name my_vllm_container \\`,
+				`	-v ~/.cache/huggingface:/root/.cache/huggingface \\`,
+				`	--env "HUGGING_FACE_HUB_TOKEN=<secret>" \\`,
+				`	-p 8000:8000 \\`,
+				`	--ipc=host \\`,
+				`	vllm/vllm-openai:latest \\`,
+				`	--model ${model.id}`,
+			].join("\n"),
+			content: [
+				"# Load and run the model:",
+				`docker exec -it my_vllm_container bash -c "vllm serve ${model.id}"`,
+				...runCommand,
+			].join("\n"),
+		},
+	];
+};
+
 /**
  * Add your new local app here.
  *
```
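As a sanity check (not part of the commit), this is roughly what the "Install from pip" snippet renders to for a made-up model id; the expected shell output is shown in comments:

```ts
// Hypothetical usage: render the pip snippet for an invented model id.
const snippets = snippetVllm({ id: "my-org/my-model" } as unknown as ModelData);

console.log(snippets[0].content);
// # Load and run the model:
// vllm serve "my-org/my-model"
//
// # Call the server using curl:
// curl -X POST "http://localhost:8000/v1/chat/completions" \
// 	-H "Content-Type: application/json" \
// 	--data '{
// 		"model": "my-org/my-model",
// 		"messages": [
// 			{"role": "user", "content": "Hello!"}
// 		]
// 	}'
```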
```diff
@@ -146,6 +206,19 @@ export const LOCAL_APPS = {
 		displayOnModelPage: isLlamaCppGgufModel,
 		snippet: snippetLlamacpp,
 	},
+	vllm: {
+		prettyLabel: "vLLM",
+		docsUrl: "https://docs.vllm.ai",
+		mainTask: "text-generation",
+		displayOnModelPage: (model: ModelData) =>
+			isAwqModel(model) ||
+			isGptqModel(model) ||
+			isAqlmModel(model) ||
+			isMarlinModel(model) ||
+			isGgufModel(model) ||
+			isTransformersModel(model),
+		snippet: snippetVllm,
+	},
 	lmstudio: {
 		prettyLabel: "LM Studio",
 		docsUrl: "https://lmstudio.ai",
```
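To illustrate how `displayOnModelPage` is consumed (a hypothetical helper, not in the commit; it assumes each registry entry may carry such a predicate):

```ts
// Hypothetical: list the keys of local apps whose display predicate
// accepts a given model, using the LOCAL_APPS registry above.
function localAppsForModel(model: ModelData): string[] {
	return Object.entries(LOCAL_APPS)
		.filter(([, app]) => app.displayOnModelPage?.(model) ?? false)
		.map(([key]) => key);
}
```

With the new entry, any model carrying a supported `quant_method`, a `gguf` tag, or a `transformers` tag now surfaces vLLM on its model page.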

packages/tasks/src/model-data.ts

Lines changed: 4 additions & 0 deletions
```diff
@@ -38,6 +38,10 @@ export interface ModelData {
 		bits?: number;
 		load_in_4bit?: boolean;
 		load_in_8bit?: boolean;
+		/**
+		 * awq, gptq, aqlm, marlin, … Used by vLLM
+		 */
+		quant_method?: string;
 	};
 	tokenizer_config?: TokenizerConfig;
 	adapter_transformers?: {
```
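For context (an illustrative sketch, not from the commit): a model whose `config.json` declares a quantization method would surface `quant_method` on `ModelData` like this, which is what the predicates in local-apps.ts read:

```ts
// Illustrative only: a ModelData fragment for an AWQ-quantized model.
// The id and values are made up; the field shapes follow the interface above.
const example: Partial<ModelData> = {
	id: "some-org/some-awq-model",
	config: {
		quantization_config: {
			bits: 4,
			quant_method: "awq", // matched by isAwqModel() in local-apps.ts
		},
	},
};
```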
