Merged
64 changes: 62 additions & 2 deletions packages/tasks/src/local-apps.ts
@@ -42,8 +42,29 @@ export type LocalApp = {
}
);

-function isGgufModel(model: ModelData) {
-	return model.tags.includes("gguf");
+function isGgufModel(model: ModelData): boolean {
+	return model.config?.quantization_config?.quant_method === "gguf";
 }

function isAwqModel(model: ModelData): boolean {
return model.config?.quantization_config?.quant_method === "awq";
}

function isGptqModel(model: ModelData): boolean {
return model.config?.quantization_config?.quant_method === "gptq";
}

function isAqlmModel(model: ModelData): boolean {
return model.config?.quantization_config?.quant_method === "aqlm";
}

function isMarlinModel(model: ModelData): boolean {
return model.config?.quantization_config?.quant_method === "marlin";
}

function isFullModel(model: ModelData): boolean {
// Assuming a full model is identified by not having a quant_method
return !model.config?.quantization_config?.quant_method;
Collaborator

isFullModel creates a lot of false positives.

Instead, we could check against the supported architectures, as suggested by @simon-mo.

Something like this:

const VLLM_SUPPORTED_ARCHS = [
    "AquilaForCausalLM", "ArcticForCausalLM", "BaiChuanForCausalLM", "BloomForCausalLM", ...
];
model.config?.architectures?.some((arch) => VLLM_SUPPORTED_ARCHS.includes(arch));

cc @julien-c
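
For illustration, a minimal sketch of how that suggestion could be folded into local-apps.ts, following the file's existing isXxxModel helpers. The constant would need to be populated with vLLM's actual supported architectures (see the next comment); the entries shown and the helper name isVllmSupportedArchModel are placeholders, not part of this PR.

// Sketch only: a few illustrative entries; the real list should come from vLLM.
const VLLM_SUPPORTED_ARCHS: string[] = [
	"AquilaForCausalLM",
	"ArcticForCausalLM",
	"BloomForCausalLM",
	"LlamaForCausalLM",
	"MistralForCausalLM",
	// … remaining architectures reported by vLLM
];

// Hypothetical helper: true when any architecture declared in the model config is supported.
function isVllmSupportedArchModel(model: ModelData): boolean {
	return model.config?.architectures?.some((arch) => VLLM_SUPPORTED_ARCHS.includes(arch)) ?? false;
}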

Contributor

You can query the vLLM package for this list once you have it installed:

>>> from vllm import ModelRegistry
>>> ModelRegistry.get_supported_archs()
['AquilaModel', 'AquilaForCausalLM', 'BaiChuanForCausalLM', 'BaichuanForCausalLM', 'BloomForCausalLM', 'ChatGLMModel', 'ChatGLMForConditionalGeneration', 'CohereForCausalLM', 'DbrxForCausalLM', 'DeciLMForCausalLM', 'DeepseekForCausalLM', 'FalconForCausalLM', 'GemmaForCausalLM', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTJForCausalLM', 'GPTNeoXForCausalLM', 'InternLMForCausalLM', 'InternLM2ForCausalLM', 'JAISLMHeadModel', 'LlamaForCausalLM', 'LlavaForConditionalGeneration', 'LlavaNextForConditionalGeneration', 'LLaMAForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'QuantMixtralForCausalLM', 'MptForCausalLM', 'MPTForCausalLM', 'MiniCPMForCausalLM', 'OlmoForCausalLM', 'OPTForCausalLM', 'OrionForCausalLM', 'PhiForCausalLM', 'Phi3ForCausalLM', 'QWenLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'RWForCausalLM', 'StableLMEpochForCausalLM', 'StableLmForCausalLM', 'Starcoder2ForCausalLM', 'ArcticForCausalLM', 'XverseForCausalLM', 'Phi3SmallForCausalLM', 'MistralModel']
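
Putting the two suggestions together, the vllm entry's displayOnModelPage could then take roughly the shape below. This is a sketch, not what the PR currently does; it assumes the hypothetical isVllmSupportedArchModel helper sketched above, with VLLM_SUPPORTED_ARCHS filled in from the ModelRegistry output.

// Sketch: show vLLM for supported quantized checkpoints, or for unquantized
// checkpoints whose declared architecture vLLM reports as supported.
displayOnModelPage: (model: ModelData) =>
	isAwqModel(model) ||
	isGptqModel(model) ||
	isAqlmModel(model) ||
	isMarlinModel(model) ||
	isVllmSupportedArchModel(model),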

}

const snippetLlamacpp = (model: ModelData): string[] => {
@@ -63,6 +84,38 @@ LLAMA_CURL=1 make
];
};

const snippetVllm = (model: ModelData): string[] => {
return [
`
## Deploy with Linux and Docker (Docker must be installed); for a gated model, first request access in the Hugging Face model repo:
docker run --runtime nvidia --gpus all \\
	--name my_vllm_container \\
	-v ~/.cache/huggingface:/root/.cache/huggingface \\
	--env "HUGGING_FACE_HUB_TOKEN=<secret>" \\
	-p 8000:8000 \\
	--ipc=host \\
	vllm/vllm-openai:latest \\
	--model ${model.id}
`,
`
## Load and run the model
docker exec -it my_vllm_container bash -c "python -m vllm.entrypoints.openai.api_server --model ${model.id} --dtype auto --api-key token-abc123"
`,
`
## Call the server using curl
curl -X POST "http://localhost:8000/v1/chat/completions" \\
	-H "Content-Type: application/json" \\
	-H "Authorization: Bearer token-abc123" \\
	--data '{
		"model": "${model.id}",
		"messages": [
			{"role": "user", "content": "Hello!"}
		]
	}'
`,
];
};

/**
* Add your new local app here.
*
@@ -82,6 +135,13 @@ export const LOCAL_APPS = {
displayOnModelPage: isGgufModel,
snippet: snippetLlamacpp,
},
vllm: {
prettyLabel: "vLLM",
docsUrl: "https://docs.vllm.ai",
mainTask: "text-generation",
displayOnModelPage: (model: ModelData) => isAwqModel(model) || isGptqModel(model) || isAqlmModel(model) || isMarlinModel(model) || isFullModel(model),
snippet: snippetVllm,
},
lmstudio: {
prettyLabel: "LM Studio",
docsUrl: "https://lmstudio.ai",
4 changes: 4 additions & 0 deletions packages/tasks/src/model-data.ts
@@ -52,6 +52,10 @@ export interface ModelData {
bits?: number;
load_in_4bit?: boolean;
load_in_8bit?: boolean;
/**
* awq, gptq, aqlm, marlin, … Used by vLLM
*/
quant_method?: string;
};
tokenizer_config?: TokenizerConfig;
adapter_transformers?: {