
Commit 145a984

[API] llm-vscode extension support (mlc-ai#1198)

This PR enables `llm-vscode` extension API support for Copilot-like code completion, following [HF's llm-ls LSP](https://github.com/huggingface/llm-ls). It is fully compatible with `CodeLlama` and `starcoder` on mlc-llm. huggingface/llm-vscode#103 improves the extension's user experience when used with the mlc-llm REST API. Thanks to @pacman100, who proposed this approach in his recent blog post: https://huggingface.co/blog/personal-copilot
Parent commit: 0e08845

2 files changed: +34 additions, −0 deletions

python/mlc_chat/interface/openai_api.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -144,3 +144,18 @@ class EmbeddingsResponse(BaseModel):
     data: List[Dict[str, Any]]
     model: Optional[str] = None
     usage: UsageInfo
+
+
+class VisualStudioCodeCompletionParameters(BaseModel):
+    temperature: float = None
+    top_p: float = None
+    max_new_tokens: int = None
+
+
+class VisualStudioCodeCompletionRequest(BaseModel):
+    inputs: str
+    parameters: VisualStudioCodeCompletionParameters
+
+
+class VisualStudioCodeCompletionResponse(BaseModel):
+    generated_text: str
```
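For reference, here is a minimal sketch (not part of the commit) of the request payload these models define. It assumes the new classes are importable from `mlc_chat.interface.openai_api` and uses Pydantic v1's `.json()`:

```python
# Hypothetical illustration of the request schema added above.
from mlc_chat.interface.openai_api import (
    VisualStudioCodeCompletionParameters,
    VisualStudioCodeCompletionRequest,
)

request = VisualStudioCodeCompletionRequest(
    inputs="def fibonacci(n):",  # the code prefix sent by the editor
    parameters=VisualStudioCodeCompletionParameters(
        temperature=0.2, top_p=0.95, max_new_tokens=64
    ),
)
# Produces JSON like:
# {"inputs": "def fibonacci(n):",
#  "parameters": {"temperature": 0.2, "top_p": 0.95, "max_new_tokens": 64}}
print(request.json())  # Pydantic v1; use request.model_dump_json() on v2
```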

python/mlc_chat/rest.py

Lines changed: 19 additions & 0 deletions
```diff
@@ -31,6 +31,8 @@
     EmbeddingsRequest,
     EmbeddingsResponse,
     UsageInfo,
+    VisualStudioCodeCompletionRequest,
+    VisualStudioCodeCompletionResponse,
 )
 
 
@@ -364,6 +366,23 @@ async def read_stats_verbose():
     return session["chat_mod"].stats(verbose=True)
 
 
+@app.post("/v1/llm-vscode/completions")
+async def request_llm_vscode(request: VisualStudioCodeCompletionRequest):
+    """
+    Creates a vscode code completion for a given prompt.
+    Follows huggingface LSP (https://github.com/huggingface/llm-ls)
+    """
+    generation_config = GenerationConfig(
+        temperature=request.parameters.temperature,
+        top_p=request.parameters.top_p,
+        mean_gen_len=request.parameters.max_new_tokens,
+        max_gen_len=request.parameters.max_new_tokens,
+    )
+    msg = session["chat_mod"].generate(prompt=request.inputs, generation_config=generation_config)
+
+    return VisualStudioCodeCompletionResponse(generated_text=msg)
+
+
 ARGS = convert_args_to_argparser().parse_args()
 if __name__ == "__main__":
     uvicorn.run("mlc_chat.rest:app", host=ARGS.host, port=ARGS.port, reload=False, access_log=False)
```
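A hypothetical client call (not part of the commit) against the new endpoint; the host and port below are assumptions and should match whatever `ARGS.host` and `ARGS.port` the server was started with:

```python
# Sketch of a client request to the llm-vscode completion endpoint.
import requests

payload = {
    "inputs": "def fibonacci(n):",  # code prefix to complete
    "parameters": {"temperature": 0.2, "top_p": 0.95, "max_new_tokens": 64},
}
# Assumes the REST server is listening on 127.0.0.1:8000.
resp = requests.post(
    "http://127.0.0.1:8000/v1/llm-vscode/completions",
    json=payload,
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])  # the suggested completion
```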
