
[New Model]: Able to run the Phi-3.5-vision-instruct model, but want to run it with int4 quantization #8463

@thalapandi

Description

The model to consider.

from typing import List
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from PIL import Image
from vllm import LLM, SamplingParams
import os
import uvicorn
import time

app = FastAPI()

class InferenceRequest(BaseModel):
    model: str
    question: str
    image_paths: List[str]

# Initialize models once during application startup
models = {}

def load_image_from_path(image_path: str) -> Image.Image:
    """Load a PIL image from a local file path."""
    if not os.path.isfile(image_path):
        raise ValueError(f"File {image_path} does not exist.")
    return Image.open(image_path).convert("RGB")

def load_phi3v():
    """Load Phi3V model and return instance."""
    return LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

def initialize_models():
    """Initialize all models required for inference."""
    global models
    models["phi3_v"] = load_phi3v()

def load_phi3v_prompt(question: str, image_paths: List[str]):
    """Build the Phi-3.5-vision chat prompt, with one <|image_i|> placeholder per image."""
    # Note: load_phi3v() sets limit_mm_per_prompt={"image": 1}, so requests
    # with more than one image will be rejected despite the loop below.
    placeholders = "\n".join(f"<|image_{i}|>" for i, _ in enumerate(image_paths, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
    stop_token_ids = None
    return prompt, stop_token_ids

def run_generate(model_name: str, question: str, image_paths: List[str]):
    """Run a single multimodal generation request against a preloaded model."""
    if model_name not in models:
        raise ValueError(f"Model {model_name} is not loaded.")
    
    llm = models[model_name]
    prompt, stop_token_ids = load_phi3v_prompt(question, image_paths)
    image_data = [load_image_from_path(path) for path in image_paths]
    
    sampling_params = SamplingParams(temperature=0.0, max_tokens=128, stop_token_ids=stop_token_ids)
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {
                "image": image_data
            },
        },
        sampling_params=sampling_params
    )
    return [o.outputs[0].text for o in outputs]

@app.on_event("startup")
async def startup_event():
    initialize_models()

@app.post("/inference")
async def inference(request: InferenceRequest):
    try:
        start_time = time.time()

        result = run_generate(request.model, request.question, request.image_paths)
        end_time = time.time()
        print("total time taken",end_time-start_time)
        return {"results": result}
    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8002)
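
For quick testing, here is an example client call against the /inference endpoint above. It assumes the server is running locally on port 8002 and that the image path exists on the machine running the server (both are assumptions, not part of the original report):

import requests

# Hypothetical image path; replace with a file that exists on the server host.
resp = requests.post(
    "http://localhost:8002/inference",
    json={
        "model": "phi3_v",
        "question": "What is shown in this image?",
        "image_paths": ["/path/to/example.jpg"],
    },
)
print(resp.json())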

The closest model vllm already supports.

Phi-3.5-vision-instruct itself (microsoft/Phi-3.5-vision-instruct) is already supported by vLLM in full precision; what we need is a reference for running it with int4 quantization.

What's your difficulty of supporting the model you want?

The vLLM documentation does not contain any information about quantization for the Phi-3.5-vision-instruct model.
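
For reference, a minimal sketch of what an int4 setup might look like in vLLM, assuming a pre-quantized checkpoint were available. The repository name below is hypothetical, and whether vLLM's quantized paths cover this vision model is exactly what this issue is asking about:

from vllm import LLM

# Sketch only: vLLM's LLM constructor accepts a `quantization` argument for
# pre-quantized checkpoints (e.g. "awq", "gptq"). The repo name below is
# hypothetical; no official int4 checkpoint is referenced in this issue.
llm = LLM(
    model="someorg/Phi-3.5-vision-instruct-AWQ",  # hypothetical AWQ (int4) checkpoint
    quantization="awq",
    trust_remote_code=True,
    max_model_len=4096,
    limit_mm_per_prompt={"image": 1},
)

# Newer vLLM versions also support in-flight 4-bit quantization via
# bitsandbytes (quantization="bitsandbytes", load_format="bitsandbytes"),
# but whether that path works for this vision model is unverified.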

Before submitting a new issue...

  • Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.
