From da68b09d692154773eabd9a76b9163b608a31edf Mon Sep 17 00:00:00 2001
From: Dipankar Sarkar
Date: Wed, 5 Feb 2025 19:23:03 +0000
Subject: [PATCH 1/4] VLM Pipeline for onboarding of VLMs

Signed-off-by: Dipankar Sarkar
---
 QEfficient/base/modeling_qeff.py              |   2 +
 QEfficient/base/pytorch_transforms.py         |   9 +
 QEfficient/transformers/modeling_utils.py     |   9 +
 .../transformers/models/InternVL/__init__.py  |   6 +
 .../transformers/models/InternVL/config.json  | 137 +++
 .../InternVL/configuration_internvl_chat.py   | 205 ++++
 .../models/InternVL/conversation.py           | 385 +++++++
 .../models/InternVL/internprocessor.py        | 150 +++
 .../InternVL/modeling_internvl_chat_vision.py | 945 ++++++++++++++++++
 .../transformers/models/llava/__init__.py     |   7 +
 .../models/llava/modeling_llava.py            | 256 +++++
 .../transformers/models/modeling_auto.py      | 452 ++++++++-
 .../transformers/models/pytorch_transforms.py |  15 +
 QEfficient/utils/__init__.py                  |   4 +
 QEfficient/utils/_utils.py                    |  14 +
 QEfficient/utils/constants.py                 |   9 +
 .../models/test_image_text_to_text.py         | 211 ++++
 17 files changed, 2814 insertions(+), 2 deletions(-)
 create mode 100755 QEfficient/transformers/models/InternVL/__init__.py
 create mode 100755 QEfficient/transformers/models/InternVL/config.json
 create mode 100755 QEfficient/transformers/models/InternVL/configuration_internvl_chat.py
 create mode 100755 QEfficient/transformers/models/InternVL/conversation.py
 create mode 100644 QEfficient/transformers/models/InternVL/internprocessor.py
 create mode 100755 QEfficient/transformers/models/InternVL/modeling_internvl_chat_vision.py
 create mode 100755 QEfficient/transformers/models/llava/__init__.py
 create mode 100755 QEfficient/transformers/models/llava/modeling_llava.py
 create mode 100755 tests/transformers/models/test_image_text_to_text.py

diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index 2760cf52f..c2440f493 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -223,6 +223,7 @@ def _compile(
         if onnx_path is None and self.onnx_path is None:
             self.export()
 
+        # import ipdb; ipdb.set_trace()
         onnx_path = Path(onnx_path or self.onnx_path)
         compile_dir = Path(compile_dir or onnx_path.parent)
         qpc_path = compile_dir / "qpc"
@@ -251,6 +252,7 @@ def _compile(
         if num_speculative_tokens:
             compile_hash.update(to_hashable({"num_speculative_tokens": num_speculative_tokens}))
 
+        # import ipdb; ipdb.set_trace()
         # Check if already compiled
         compile_hash = compile_hash.hexdigest()[:16]
         compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
diff --git a/QEfficient/base/pytorch_transforms.py b/QEfficient/base/pytorch_transforms.py
index 6e21d11b2..62a832f61 100644
--- a/QEfficient/base/pytorch_transforms.py
+++ b/QEfficient/base/pytorch_transforms.py
@@ -40,12 +40,21 @@ class ModuleMappingTransform(PytorchTransform):
     def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
         transformed = False
         for module in model.modules():
+            # if repl_module := cls._module_mapping.get(type(module)):
+            if repl_module := cls._module_mapping.get(module.__class__.__name__):
+                module.__class__ = repl_module
+                # Handling the __init__ calls in the models
+                if hasattr(module, "__qeff_init__"):
+                    module.__qeff_init__()
+                transformed = True
+
             if repl_module := cls._module_mapping.get(type(module)):
                 module.__class__ = repl_module
                 # Handling the __init__ calls in the models
                 if hasattr(module, "__qeff_init__"):
                     module.__qeff_init__()
                 transformed = True
+
         return model, transformed
 
     @classmethod
diff --git
a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index f749cc0c3..dccf56f0b 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -77,6 +77,11 @@ ) from QEfficient.customop import CustomRMSNormAIC +from QEfficient.transformers.models.InternVL.modeling_internvl_chat_vision import ( + InternVLChatModel, + QEffInternVisionEmbeddings, + QEffInternVLChatModel, +) from .models.codegen.modeling_codegen import ( QEffCodeGenAttention, @@ -157,6 +162,7 @@ Starcoder2ForCausalLM.__name__, GPTBigCodeForCausalLM.__name__, MllamaForCausalLM.__name__, + InternVLChatModel.__name__, ] ) @@ -241,4 +247,7 @@ GPTBigCodeAttention: QEffGPTBigCodeAttention, GPTBigCodeBlock: QEffGPTBigCodeBlock, GPTBigCodeModel: QEffGPTBigCodeModel, + # InternVL + "InternVLChatModel": QEffInternVLChatModel, + "InternVisionEmbeddings": QEffInternVisionEmbeddings, } diff --git a/QEfficient/transformers/models/InternVL/__init__.py b/QEfficient/transformers/models/InternVL/__init__.py new file mode 100755 index 000000000..d259e435a --- /dev/null +++ b/QEfficient/transformers/models/InternVL/__init__.py @@ -0,0 +1,6 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- diff --git a/QEfficient/transformers/models/InternVL/config.json b/QEfficient/transformers/models/InternVL/config.json new file mode 100755 index 000000000..83c904f39 --- /dev/null +++ b/QEfficient/transformers/models/InternVL/config.json @@ -0,0 +1,137 @@ +{ + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "llm_config": { + "_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct", + "add_cross_attention": false, + "architectures": [ + "Qwen2ForCausalLM" + ], + "_attn_implementation": "eager", + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 896, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4864, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "min_length": 0, + "model_type": "qwen2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 14, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 2, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + 
"remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sep_token_id": null, + "sliding_window": 32768, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151674 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "ps_version": "v2", + "select_layer": -1, + "template": "internvl2_5", + "torch_dtype": "float32", + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "drop_path_rate": 0.0, + "dropout": 0.0, + "hidden_act": "gelu", + "hidden_size": 1024, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-06, + "model_type": "intern_vit_6b", + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 24, + "output_attentions": false, + "output_hidden_states": false, + "patch_size": 14, + "qk_normalization": false, + "qkv_bias": true, + "return_dict": true, + "torch_dtype": "float32", + "transformers_version": "4.37.2", + "use_bfloat16": false, + "use_flash_attn": false + } +} diff --git a/QEfficient/transformers/models/InternVL/configuration_internvl_chat.py b/QEfficient/transformers/models/InternVL/configuration_internvl_chat.py new file mode 100755 index 000000000..dcb948a0a --- /dev/null +++ b/QEfficient/transformers/models/InternVL/configuration_internvl_chat.py @@ -0,0 +1,205 @@ +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- + +import copy +import os +from typing import Union + +from transformers import LlamaConfig, Qwen2Config +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +# from .configuration_intern_vit import InternVisionConfig + +logger = logging.get_logger(__name__) + + +class InternVLChatConfig(PretrainedConfig): + model_type = "internvl_chat" + is_composition = True + + def __init__( + self, + vision_config=None, + llm_config=None, + use_backbone_lora=0, + use_llm_lora=0, + select_layer=-1, + force_image_size=None, + downsample_ratio=0.5, + template=None, + dynamic_image_size=False, + use_thumbnail=False, + ps_version="v1", + min_dynamic_patch=1, + max_dynamic_patch=6, + **kwargs, + ): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {"architectures": ["InternVisionModel"]} + logger.info("vision_config is None. Initializing the InternVisionConfig with default values.") + + if llm_config is None: + llm_config = {"architectures": ["Qwen2ForCausalLM"]} + logger.info("llm_config is None. 
Initializing the LlamaConfig config with default values (`LlamaConfig`).") + + self.vision_config = InternVisionConfig(**vision_config) + if llm_config.get("architectures")[0] == "LlamaForCausalLM": + self.llm_config = LlamaConfig(**llm_config) + elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM": + self.llm_config = Qwen2Config(**llm_config) + else: + raise ValueError("Unsupported architecture: {}".format(llm_config.get("architectures")[0])) + self.use_backbone_lora = use_backbone_lora + self.use_llm_lora = use_llm_lora + self.select_layer = select_layer + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail = use_thumbnail + self.ps_version = ps_version # pixel shuffle version + self.min_dynamic_patch = min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch + + logger.info(f"vision_select_layer: {self.select_layer}") + logger.info(f"ps_version: {self.ps_version}") + logger.info(f"min_dynamic_patch: {self.min_dynamic_patch}") + logger.info(f"max_dynamic_patch: {self.max_dynamic_patch}") + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["llm_config"] = self.llm_config.to_dict() + output["model_type"] = self.__class__.model_type + output["use_backbone_lora"] = self.use_backbone_lora + output["use_llm_lora"] = self.use_llm_lora + output["select_layer"] = self.select_layer + output["force_image_size"] = self.force_image_size + output["downsample_ratio"] = self.downsample_ratio + output["template"] = self.template + output["dynamic_image_size"] = self.dynamic_image_size + output["use_thumbnail"] = self.use_thumbnail + output["ps_version"] = self.ps_version + output["min_dynamic_patch"] = self.min_dynamic_patch + output["max_dynamic_patch"] = self.max_dynamic_patch + + return output + + +class InternVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to + instantiate a vision encoder according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + Number of color channels in the input images (e.g., 3 for RGB). + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries and values in the self-attention layers. + hidden_size (`int`, *optional*, defaults to 3200): + Dimensionality of the encoder layers and the pooler layer. + num_attention_heads (`int`, *optional*, defaults to 25): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 12800): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ qk_normalization (`bool`, *optional*, defaults to `True`): + Whether to normalize the queries and keys in the self-attention layers. + num_hidden_layers (`int`, *optional*, defaults to 48): + Number of hidden layers in the Transformer encoder. + use_flash_attn (`bool`, *optional*, defaults to `True`): + Whether to use flash attention mechanism. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Dropout rate for stochastic depth. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 0.1): + A factor for layer scale. + """ + + model_type = "intern_vit_6b" + + def __init__( + self, + num_channels=3, + patch_size=14, + image_size=224, + qkv_bias=False, + hidden_size=3200, + num_attention_heads=25, + intermediate_size=12800, + qk_normalization=True, + num_hidden_layers=48, + use_flash_attn=True, + hidden_act="gelu", + norm_type="rms_norm", + layer_norm_eps=1e-6, + dropout=0.0, + drop_path_rate=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=0.1, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.drop_path_rate = drop_path_rate + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.norm_type = norm_type + self.qkv_bias = qkv_bias + self.qk_normalization = qk_normalization + self.use_flash_attn = use_flash_attn + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if "vision_config" in config_dict: + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) diff --git a/QEfficient/transformers/models/InternVL/conversation.py b/QEfficient/transformers/models/InternVL/conversation.py new file mode 100755 index 000000000..956ac5c1e --- /dev/null +++ b/QEfficient/transformers/models/InternVL/conversation.py @@ -0,0 +1,385 @@ +""" +Conversation prompt templates. 
+ +We kindly request that you import fastchat instead of copying this file if you wish to use it. +If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates. + +Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py +""" + +import dataclasses +from enum import IntEnum, auto +from typing import Dict, List, Tuple, Union + + +class SeparatorStyle(IntEnum): + """Separator styles.""" + + ADD_COLON_SINGLE = auto() + ADD_COLON_TWO = auto() + ADD_COLON_SPACE_SINGLE = auto() + NO_COLON_SINGLE = auto() + NO_COLON_TWO = auto() + ADD_NEW_LINE_SINGLE = auto() + LLAMA2 = auto() + CHATGLM = auto() + CHATML = auto() + CHATINTERN = auto() + DOLLY = auto() + RWKV = auto() + PHOENIX = auto() + ROBIN = auto() + FALCON_CHAT = auto() + CHATGLM3 = auto() + INTERNVL_ZH = auto() + MPT = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that manages prompt templates and keeps all conversation history.""" + + # The name of this template + name: str + # The template of the system prompt + system_template: str = "{system_message}" + # The system message + system_message: str = "" + # The names of two roles + roles: Tuple[str] = ("USER", "ASSISTANT") + # All messages. Each item is (role, message). + messages: List[List[str]] = () + # The number of few shot examples + offset: int = 0 + # The separator style and configurations + sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE + sep: str = "\n" + sep2: str = None + # Stop criteria (the default one is EOS token) + stop_str: Union[str, List[str]] = None + # Stops generation if meeting any token in this list + stop_token_ids: List[int] = None + + def get_prompt(self) -> str: + """Get the prompt for generation.""" + system_prompt = self.system_template.format(system_message=self.system_message) + if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + seps[0] + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ": " # must be end with a space + return ret + elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: + ret = "" if system_prompt == "" else system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + "\n" + message + self.sep + else: + ret += role + "\n" + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + message + self.sep + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + message + seps[i % 2] + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.RWKV: + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ": " + message.replace("\r\n", 
"\n").replace("\n\n", "\n") + ret += "\n\n" + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.LLAMA2: + seps = [self.sep, self.sep2] + if self.system_message: + ret = system_prompt + else: + ret = "[INST] " + for i, (role, message) in enumerate(self.messages): + tag = self.roles[i % 2] + if message: + if i == 0: + ret += message + " " + else: + ret += tag + " " + message + seps[i % 2] + else: + ret += tag + return ret + elif self.sep_style == SeparatorStyle.CHATGLM: + # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 + # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 + round_add_n = 1 if self.name == "chatglm2" else 0 + if system_prompt: + ret = system_prompt + self.sep + else: + ret = "" + + for i, (role, message) in enumerate(self.messages): + if i % 2 == 0: + ret += f"[Round {i // 2 + round_add_n}]{self.sep}" + + if message: + ret += f"{role}:{message}{self.sep}" + else: + ret += f"{role}:" + return ret + elif self.sep_style == SeparatorStyle.CHATML: + ret = "" if system_prompt == "" else system_prompt + self.sep + "\n" + for role, message in self.messages: + if message: + ret += role + "\n" + message + self.sep + "\n" + else: + ret += role + "\n" + return ret + elif self.sep_style == SeparatorStyle.CHATGLM3: + ret = "" + if self.system_message: + ret += system_prompt + for role, message in self.messages: + if message: + ret += role + "\n" + " " + message + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.CHATINTERN: + # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + # if i % 2 == 0: + # ret += "" + if message: + ret += role + ":" + message + seps[i % 2] + "\n" + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.DOLLY: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ":\n" + message + seps[i % 2] + if i % 2 == 1: + ret += "\n\n" + else: + ret += role + ":\n" + return ret + elif self.sep_style == SeparatorStyle.PHOENIX: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + ": " + "" + message + "" + else: + ret += role + ": " + "" + return ret + elif self.sep_style == SeparatorStyle.ROBIN: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ":\n" + message + self.sep + else: + ret += role + ":\n" + return ret + elif self.sep_style == SeparatorStyle.FALCON_CHAT: + ret = "" + if self.system_message: + ret += system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + + return ret + elif self.sep_style == SeparatorStyle.INTERNVL_ZH: + seps = [self.sep, self.sep2] + ret = self.system_message + seps[0] + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.MPT: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + return ret + else: + 
raise ValueError(f"Invalid style: {self.sep_style}") + + def set_system_message(self, system_message: str): + """Set the system message.""" + self.system_message = system_message + + def append_message(self, role: str, message: str): + """Append a new message.""" + self.messages.append([role, message]) + + def update_last_message(self, message: str): + """Update the last output. + + The last message is typically set to be None when constructing the prompt, + so we need to update it in-place after getting the response from a model. + """ + self.messages[-1][1] = message + + def to_gradio_chatbot(self): + """Convert the conversation to gradio chatbot format.""" + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def to_openai_api_messages(self): + """Convert the conversation to OpenAI chat completion format.""" + ret = [{"role": "system", "content": self.system_message}] + + for i, (_, msg) in enumerate(self.messages[self.offset :]): + if i % 2 == 0: + ret.append({"role": "user", "content": msg}) + else: + if msg is not None: + ret.append({"role": "assistant", "content": msg}) + return ret + + def copy(self): + return Conversation( + name=self.name, + system_template=self.system_template, + system_message=self.system_message, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + stop_str=self.stop_str, + stop_token_ids=self.stop_token_ids, + ) + + def dict(self): + return { + "template_name": self.name, + "system_message": self.system_message, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + } + + +# A global registry for all conversation templates +conv_templates: Dict[str, Conversation] = {} + + +def register_conv_template(template: Conversation, override: bool = False): + """Register a new conversation template.""" + if not override: + assert template.name not in conv_templates, f"{template.name} has been registered." + + conv_templates[template.name] = template + + +def get_conv_template(name: str) -> Conversation: + """Get a conversation template.""" + return conv_templates[name].copy() + + +# Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference +# is that during training, the preprocessing function for the Hermes-2 template doesn't add +# at the beginning of the tokenized sequence, while the internlm2-chat template does. +# Therefore, they are completely equivalent during inference. +register_conv_template( + Conversation( + name="Hermes-2", + system_template="<|im_start|>system\n{system_message}", + # note: The new system prompt was not used here to avoid changes in benchmark performance. + # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。', + system_message="你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", + stop_str="<|endoftext|>", + ) +) + + +register_conv_template( + Conversation( + name="internlm2-chat", + system_template="<|im_start|>system\n{system_message}", + # note: The new system prompt was not used here to avoid changes in benchmark performance. 
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。', + system_message="你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>", + ) +) + + +register_conv_template( + Conversation( + name="phi3-chat", + system_template="<|system|>\n{system_message}", + # note: The new system prompt was not used here to avoid changes in benchmark performance. + # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。', + system_message="你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。", + roles=("<|user|>\n", "<|assistant|>\n"), + sep_style=SeparatorStyle.MPT, + sep="<|end|>", + ) +) + + +register_conv_template( + Conversation( + name="internvl2_5", + system_template="<|im_start|>system\n{system_message}", + system_message="你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。", + roles=("<|im_start|>user\n", "<|im_start|>assistant\n"), + sep_style=SeparatorStyle.MPT, + sep="<|im_end|>\n", + ) +) diff --git a/QEfficient/transformers/models/InternVL/internprocessor.py b/QEfficient/transformers/models/InternVL/internprocessor.py new file mode 100644 index 000000000..114271324 --- /dev/null +++ b/QEfficient/transformers/models/InternVL/internprocessor.py @@ -0,0 +1,150 @@ +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image +from torchvision.transforms.functional import InterpolationMode + +from QEfficient.utils import get_conv_template + +# from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers + +# from QEfficient.utils._utils import load_hf_processor + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +class InternProcessor: + def __init__(self, model: nn.Module, tokenizer): + breakpoint() + self.model = model + image_size = model.config.force_image_size or model.config.vision_config.image_size + patch_size = model.config.vision_config.patch_size + self.template = model.config.template + self.conv_template = get_conv_template(self.template) + self.system_message = self.conv_template.system_message + self.num_image_token = int((image_size // patch_size) ** 2 * (model.config.downsample_ratio**2)) + self.tokenizer = tokenizer + + def build_transform(self, input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose( + [ + T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD), + ] + ) + return transform + + def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) + for n in range(min_num, max_num + 1) + for i in range(1, n + 1) + for j in range(1, n 
+ 1)
+            if i * j <= max_num and i * j >= min_num
+        )
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+        # find the closest aspect ratio to the target
+        target_aspect_ratio = self.find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
+
+        # calculate the target width and height
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+        # resize the image
+        resized_img = image.resize((target_width, target_height))
+        processed_images = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size,
+            )
+            # split the image
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    def load_image(self, image_file, input_size=448, max_num=12):
+        image = Image.open(image_file).convert("RGB")
+        # import ipdb; ipdb.set_trace()
+        transform = self.build_transform(input_size=input_size)
+        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+        pixel_values = [transform(image) for image in images]
+        pixel_values = torch.stack(pixel_values)
+        return pixel_values
+
+    def __call__(
+        self,
+        tokenizer,
+        pixel_values,
+        question,
+        history=None,
+        return_history=False,
+        num_patches_list=None,
+        IMG_START_TOKEN="<img>",
+        IMG_END_TOKEN="</img>",
+        IMG_CONTEXT_TOKEN="<IMG_CONTEXT>",
+        verbose=False,
+    ) -> str:
+        if history is None and pixel_values is not None and "<image>" not in question:
+            question = "<image>\n" + question
+
+        if num_patches_list is None:
+            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
+        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
+
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.model.img_context_token_id = img_context_token_id
+
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+
+        history = [] if history is None else history
+        for old_question, old_answer in history:
+            template.append_message(template.roles[0], old_question)
+            template.append_message(template.roles[1], old_answer)
+        template.append_message(template.roles[0], question)
+        template.append_message(template.roles[1], None)
+        query = template.get_prompt()
+
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f"dynamic ViT batch size: {image_bs}")
+
+        for num_patches in num_patches_list:
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace("<image>", image_tokens, 1)
+
+        return query
diff --git a/QEfficient/transformers/models/InternVL/modeling_internvl_chat_vision.py b/QEfficient/transformers/models/InternVL/modeling_internvl_chat_vision.py
new file mode 100755
index 000000000..075b76438
--- /dev/null
+++ b/QEfficient/transformers/models/InternVL/modeling_internvl_chat_vision.py
@@ -0,0 +1,945 @@
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2024 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+#
-------------------------------------------------------- + +import warnings +from typing import List, Optional, Tuple, Union + +import requests +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from einops import rearrange +from timm.models.layers import DropPath +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers import ( + GenerationConfig, + LlamaForCausalLM, + Qwen2ForCausalLM, +) +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging + +from QEfficient.utils import Constants, constants + +from .configuration_internvl_chat import InternVisionConfig, InternVLChatConfig +from .conversation import get_conv_template + +# from .modeling_intern_vl_vision import InternVisionModel, has_flash_attn + +has_flash_attn = False +logger = logging.get_logger(__name__) + + +def version_cmp(v1, v2, op="eq"): + import operator + + from packaging import version + + op_func = getattr(operator, op) + return op_func(version.parse(v1), version.parse(v2)) + + +class InternMLP(nn.Module): + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + self.act = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class InternAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.use_flash_attn = config.use_flash_attn and has_flash_attn + if config.use_flash_attn and not has_flash_attn: + print("Warning: Flash Attention is not available, use_flash_attn is set to False.") + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + + self.scale = self.head_dim**-0.5 + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias) + self.attn_drop = nn.Dropout(config.attention_dropout) + self.proj_drop = nn.Dropout(config.dropout) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + # if self.use_flash_attn: + # self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout) + # self.proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _naive_attn(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + if self.qk_normalization: + B_, H_, N_, D_ = q.shape + q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2) + k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2) + + attn = (q * self.scale) @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def _flash_attn(self, x, key_padding_mask=None, need_weights=False): + qkv = self.qkv(x) + qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads) + + if self.qk_normalization: + q, k, v = qkv.unbind(2) + q = self.q_norm(q.flatten(-2, -1)).view(q.shape) + k = self.k_norm(k.flatten(-2, -1)).view(k.shape) + qkv = torch.stack([q, k, v], dim=2) + + context, _ = self.inner_attn(qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False) + outs = self.proj(rearrange(context, "b s h d -> b s (h d)")) + outs = self.proj_drop(outs) + return outs + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states) + return x + + +class InternRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class InternVLChatModel(PreTrainedModel): + config_class = InternVLChatConfig + main_input_name = "pixel_values" + base_model_prefix = "language_model" + _supports_flash_attn_2 = True + _no_split_modules = ["InternVisionModel", "LlamaDecoderLayer", "Qwen2DecoderLayer"] + + def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True): + super().__init__(config) + + assert version_cmp(transformers.__version__, "4.37.0", "ge") + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + self.select_layer = config.select_layer + self.template = config.template + self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio**2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + use_flash_attn = use_flash_attn if has_flash_attn else False + 
config.vision_config.use_flash_attn = True if use_flash_attn else False + config.llm_config._attn_implementation = "flash_attention_2" if use_flash_attn else "eager" + + logger.info(f"num_image_token: {self.num_image_token}") + logger.info(f"ps_version: {self.ps_version}") + if vision_model is not None: + self.vision_model = vision_model + else: + self.vision_model = InternVisionModel(config.vision_config) + if language_model is not None: + self.language_model = language_model + else: + if config.llm_config.architectures[0] == "LlamaForCausalLM": + self.language_model = LlamaForCausalLM(config.llm_config) + elif config.llm_config.architectures[0] == "Qwen2ForCausalLM": + self.language_model = Qwen2ForCausalLM(config.llm_config) + else: + raise NotImplementedError(f"{config.llm_config.architectures[0]} is not implemented.") + + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.llm_config.hidden_size + + self.mlp1 = nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size), + nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size), + ) + + self.img_context_token_id = None + self.conv_template = get_conv_template(self.template) + self.system_message = self.conv_template.system_message + + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + image_flags: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + # import ipdb; ipdb.set_trace() + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + image_flags = image_flags.squeeze(-1) + input_embeds = self.language_model.get_input_embeddings()(input_ids).clone() + + vit_embeds = self.extract_feature(pixel_values) + vit_embeds = vit_embeds[image_flags == 1] + vit_batch_size = pixel_values.shape[0] + + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0: + print( + f"dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}" + ) + + input_ids = input_ids.reshape(B * N) + selected = input_ids == self.img_context_token_id + try: + input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C) + except Exception as e: + vit_embeds = vit_embeds.reshape(-1, C) + print( + f"warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, " + f"vit_embeds.shape={vit_embeds.shape}" + ) + n_token = selected.sum() + input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token] + + input_embeds = input_embeds.reshape(B, N, C) + + outputs = self.language_model( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, 
:].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2) + x = x.view(n, int(h * scale_factor), int(w * scale_factor), int(c / (scale_factor * scale_factor))) + if self.ps_version == "v1": + warnings.warn( + "In ps_version 'v1', the height and width have not been swapped back, " + "which results in a transposed image." + ) + else: + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def extract_feature(self, pixel_values): + if self.select_layer == -1: + vit_embeds = self.vision_model( + pixel_values=pixel_values, output_hidden_states=False, return_dict=True + ).last_hidden_state + else: + vit_embeds = self.vision_model( + pixel_values=pixel_values, output_hidden_states=True, return_dict=True + ).hidden_states[self.select_layer] + vit_embeds = vit_embeds[:, 1:, :] + + h = w = int(vit_embeds.shape[1] ** 0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) + return vit_embeds + + def batch_chat( + self, + tokenizer, + pixel_values, + questions, + generation_config, + num_patches_list=None, + history=None, + return_history=False, + IMG_START_TOKEN="", + IMG_END_TOKEN="", + IMG_CONTEXT_TOKEN="", + verbose=False, + image_counts=None, + ): + if history is not None or return_history: + print("Now multi-turn chat is not supported in batch_chat.") + raise NotImplementedError + + if image_counts is not None: + num_patches_list = image_counts + print("Warning: `image_counts` is deprecated. 
Please use `num_patches_list` instead.") + + img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN) + self.img_context_token_id = img_context_token_id + + if verbose and pixel_values is not None: + image_bs = pixel_values.shape[0] + print(f"dynamic ViT batch size: {image_bs}") + + queries = [] + for idx, num_patches in enumerate(num_patches_list): + question = questions[idx] + if pixel_values is not None and "" not in question: + question = "\n" + question + template = get_conv_template(self.template) + template.system_message = self.system_message + template.append_message(template.roles[0], question) + template.append_message(template.roles[1], None) + query = template.get_prompt() + + image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN + query = query.replace("", image_tokens, 1) + queries.append(query) + + tokenizer.padding_side = "left" + model_inputs = tokenizer(queries, return_tensors="pt", padding=True) + input_ids = model_inputs["input_ids"].to(self.device) + attention_mask = model_inputs["attention_mask"].to(self.device) + eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip()) + generation_config["eos_token_id"] = eos_token_id + generation_output = self.generate( + pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, **generation_config + ) + responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True) + responses = [response.split(template.sep.strip())[0].strip() for response in responses] + return responses + + def chat( + self, + tokenizer, + pixel_values, + question, + generation_config, + history=None, + return_history=False, + num_patches_list=None, + IMG_START_TOKEN="", + IMG_END_TOKEN="", + IMG_CONTEXT_TOKEN="", + verbose=False, + ): + if history is None and pixel_values is not None and "" not in question: + question = "\n" + question + + if num_patches_list is None: + num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else [] + assert pixel_values is None or len(pixel_values) == sum(num_patches_list) + + img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN) + self.img_context_token_id = img_context_token_id + + template = get_conv_template(self.template) + template.system_message = self.system_message + eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip()) + + history = [] if history is None else history + for old_question, old_answer in history: + template.append_message(template.roles[0], old_question) + template.append_message(template.roles[1], old_answer) + template.append_message(template.roles[0], question) + template.append_message(template.roles[1], None) + query = template.get_prompt() + + if verbose and pixel_values is not None: + image_bs = pixel_values.shape[0] + print(f"dynamic ViT batch size: {image_bs}") + + for num_patches in num_patches_list: + image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN + query = query.replace("", image_tokens, 1) + + model_inputs = tokenizer(query, return_tensors="pt") + input_ids = model_inputs["input_ids"].to(self.device) + attention_mask = model_inputs["attention_mask"].to(self.device) + generation_config["eos_token_id"] = eos_token_id + generation_output = self.generate( + pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, **generation_config + ) + response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0] + response = 
response.split(template.sep.strip())[0].strip() + history.append((question, response)) + if return_history: + return response, history + else: + query_to_print = query.replace(IMG_CONTEXT_TOKEN, "") + query_to_print = query_to_print.replace(f"{IMG_START_TOKEN}{IMG_END_TOKEN}", "") + if verbose: + print(query_to_print, response) + return response + + @torch.no_grad() + def generate( + self, + pixel_values: Optional[torch.FloatTensor] = None, + input_ids: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + visual_features: Optional[torch.FloatTensor] = None, + generation_config: Optional[GenerationConfig] = None, + output_hidden_states: Optional[bool] = None, + **generate_kwargs, + ) -> torch.LongTensor: + assert self.img_context_token_id is not None + if pixel_values is not None: + if visual_features is not None: + vit_embeds = visual_features + else: + vit_embeds = self.extract_feature(pixel_values) + input_embeds = self.language_model.get_input_embeddings()(input_ids) + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + input_ids = input_ids.reshape(B * N) + selected = input_ids == self.img_context_token_id + assert selected.sum() != 0 + input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) + + input_embeds = input_embeds.reshape(B, N, C) + else: + input_embeds = self.language_model.get_input_embeddings()(input_ids) + + outputs = self.language_model.generate( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + generation_config=generation_config, + output_hidden_states=output_hidden_states, + use_cache=True, + **generate_kwargs, + ) + + return outputs + + +class QEffInternVLChatModel(InternVLChatModel): + config_class = InternVLChatConfig + main_input_name = "pixel_values" + base_model_prefix = "language_model" + _supports_flash_attn_2 = True + _no_split_modules = ["InternVisionModel", "LlamaDecoderLayer", "Qwen2DecoderLayer"] + + def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True): + super().__init__(config) + + assert version_cmp(transformers.__version__, "4.37.0", "ge") + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + self.select_layer = config.select_layer + self.template = config.template + self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio**2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + use_flash_attn = use_flash_attn if has_flash_attn else False + config.vision_config.use_flash_attn = True if use_flash_attn else False + config.llm_config._attn_implementation = "flash_attention_2" if use_flash_attn else "eager" + + logger.info(f"num_image_token: {self.num_image_token}") + logger.info(f"ps_version: {self.ps_version}") + if vision_model is not None: + self.vision_model = vision_model + else: + self.vision_model = InternVisionModel(config.vision_config) + if language_model is not None: + self.language_model = language_model + else: + if config.llm_config.architectures[0] == "LlamaForCausalLM": + self.language_model = LlamaForCausalLM(config.llm_config) + elif config.llm_config.architectures[0] == "Qwen2ForCausalLM": + self.language_model = Qwen2ForCausalLM(config.llm_config) + else: + raise NotImplementedError(f"{config.llm_config.architectures[0]} is not implemented.") + + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = 
config.llm_config.hidden_size + + self.mlp1 = nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size), + nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size), + ) + + self.img_context_token_id = None + self.conv_template = get_conv_template(self.template) + self.system_message = self.conv_template.system_message + + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + image_flags: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + input_embeds = self.language_model.get_input_embeddings()(input_ids).clone() + vit_embeds = self.extract_feature(pixel_values) + + B, N, C = input_embeds.shape + image_input_embeds = input_embeds.reshape(B * N, C) + + image_input_ids = input_ids.reshape(B * N) + selected = image_input_ids == self.img_context_token_id + indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1 + indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1) + image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1] + + image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds) + + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds) + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + # return CausalLMOutputWithPast( + # loss=loss, + # logits=logits, + # past_key_values=outputs.past_key_values, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + # ) + + outputs_final = CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + return outputs_final.logits, pixel_values, outputs_final.past_key_values + + def generate_inputs(self, processor, **kwargs): + num_layers = self.config.llm_config.num_hidden_layers + num_key_value_heads = self.config.llm_config.num_key_value_heads + head_dim = self.config.llm_config.hidden_size // self.config.llm_config.num_attention_heads + ctx_len = Constants.CTX_LEN_VLM_INTERN + bs: int 
= constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + pixel_values = [] + for i in range(1, 2): + url = f"https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-{i}-2048.jpg" + img = requests.get(url, stream=True).raw + pixel_values.append(processor.load_image(img, max_num=12)) + question = "\nPlease describe the image in detail." + pixel_values = torch.cat(pixel_values, dim=0) + query = processor(processor.tokenizer, pixel_values, question) + inputs = dict(processor.tokenizer(query, return_tensors="pt")) + inputs["pixel_values"] = pixel_values + inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) + inputs["past_key_values"] = [] + + for i in range(num_layers): + inputs["past_key_values"].append( + ( + torch.zeros(bs, num_key_value_heads, ctx_len, head_dim), + torch.zeros(bs, num_key_value_heads, ctx_len, head_dim), + ) + ) + output_names = [ + "logits", + "pixel_values_RetainedState", + *[f"past_{kv}.{i}_RetainedState" for i in range(num_layers) for kv in ["key", "value"]], + ] + dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "position_ids": {0: "batch_size", 1: "seq_len"}, + # "pixel_values": {0: "img_batch_size"}, + } + for i in range(num_layers): + dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} + dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + # Avoid issues due to index out of range + inputs["position_ids"] = torch.full(inputs["position_ids"].shape, ctx_len - 1) + return inputs, dynamic_axes, output_names + + +class InternVisionEmbeddings(nn.Module): + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter( + torch.randn(1, 1, self.embed_dim), + ) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = ( + pos_embed.float() + .reshape(1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1) + .permute(0, 3, 1, 2) + ) + pos_embed = ( + F.interpolate(pos_embed, size=(H, W), mode="bicubic", align_corners=False) + .reshape(1, -1, H * W) + .permute(0, 2, 1) + .to(target_dtype) + ) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + position_embedding = torch.cat( + [self.position_embedding[:, :1, :], self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)], + dim=1, + ) + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +class QEffInternVisionEmbeddings(InternVisionEmbeddings): + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + 
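The InternVL forward above splices vision features into the text embedding stream with a cumulative-sum gather followed by `torch.where`, rather than boolean advanced indexing, so the traced graph keeps static shapes for export. Below is a toy-sized sketch of that pattern; shapes and values are made up for illustration and this is not the model code.

```python
import torch

# Placeholder-token scatter, mirroring the pattern in the InternVL forward above:
# every position holding the image-context token is overwritten with the next
# row of the vision embeddings; text positions are left untouched.
IMG_TOKEN = 9
input_ids = torch.tensor([[1, 9, 9, 2, 9, 3]])                   # (B=1, N=6): three image slots
inputs_embeds = torch.zeros(1, 6, 4)                             # text embeddings (zeros for clarity)
vit_embeds = torch.arange(12, dtype=torch.float).view(1, 3, 4)   # three vision feature rows

selected = input_ids == IMG_TOKEN                                # (1, 6) bool mask
indices1 = selected.to(torch.int64).cumsum(1) - 1                # running index into vision rows
indices0 = torch.arange(selected.shape[0]).view(-1, 1)           # batch index
image_features_expanded = vit_embeds.reshape(1, -1, 4)[indices0, indices1]

merged = torch.where(selected.unsqueeze(-1), image_features_expanded, inputs_embeds)
print(merged[0, 1])  # -> first vision row [0., 1., 2., 3.]
print(merged[0, 0])  # -> unchanged text embedding (zeros)
```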
self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter( + torch.randn(1, 1, self.embed_dim), + ) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = ( + pos_embed.float() + .reshape(1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1) + .permute(0, 3, 1, 2) + ) + pos_embed = ( + F.interpolate(pos_embed, size=(H, W), mode="bicubic", align_corners=False) + .reshape(1, -1, H * W) + .permute(0, 2, 1) + .to(target_dtype) + ) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + + pos_embed = self.position_embedding[:, 1:, :] + target_dtype = pos_embed.dtype + pos_embed = ( + pos_embed.float() + .reshape(1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1) + .permute(0, 3, 1, 2) + ) + pos_embed = ( + F.interpolate(pos_embed, size=(height, width), mode="bilinear", align_corners=False) + .reshape(1, -1, height * width) + .permute(0, 2, 1) + .to(target_dtype) + ) + + position_embedding = torch.cat([self.position_embedding[:, :1, :], pos_embed], dim=1) + + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +class InternVisionModel(PreTrainedModel): + main_input_name = "pixel_values" + _supports_flash_attn_2 = True + config_class = InternVisionConfig + _no_split_modules = ["InternVisionEncoderLayer"] + + def __init__(self, config: InternVisionConfig): + super().__init__(config) + self.config = config + + self.embeddings = InternVisionEmbeddings(config) + self.encoder = InternVisionEncoder(config) + + def resize_pos_embeddings(self, old_size, new_size, patch_size): + pos_emb = self.embeddings.position_embedding + _, num_positions, embed_dim = pos_emb.shape + cls_emb = pos_emb[:, :1, :] + pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2) + pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode="bicubic", align_corners=False) + pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1) + pos_emb = torch.cat([cls_emb, pos_emb], dim=1) + self.embeddings.position_embedding = nn.Parameter(pos_emb) + self.embeddings.image_size = new_size + logger.info("Resized position embeddings from {} to {}".format(old_size, new_size)) + + def get_input_embeddings(self): + return self.embeddings + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_embeds: Optional[torch.FloatTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + 
) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None and pixel_embeds is None: + raise ValueError("You have to specify pixel_values or pixel_embeds") + + if pixel_embeds is not None: + hidden_states = pixel_embeds + else: + if len(pixel_values.shape) == 4: + hidden_states = self.embeddings(pixel_values) + else: + raise ValueError(f"wrong pixel_values size: {pixel_values.shape}") + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = encoder_outputs.last_hidden_state + pooled_output = last_hidden_state[:, 0, :] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class InternVisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`InternEncoderLayer`]. + + Args: + config (`InternConfig`): + The corresponding vision configuration for the `InternEncoder`. + """ + + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)] + self.layers = nn.ModuleList( + [InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = True + + def forward( + self, + inputs_embeds, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
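Both `_get_pos_embed` and the QEff override above resize the learned position grid to the runtime patch grid by reshaping to 2D, interpolating, and flattening back; the QEff variant inlines this in `forward` and swaps `bicubic` for `bilinear`, presumably for export/compiler friendliness. A standalone sketch of the reshape-interpolate-flatten pattern, with illustrative sizes only:

```python
import torch
import torch.nn.functional as F

# Resize a learned (1, num_patches, C) position embedding from a 16x16 patch
# grid to a 24x24 grid, following the same steps as the embeddings above.
C, old_grid, new_grid = 8, 16, 24
pos_embed = torch.randn(1, old_grid * old_grid, C)

grid = pos_embed.reshape(1, old_grid, old_grid, C).permute(0, 3, 1, 2)        # (1, C, 16, 16)
grid = F.interpolate(grid, size=(new_grid, new_grid), mode="bilinear", align_corners=False)
resized = grid.reshape(1, C, new_grid * new_grid).permute(0, 2, 1)            # (1, 576, C)

assert resized.shape == (1, new_grid * new_grid, C)
```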
+ """ + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + hidden_states = inputs_embeds + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = torch.utils.checkpoint.checkpoint(encoder_layer, hidden_states) + else: + layer_outputs = encoder_layer( + hidden_states, + ) + hidden_states = layer_outputs + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states] if v is not None) + return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=encoder_states) + + +class InternVisionEncoderLayer(nn.Module): + def __init__(self, config: InternVisionConfig, drop_path_rate: float): + super().__init__() + self.embed_dim = config.hidden_size + self.intermediate_size = config.intermediate_size + self.norm_type = config.norm_type + + self.attn = InternAttention(config) + self.mlp = InternMLP(config) + self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) + self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) + + self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim)) + self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim)) + self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + + def forward( + self, + hidden_states: torch.Tensor, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]: + """ + Args: + hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)` + """ + hidden_states = hidden_states + self.drop_path1( + self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1 + ) + + hidden_states = hidden_states + self.drop_path2( + self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2 + ) + + return hidden_states + + +NORM2FN = { + "rms_norm": InternRMSNorm, + "layer_norm": nn.LayerNorm, +} diff --git a/QEfficient/transformers/models/llava/__init__.py b/QEfficient/transformers/models/llava/__init__.py new file mode 100755 index 000000000..da26921c5 --- /dev/null +++ b/QEfficient/transformers/models/llava/__init__.py @@ -0,0 +1,7 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py new file mode 100755 index 000000000..4f036838d --- /dev/null +++ b/QEfficient/transformers/models/llava/modeling_llava.py @@ -0,0 +1,256 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- +from typing import List, Optional, Tuple, Union + +import requests +import torch +import torch.utils.checkpoint +from PIL import Image +from torch import nn +from transformers.models.llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, + LlavaForConditionalGeneration, + logger, +) + +from QEfficient.utils import Constants, constants + + +class QEffLlavaForConditionalGeneration(LlavaForConditionalGeneration): + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[int] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + + Returns: + + Example: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, LlavaForConditionalGeneration + + >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf") + >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + + >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "USER: \nWhat's the content of the image? 
ASSISTANT: The image features a busy city street with a stop sign prominently displayed" + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + legacy_processing = False + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing + # not very reliable, but we don't expect one to actually pass 500+ images for one prompt + # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True + legacy_processing = ( + (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length + ) or (input_ids.shape[-1] == 1 and pixel_values is not None) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in LLaVa should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " + "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
+ ) + # prefill stage vs decoding stage (legacy behavior copied) + if input_ids.shape[1] != 1: + inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( + image_features, inputs_embeds, input_ids, attention_mask, labels + ) + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) + else: + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + + # Get the target length + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ + -target_length: + ] + + # TODO: @raushan retain only the new behavior after v4.47 + else: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + mask = input_ids == self.config.image_token_index + indices1 = mask.to(torch.int64).cumsum(1) - 1 + indices0 = torch.arange(mask.shape[0]).view(-1, 1) + image_features_expanded = image_features[indices0, indices1] + image_inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds) + # *where to skip image encoder for decode* + inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_inputs_embeds) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. 
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) + shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return logits, pixel_values, outputs.past_key_values + + def generate_inputs(self, processor, **kwargs): + num_layers = self.config.text_config.num_hidden_layers + num_key_value_heads = self.config.text_config.num_key_value_heads + head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads + ctx_len = Constants.CTX_LEN_VLM_INTERN + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + img = Image.open(requests.get(Constants.BASE_URL_LLAVA, stream=True).raw) + prompt = processor.apply_chat_template( + [{"role": "user", "content": [{"type": "text", "text": Constants.PROMPT_LLAVA}, {"type": "image"}]}], + add_generation_prompt=True, + ) + inputs = dict(processor(images=img, text=prompt, return_tensors="pt")) + inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1) + inputs["past_key_values"] = [] + for i in range(num_layers): + inputs["past_key_values"].append( + ( + torch.zeros(bs, num_key_value_heads, ctx_len, head_dim), + torch.zeros(bs, num_key_value_heads, ctx_len, head_dim), + ) + ) + inputs["position_ids"] = torch.full(inputs["position_ids"].shape, ctx_len - 1) + dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_len"}, + "position_ids": {0: "batch_size", 1: "seq_len"}, + "pixel_values": {0: "img_batch_size", 2: "img_size", 3: "img_size"}, + } + for i in range(num_layers): + dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"} + dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"} + + output_names = [ + "logits", + "pixel_values_RetainedState", + *[f"past_{kv}.{i}_RetainedState" for i in range(num_layers) for kv in ["key", "value"]], + ] + return inputs, dynamic_axes, output_names diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index c2e3777bc..37f6469be 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -14,16 +14,30 @@ import numpy as np import torch import torch.nn as nn -from transformers import AutoModel, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers import ( + AutoConfig, + AutoModel, + AutoModelForCausalLM, + AutoModelForImageTextToText, + AutoProcessor, + AutoTokenizer, + PreTrainedTokenizer, + PreTrainedTokenizerFast, + TextStreamer, +) import QEfficient from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform from QEfficient.generation.cloud_infer import QAICInferenceSession +from QEfficient.generation.text_generation_inference import write_io_files +from QEfficient.transformers.models.InternVL.internprocessor import InternProcessor from QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, 
KVCacheTransform, SpDTransform from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform -from QEfficient.utils import constants, get_padding_shape_from_config +from QEfficient.utils import Constants, constants, get_padding_shape_from_config + +# from QEfficient.transformers.models.phi3_vision.modeling_phi3_vision import Phi3VModelWrapper from QEfficient.utils.cache import to_hashable logger = logging.getLogger(__file__) @@ -421,6 +435,440 @@ def generate( raise NotImplementedError("Only AI_100 runtime is supported right now via generate API") +class QEFFAutoModelForImageTextToText(QEFFTransformersBase): + """ + The QEFF class is designed for manipulating any causal language model from the HuggingFace hub. + Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization. + + ``Mandatory`` Args: + :model (nn.Module): PyTorch model + :continuous_batching (bool): Weather this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. + :is_tlm (bool): Whether this is a Speculative Decoding Target Language Model. If set to True, `num_logits_to_keep` input array will have to be fed to control the number of returned logits during prefill/decode. + + + .. code-block:: python + + from QEfficient import QEFFAutoModelForImageTextToText + from transformers import AutoTokenizer + + model_name = "llava" + model = QEFFAutoModelForCausalLM.from_pretrained(model_name, num_hidden_layers=2) + model.compile(prefill_seq_len=1024, ctx_len=1280, num_cores=16, num_devices=1) + + processor = AutoProcessor.from_pretrained(model_name) + model.generate(inputs, streamer, device_ids, is) + """ + + _hf_auto_class = AutoModelForImageTextToText + _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform] + _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + + def __init__( + self, + model: nn.Module, + continuous_batching: bool = False, + is_tlm: bool = False, + **kwargs, + ): + if continuous_batching: + raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + if is_tlm: + raise NotImplementedError("Speculative Decoding is not supported for image-text-to-text models yet.") + + if kwargs.pop("full_batch_size", None): + raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + + super().__init__(model) + if model.config.architectures[0] == "InternVLChatModel": + self.model.config.llm_config.use_cache = True + self.num_layers = model.config.llm_config.num_hidden_layers + self.num_key_value_heads = model.config.llm_config.num_key_value_heads + self.head_dim = model.config.llm_config.hidden_size // model.config.llm_config.num_attention_heads + self.pad_token_id = model.config.llm_config.pad_token_id + self.ctx_len = Constants.CTX_LEN_VLM_INTERN + elif model.config.architectures[0] == "LlavaForConditionalGeneration": + self.model.config.use_cache = True + self.num_layers = model.config.text_config.num_hidden_layers + self.num_key_value_heads = model.config.text_config.num_key_value_heads + self.head_dim = model.config.text_config.hidden_size // model.config.text_config.num_attention_heads + self.pad_token_id = 
model.config.pad_token_id + self.ctx_len = Constants.CTX_LEN_VLM_LLAVA + + self.continuous_batching = continuous_batching + self.is_tlm = is_tlm + self.kwargs = kwargs + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path, + continuous_batching: bool = False, + is_tlm: bool = False, + kv_offload: bool = False, + *args, + **kwargs, + ): + """ + This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM. + Once the model is initialized, you can use other methods such as export, compile, and generate on the same object. + + Args: + :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory. + :continuous_batching (bool): Whether this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later. + :is_tlm (bool): Whether this is a Speculative Decoding Target Language Model. If set to True, `num_logits_to_keep` input array will have to be fed to control the number of returned logits during prefill/decode. + :args, kwargs: Additional arguments to pass to transformers.AutoModelForCausalLM. + + .. code-block:: python + + from QEfficient import QEFFAutoModelForCausalLM + from transformers import AutoTokenizer + + # Initialize the model using from_pretrained similar to transformers.AutoModelForCausalLM + model_name = "gpt2" + model = QEFFAutoModelForCausalLM.from_pretrained(model_name) + + # Now you can directly compile the model for Cloud AI 100 + model.compile(num_cores=16) # Considering you have a Cloud AI 100 Standard SKU + + # You can now execute the model + tokenizer = AutoTokenizer.from_pretrained(model_name) + model.generate(prompts=["Hi there!!"], tokenizer=tokenizer) + """ + if kwargs.pop("full_batch_size", None): + raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + if pretrained_model_name_or_path == "OpenGVLab/InternVL2_5-1B": + cls._hf_auto_class = AutoModelForCausalLM + + # TODO : remove below after testing + model_config = {"model_name": pretrained_model_name_or_path} + model_config["n_layer_text"] = 1 + model_config["n_layer_vision"] = 1 + if model_config["model_name"] == "OpenGVLab/InternVL2_5-1B": + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) + config.llm_config.use_cache = True + config.llm_config.num_hidden_layers = model_config["n_layer_text"] + config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + config.llm_config._attn_implementation = "eager" + config.vision_config.use_flash_attn = "false" + elif model_config["model_name"] == "llava-hf/llava-1.5-7b-hf": + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) + config.text_config.num_hidden_layers = model_config["n_layer_text"] + config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + config._attn_implementation = "eager" + config.vision_config.use_flash_attn = "false" + # TODO : remove above after testing + self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, config=config, *args, **kwargs) + self.continuous_batching = continuous_batching + + if pretrained_model_name_or_path == "OpenGVLab/InternVL2_5-1B": + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=True, use_fast=False + ) + self.processor = InternProcessor(self.model, tokenizer) + else: + 
self.processor = AutoProcessor.from_pretrained( + pretrained_model_name_or_path, padding_side="right", trust_remote_code=True + ) + self.tokenizer = self.processor.tokenizer + self.kv_offload = kv_offload + self.is_tlm = is_tlm + return self + + @property + def model_hash(self) -> str: + # Compute the hash with: model_config, continuous_batching, transforms + mhash = hashlib.sha256() + mhash.update(to_hashable(self.model.config.to_diff_dict())) + mhash.update(to_hashable({"continuous_batching": self.continuous_batching})) + mhash.update(to_hashable({"is_tlm": self.is_tlm})) + mhash.update(to_hashable(self._transform_names())) + mhash = mhash.hexdigest()[:16] + return mhash + + def export( + self, + export_dir: Optional[str] = None, + **kwargs, + ) -> str: + """ + Exports the model to ``ONNX`` format using ``torch.onnx.export``. + + ``Optional`` Args: + :export_dir (str, optional): The directory path to store ONNX-graph. + :**kwargs: Keyword arguments for ``_generate_inputs``. If "ctx_len" is passed, it will be used as the context length. Otherwise, it will be set to 1280. + + Returns: + :str: Path of the generated ``ONNX`` graph. + """ + + example_inputs, dynamic_axes, output_names = self.model.generate_inputs(self.processor, **kwargs) + return self._export( + example_inputs, + output_names, + dynamic_axes, + export_dir=export_dir, + ) + + def compile( + self, + onnx_path: Optional[str] = None, + compile_dir: Optional[str] = None, + *, + prefill_seq_len: int = 1024, + ctx_len: int = 1280, + batch_size: int = 1, + full_batch_size: Optional[int] = None, + kv_cache_batch_size: Optional[int] = None, + num_devices: int = 1, + num_cores: int = 16, # FIXME: Make this mandatory arg + mxfp6_matmul: bool = False, + mxint8_kv_cache: bool = False, + num_speculative_tokens: Optional[int] = None, + enable_qnn: bool = False, + qnn_config: Optional[str] = None, + **compiler_options, + ) -> str: + """ + This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package. + If the model has not been exported yet, this method will handle the export process. + You can pass any other arguments that the `qaic-exec` takes as extra kwargs. + + ``Optional`` Args: + :onnx_path (str, optional): Path to pre-exported onnx model. + :compile_dir (str, optional): Path for saving the qpc generated. + :num_cores (int): Number of cores used to compile the model. + :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1. + :batch_size (int, optional): Batch size. ``Defaults to 1``. + :prefill_seq_len (int, optional): The length of the Prefill prompt should be less that ``prefill_seq_len``. ``Defaults to 32``. + :ctx_len (int, optional): Maximum ``ctx`` that the compiled model can remember. ``Defaults to 128``. + :full_batch_size (int, optional): Continuous batching batch size. + :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. + :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``. + :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. + :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. + :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. 
+ :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + + Returns: + :str: Path of the compiled ``qpc`` package. + """ + + kv_cache_batch_size = ( + kv_cache_batch_size if kv_cache_batch_size else (full_batch_size if full_batch_size else batch_size) + ) + # Define prefill specialization + prefill_specialization = { + # Prefill is always run with single BS for continuous batching. + "batch_size": 1 if self.continuous_batching else batch_size, + "seq_len": prefill_seq_len, + "ctx_len": ctx_len, + "img_batch_size": 1, + "img_size": Constants.IMG_SIZE, + # TODO: should be renamed to kv_cache_batch_size in specialzation too + } + prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ... + if self.continuous_batching: + prefill_specialization.update({"full_batch_size": kv_cache_batch_size}) + else: + prefill_specialization.update({"batch_size": kv_cache_batch_size}) + prefill_specialization.update({"full_batch_exec_size": full_batch_size}) if full_batch_size else ... + specializations = [ + prefill_specialization, + ] + + # Skip decode specialization if we are not in continuous batching and prefill_seq_len=1 as this repeats prefill specialization + if prefill_seq_len != 1 or self.continuous_batching: + decode_specialization = { + "batch_size": full_batch_size if self.continuous_batching else batch_size, + "seq_len": num_speculative_tokens + 1 if self.is_tlm else 1, + "ctx_len": ctx_len, + "img_batch_size": 1, + "img_size": Constants.IMG_SIZE, + } + if self.continuous_batching: + decode_specialization.update({"full_batch_size": kv_cache_batch_size}) + else: + decode_specialization.update({"batch_size": kv_cache_batch_size}) + decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... + specializations.append(decode_specialization) + + if enable_qnn: + if compiler_options: + logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only") + + qpc_path = self._qnn_compile( + onnx_path, + compile_dir, + specializations=specializations, + prefill_seq_len=prefill_seq_len, + ctx_len=ctx_len, + batch_size=batch_size, + full_batch_size=full_batch_size, + mdp_ts_num_devices=num_devices, + num_cores=num_cores, + mxfp6_matmul=mxfp6_matmul, + mxint8_kv_cache=mxint8_kv_cache, + qnn_config=qnn_config, + ) + else: + # Custom IO + custom_io = {} + kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16" + custom_io["pixel_values"] = kv_cache_dtype + custom_io["pixel_values_RetainedState"] = kv_cache_dtype + for suffix in ["", "_RetainedState"]: + for i in range(self.num_layers): + for kv in ["key", "value"]: + custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype + + qpc_path = self._compile( + onnx_path, + compile_dir, + compile_only=True, + retained_state=True, + specializations=specializations, + convert_to_fp16=True, + mxfp6_matmul=mxfp6_matmul, + custom_io=custom_io, + mdp_ts_num_devices=num_devices, + num_speculative_tokens=num_speculative_tokens, + aic_num_cores=num_cores, + **compiler_options, + ) + return qpc_path + + def generate( + self, + inputs: torch.Tensor, + streamer: Optional[TextStreamer], + device_ids: List[int] = None, + runtime_ai100: bool = True, + ) -> Union[torch.Tensor, np.ndarray]: + """ + This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. 
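For reference, a minimal end-to-end sketch of the export/compile/generate flow this class defines. It assumes the API exactly as added in this patch (the processor is attached inside `from_pretrained`, and `generate` runs the compiled QPC on Cloud AI 100 hardware); the model card, image URL, and prompt are illustrative only.

```python
import requests
from PIL import Image
from transformers import TextStreamer

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText

model = QEFFAutoModelForImageTextToText.from_pretrained("llava-hf/llava-1.5-7b-hf")
model.export()
model.compile(prefill_seq_len=1024, ctx_len=1280, num_cores=16, num_devices=1)

# Build multimodal inputs with the processor attached during from_pretrained().
img = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
prompt = model.processor.apply_chat_template(
    [{"role": "user", "content": [{"type": "text", "text": "What's in the image?"}, {"type": "image"}]}],
    add_generation_prompt=True,
)
inputs = model.processor(images=img, text=prompt, return_tensors="pt")

streamer = TextStreamer(model.processor.tokenizer)
print(model.generate(inputs, streamer))
```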
+ ``Mandatory`` Args: + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + ``optional`` Args: + :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model + :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime. + Returns: + :dict: Output from the ``AI_100`` or ``PyTorch`` runtime. + """ + + # AI_100 runtime + if runtime_ai100: + if not isinstance(self.qpc_path, Path): + raise TypeError("Please run compile API first!") + + return self.cloud_ai_100_vlm_generate(inputs=inputs, device_ids=device_ids) + + def cloud_ai_100_vlm_generate( + self, + inputs: torch.Tensor, + device_ids: List[int] = [0], + streamer: TextStreamer = None, + write_io_dir: Optional[str] = None, + ) -> np.ndarray: + """ + Generates features with list of prompts using AI 100 runtime. + + ``Mandatory`` Args: + :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution. + ``Optional`` Args: + device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0]. + + Returns: + np.ndarray: A list of dictionaries containing the generated output features. + """ + + if self.qpc_session is None: + self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids) + self.batch_size = self.qpc_session.bindings[0].dims[0] + self.seq_len = self.qpc_session.bindings[0].dims[1] + # Skip inputs/outputs + self.qpc_session.skip_buffers( + [x for x in self.qpc_session.input_names + self.qpc_session.output_names if x.startswith("past_")] + + ["pixel_values_RetainedState", "image_sizes_RetainedState"] + ) + + # Read prompt and ctx len from session + batch_size = max( + [x[self.qpc_session.binding_index_map["input_ids"]][1][0] for x in self.qpc_session.allowed_shapes] + + [self.qpc_session.bindings[self.qpc_session.binding_index_map["input_ids"]].dims[0]] + ) + prefill_seq_len = max( + [x[self.qpc_session.binding_index_map["input_ids"]][1][1] for x in self.qpc_session.allowed_shapes] + + [self.qpc_session.bindings[self.qpc_session.binding_index_map["input_ids"]].dims[1]] + ) + input_len = inputs["attention_mask"].sum(1, keepdims=True) + padded_len = inputs["input_ids"].shape[1] + + num_chunks = -(padded_len // -prefill_seq_len) # ceil divide without float + padded_len = num_chunks * prefill_seq_len # Convert to a multiple of prompt_len + generation_len = self.ctx_len - input_len.max() # in standalone this is tensor + assert generation_len > 0, "generation length should be greater than zero" + generated_ids = np.full((batch_size, generation_len + 1), self.processor.tokenizer.pad_token_id) + + input_ids = inputs["input_ids"] + input_ids_size = input_ids.shape[1] + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], + (0, Constants.SEQ_LEN_VLM - input_ids_size), + "constant", + self.processor.tokenizer.pad_token_id, + ) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (0, Constants.SEQ_LEN_VLM - input_ids_size), "constant", 0 + ) + for k, v in inputs.items(): + inputs[k] = np.array(v) + + inputs["pixel_values"] = inputs["pixel_values"].astype("float16") + inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) + for i in range(num_chunks): + chunk_inputs = inputs.copy() + chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len] + chunk_inputs["position_ids"] = inputs["position_ids"][:, i * 
prefill_seq_len : (i + 1) * prefill_seq_len] + outputs = self.qpc_session.run(chunk_inputs) + if write_io_dir: + write_io_files(inputs, outputs, write_io_dir, "prefill", "aic_batch_io", True, False) + + # Get first token + inputs["input_ids"] = outputs["logits"].argmax(2) + inputs["position_ids"] = input_len.numpy() + generated_ids[:, 0] = inputs["input_ids"].squeeze(1) + finished_sequences = inputs["input_ids"] == self.processor.tokenizer.eos_token_id + if streamer is not None: + streamer.put(inputs["input_ids"][0]) + self.qpc_session.skip_buffers(["pixel_values"]) + self.qpc_session.skip_buffers(["image_sizes"]) + inputs.pop("pixel_values") + inputs.pop("image_sizes", None) + + # Decode loop + generation_len = torch.tensor(generation_len) + for num_token in range(1, generation_len): + outputs = self.qpc_session.run(inputs) + if write_io_dir: + write_io_files(inputs, outputs, write_io_dir, "decode", "aic_batch_io", True, False) + write_io_dir = None + + # Prepare inputs for next iteration + inputs["input_ids"] = outputs["logits"].argmax(2) + + inputs["position_ids"] += 1 + generated_ids[:, num_token] = inputs["input_ids"].squeeze(1) + + finished_sequences |= inputs["input_ids"] == self.processor.tokenizer.eos_token_id + if streamer is not None: + streamer.put(inputs["input_ids"][0]) + if finished_sequences.all(): + break + generated_texts = self.processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + return generated_texts + + class QEFFAutoModel(QEFFTransformersBase): """ The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub. diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 6b8d00689..9f2fc466a 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -51,6 +51,9 @@ LlamaModel, LlamaRMSNorm, ) +from transformers.models.llava.modeling_llava import ( + LlavaForConditionalGeneration, +) from transformers.models.mistral.modeling_mistral import ( MistralAttention, MistralDecoderLayer, @@ -143,12 +146,19 @@ QEffGPTJForCausalLM, QEffGPTJModel, ) +from QEfficient.transformers.models.InternVL.modeling_internvl_chat_vision import ( + QEffInternVisionEmbeddings, + QEffInternVLChatModel, +) from QEfficient.transformers.models.llama.modeling_llama import ( QEffLlamaAttention, QEffLlamaDecoderLayer, QEffLlamaForCausalLM, QEffLlamaModel, ) +from QEfficient.transformers.models.llava.modeling_llava import ( + QEffLlavaForConditionalGeneration, +) from QEfficient.transformers.models.mistral.modeling_mistral import ( QEffMistralAttention, QEffMistralDecoderLayer, @@ -253,6 +263,8 @@ class KVCacheTransform(ModuleMappingTransform): Gemma2DecoderLayer: QEffGemma2DecoderLayer, Gemma2Model: QEffGemma2Model, Gemma2ForCausalLM: QEffGemma2ForCausalLM, + # Llava + LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration, # mllama MllamaForCausalLM: QEffMllamaForCausalLM, MllamaTextModel: QEffMllamaTextModel, @@ -301,6 +313,9 @@ class KVCacheTransform(ModuleMappingTransform): GPTBigCodeBlock: QEffGPTBigCodeBlock, GPTBigCodeModel: QEffGPTBigCodeModel, GPTBigCodeForCausalLM: QEffGPTBigCodeForCausalLM, + # InternVL + "InternVLChatModel": QEffInternVLChatModel, + "InternVisionEmbeddings": QEffInternVisionEmbeddings, } @classmethod diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 2506b9233..48a4cc462 100755 --- a/QEfficient/utils/__init__.py +++ 
b/QEfficient/utils/__init__.py @@ -10,10 +10,14 @@ undo_transformers_quantizers, ) from QEfficient.utils._utils import ( # noqa: F401 + Constants, check_and_assign_cache_dir, + get_conv_template, get_num_layers_from_config, + get_num_layers_vlm, get_onnx_dir_name, get_padding_shape_from_config, + get_padding_shape_vlm, get_qpc_dir_path, hf_download, load_hf_tokenizer, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 2729267d6..99effb241 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -324,6 +324,20 @@ def get_num_layers_from_config(config): return n_layer +def get_num_layers_vlm(config): + if hasattr(config, "architectures") and "LlavaForConditionalGeneration" in config.architectures: + num_layers = config.text_config.num_hidden_layers + return num_layers + + +def get_padding_shape_vlm(config, batch_size=1): + if hasattr(config, "architectures") and "LlavaForConditionalGeneration" in config.architectures: + n_heads = config.text_config.num_key_value_heads + d_head = config.text_config.hidden_size // config.text_config.num_attention_heads + padding_shape = [batch_size, n_heads, Constants.CTX_LEN_VLM, d_head] + return padding_shape + + def execute_command(process: str, command: str, output_file_path: Optional[str] = None): """ Executes the give command using subprocess. diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index cc64df4bd..7c51983cc 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -57,12 +57,21 @@ class Constants: # Export Constants. SEQ_LEN = 32 CTX_LEN = 32 + SEQ_LEN_VLM = 3072 + CTX_LEN_VLM_INTERN = 4096 PROMPT_LEN = 8 INPUT_STR = ["My name is"] GB = 2**30 MAX_QPC_LIMIT = 30 MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 + CTX_LEN_VLM_LLAVA = 1280 + IMG_SIZE = 336 + PL_VLM = 1024 + BASE_URL_LLAVA = ( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg" + ) + PROMPT_LLAVA = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud" @dataclass diff --git a/tests/transformers/models/test_image_text_to_text.py b/tests/transformers/models/test_image_text_to_text.py new file mode 100755 index 000000000..294d6a1ff --- /dev/null +++ b/tests/transformers/models/test_image_text_to_text.py @@ -0,0 +1,211 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + + +# from PIL import Image +import pytest +import requests +import torch + +# For intern Specific +from PIL import Image +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoModelForImageTextToText, + AutoModelForVision2Seq, + AutoProcessor, + AutoTokenizer, + TextStreamer, +) +from transformers.image_utils import load_image + +from QEfficient.transformers.models.InternVL.internprocessor import InternProcessor +from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText +from QEfficient.utils import hf_download +from QEfficient.utils.constants import Constants +from QEfficient.utils.device_utils import get_available_device_id + +test_models = [ + "llava-hf/llava-1.5-7b-hf", + "OpenGVLab/InternVL2_5-1B", +] + + +def load_vlm_model(model_config): + """ + Function to load model from huggingface and transform to KV model + -------- + + :model_config: Dict + + :return model_hf, params + """ + + if model_config["model_name"] == "OpenGVLab/InternVL2_5-1B": + config = AutoConfig.from_pretrained(model_config["model_path"]) + config.llm_config.use_cache = True + config.llm_config.num_hidden_layers = model_config["n_layer_text"] + config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + config.llm_config._attn_implementation = "eager" + config.vision_config.use_flash_attn = "false" + model_hf = AutoModelForCausalLM.from_pretrained( + model_config["model_path"], low_cpu_mem_usage=False, config=config + ) + elif model_config["model_name"] == "llava-hf/llava-1.5-7b-hf": + config = AutoConfig.from_pretrained(model_config["model_path"]) + config.text_config.num_hidden_layers = model_config["n_layer_text"] + config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + config._attn_implementation = "eager" + config.vision_config.use_flash_attn = "false" + model_hf = AutoModelForImageTextToText.from_pretrained( + model_config["model_path"], low_cpu_mem_usage=False, config=config + ) + elif model_config["model_name"] == "HuggingFaceTB/SmolVLM-256M-Instruct": + config = AutoConfig.from_pretrained(model_config["model_path"]) + config.text_config.num_hidden_layers = model_config["n_layer_text"] + config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + config._attn_implementation = "eager" + config.vision_config.use_flash_attn = "false" + model_hf = AutoModelForVision2Seq.from_pretrained( + model_config["model_path"], low_cpu_mem_usage=False, config=config + ) + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + +def generate_hf_inputs_smol(model_name, model, processor): + image = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg") + messages = [ + {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Can you describe this image?"}]}, + ] + # Prepare inputs + prompt = processor.apply_chat_template(messages, add_generation_prompt=True) + inputs = processor(text=prompt, images=[image], return_tensors="pt") + return inputs + + +def generate_hf_inputs_intern(model_name, model, processor): + ## PREPROCESSING THE MULTI-MODAL INPUTS + pixel_values = [] + for i in range(1, 2): + url = f"https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-{i}-2048.jpg" + img = requests.get(url, stream=True).raw + 
pixel_values.append(processor.load_image(img, max_num=12)) + + question = "\nPlease describe the image in detail." + pixel_values = torch.cat(pixel_values, dim=0) + query = processor(processor.tokenizer, pixel_values, question) + inputs = dict(processor.tokenizer(query, return_tensors="pt")) + inputs["pixel_values"] = pixel_values + return inputs + + +def generate_hf_inputs_llava(model_name, model, processor=None): + img = Image.open(requests.get(Constants.BASE_URL_LLAVA, stream=True).raw) + prompt = processor.apply_chat_template( + [{"role": "user", "content": [{"type": "text", "text": Constants.PROMPT_LLAVA}, {"type": "image"}]}], + add_generation_prompt=True, + ) + inputs = processor(images=img, text=prompt, return_tensors="pt") + # inputs["processor"] = processor + return inputs + + +# --------------------------------------------- +# Please Add new models here inside the map +# {model_name:generate_hf_inputs_} +# --------------------------------------------- +generate_hf_inputs_func_map = { + "llava-hf/llava-1.5-7b-hf": generate_hf_inputs_llava, + "OpenGVLab/InternVL2_5-1B": generate_hf_inputs_intern, + "HuggingFaceTB/SmolVLM-256M-Instruct": generate_hf_inputs_smol, +} + + +def generate_hf_inputs(model_name, model, processor=None): + generate_func = generate_hf_inputs_func_map.get(model_name) + if not generate_func: + raise ValueError(f"Input generation function for model {model_name} not found.") + + return generate_func(model_name, model, processor) + + +def check_vlm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name: str, + prompt_len: int = Constants.SEQ_LEN_VLM, + ctx_len: int = Constants.CTX_LEN_VLM_INTERN, + n_layer_text: int = 1, + n_layer_vision: int = 1, + # num_speculative_tokens: Optional[int] = None, +): + """ + Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``Phi-3.5-vision-instruct`` + :prompt_len (int): Prompt length for the model to compile. + :ctx_len (int): Maximum context length to compile the model. + :n_layers (int): Number of layers for the Model. 
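The `generate_hf_inputs_func_map` dispatcher above is the intended extension point for new VLMs ("Please Add new models here inside the map"). The sketch below shows what would be added to this test module to onboard one more model; the model card and its input builder are hypothetical placeholders, written against the same chat-template pattern the Llava and SmolVLM builders use.

```python
from transformers.image_utils import load_image


def generate_hf_inputs_my_vlm(model_name, model, processor=None):
    # Hypothetical input builder for "my-org/my-vlm"; follows the chat-template
    # pattern of the other builders in this file.
    image = load_image(
        "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
    )
    prompt = processor.apply_chat_template(
        [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Can you describe this image?"}]}],
        add_generation_prompt=True,
    )
    return processor(text=prompt, images=[image], return_tensors="pt")


generate_hf_inputs_func_map["my-org/my-vlm"] = generate_hf_inputs_my_vlm
test_models.append("my-org/my-vlm")
```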
+ """ + + model_config = {"model_name": model_name} + model_config["n_layer_text"] = n_layer_text + model_config["n_layer_vision"] = n_layer_vision + + model_path = hf_download( + repo_id=model_config["model_name"], + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + + model_config["model_path"] = model_path + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) + model_hf, _ = load_vlm_model(model_config) + + # Load processor for models + + if model_name == "OpenGVLab/InternVL2_5-1B": + processor = InternProcessor(model_hf, tokenizer) + else: + processor = AutoProcessor.from_pretrained(model_name, padding_side="right", trust_remote_code=True) + + streamer = TextStreamer(tokenizer) + inputs = generate_hf_inputs(model_name, model_hf, processor) + + qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(model_name) + qeff_model.export() + if not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + qpc_path = qeff_model.compile( + prefill_seq_len=prompt_len, + ctx_len=ctx_len, + num_cores=14, + mxfp6=False, + aic_enable_depth_first=False, + ) + + qeff_model.qpc_path = qpc_path + qeff_model.generate(inputs, streamer, device_ids=None, runtime_ai100=True) + + +@pytest.mark.on_qaic +@pytest.mark.parametrize("model_name", test_models) +def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): + """ + Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. + ``Mandatory`` Args: + :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` + """ + + n_layer_text = 1 + n_layer_vision = 1 + + check_vlm_pytorch_vs_kv_vs_ort_vs_ai100( + model_name=model_name, n_layer_text=n_layer_text, n_layer_vision=n_layer_vision + ) From e2199e54814e2a8b453ac77e1b266604ddd4b19a Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Wed, 5 Feb 2025 19:23:03 +0000 Subject: [PATCH 2/4] VLM Pipeline for onboarding of VLMs Signed-off-by: Dipankar Sarkar --- QEfficient/transformers/models/InternVL/internprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/InternVL/internprocessor.py b/QEfficient/transformers/models/InternVL/internprocessor.py index 114271324..7039dd13b 100644 --- a/QEfficient/transformers/models/InternVL/internprocessor.py +++ b/QEfficient/transformers/models/InternVL/internprocessor.py @@ -4,7 +4,7 @@ from PIL import Image from torchvision.transforms.functional import InterpolationMode -from QEfficient.utils import get_conv_template +from QEfficient.transformers.models.InternVL.conversation import get_conv_template # from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers From 9efb97efea706e2fc3d6fb3f1b19d337e10e8e78 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Thu, 6 Feb 2025 01:07:18 +0000 Subject: [PATCH 3/4] Change of conversation import over files Signed-off-by: Dipankar Sarkar --- QEfficient/transformers/models/InternVL/internprocessor.py | 4 ---- QEfficient/utils/__init__.py | 1 - 2 files changed, 5 deletions(-) diff --git a/QEfficient/transformers/models/InternVL/internprocessor.py b/QEfficient/transformers/models/InternVL/internprocessor.py index 7039dd13b..ba222244b 100644 --- a/QEfficient/transformers/models/InternVL/internprocessor.py +++ b/QEfficient/transformers/models/InternVL/internprocessor.py @@ -6,10 +6,6 @@ from 
QEfficient.transformers.models.InternVL.conversation import get_conv_template -# from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers - -# from QEfficient.utils._utils import load_hf_processor - IMAGENET_MEAN = (0.485, 0.456, 0.406) IMAGENET_STD = (0.229, 0.224, 0.225) diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 48a4cc462..48ecee2bb 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -12,7 +12,6 @@ from QEfficient.utils._utils import ( # noqa: F401 Constants, check_and_assign_cache_dir, - get_conv_template, get_num_layers_from_config, get_num_layers_vlm, get_onnx_dir_name, From f55503a8bba258d618e206c1da7cf63a29289c16 Mon Sep 17 00:00:00 2001 From: Dipankar Sarkar Date: Mon, 10 Feb 2025 19:57:31 +0000 Subject: [PATCH 4/4] Fixing of intern full model output Signed-off-by: Dipankar Sarkar --- .../transformers/models/modeling_auto.py | 16 +- .../models/qwen2/modeling_qwen2.py | 156 ++++++++++++++++-- .../models/test_image_text_to_text.py | 34 +--- 3 files changed, 158 insertions(+), 48 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 37f6469be..cca4a2e73 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -541,19 +541,19 @@ def from_pretrained( # TODO : remove below after testing model_config = {"model_name": pretrained_model_name_or_path} - model_config["n_layer_text"] = 1 - model_config["n_layer_vision"] = 1 + # model_config["n_layer_text"] = 1 + # model_config["n_layer_vision"] = 1 if model_config["model_name"] == "OpenGVLab/InternVL2_5-1B": config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) config.llm_config.use_cache = True - config.llm_config.num_hidden_layers = model_config["n_layer_text"] - config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + # config.llm_config.num_hidden_layers = model_config["n_layer_text"] + # config.vision_config.num_hidden_layers = model_config["n_layer_vision"] config.llm_config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" elif model_config["model_name"] == "llava-hf/llava-1.5-7b-hf": config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) - config.text_config.num_hidden_layers = model_config["n_layer_text"] - config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + # config.text_config.num_hidden_layers = model_config["n_layer_text"] + # config.vision_config.num_hidden_layers = model_config["n_layer_vision"] config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" # TODO : remove above after testing @@ -762,7 +762,8 @@ def generate( raise TypeError("Please run compile API first!") return self.cloud_ai_100_vlm_generate(inputs=inputs, device_ids=device_ids) - + + def cloud_ai_100_vlm_generate( self, inputs: torch.Tensor, @@ -866,6 +867,7 @@ def cloud_ai_100_vlm_generate( if finished_sequences.all(): break generated_texts = self.processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + breakpoint() return generated_texts diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py index a8562ca1f..5db295e85 100644 --- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py +++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py @@ -19,18 +19,132 @@ 
BaseModelOutputWithPast, CausalLMOutputWithPast, ) + + +from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS from transformers.models.qwen2.modeling_qwen2 import ( Qwen2Attention, + Qwen2Config, Qwen2DecoderLayer, Qwen2ForCausalLM, Qwen2Model, + Qwen2RotaryEmbedding, apply_rotary_pos_emb, logger, repeat_kv, + rotate_half ) + + from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask +def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + + # Apply rotation + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + # Cast back to original dtype + return q_embed.to(q.dtype), k_embed.to(k.dtype) + + +class QEffQwen2RotaryEmbedding(Qwen2RotaryEmbedding): + """ + Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - Add static sin/cos computations. + """ + + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[Qwen2Config] = None, + ): + super(Qwen2RotaryEmbedding, self).__init__() # Initialize nn.Module + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. 
All other arguments will be removed in v4.45" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=self.original_max_seq_len, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) + + freqs = torch.outer(t, self.inv_freq) + + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, + self.sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling, + ) + class QEffQwen2Attention(Qwen2Attention): """ @@ -39,6 +153,17 @@ class QEffQwen2Attention(Qwen2Attention): - add new args position idx for the cache_kwargs for kv retention """ + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + # Define the general __qeff_init__() for any changes in the init calls + # Set the init in the module mapping pytorch transforms + self.config = config + self.__qeff_init__() + + def __qeff_init__(self): + self.rotary_emb = QEffQwen2RotaryEmbedding(config=self.config) + + def forward( self, hidden_states: torch.Tensor, @@ -71,18 +196,21 @@ def forward( ) kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." - ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + # if position_embeddings is None: + # logger.warning_once( + # "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + # "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + # "`position_embeddings` (Tuple of tensors, containing cos and sin). 
In v4.46 `position_ids` will be " + # "removed and `position_embeddings` will be mandatory." + # ) + # cos, sin = self.rotary_emb(value_states, position_ids) + # else: + # cos, sin = position_embeddings + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: # Update the cache_kwargs with position_ids for Cloud AI 100 @@ -116,7 +244,7 @@ def forward( ) attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) attn_output = self.o_proj(attn_output) @@ -177,9 +305,13 @@ def forward( "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" ) + + if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( diff --git a/tests/transformers/models/test_image_text_to_text.py b/tests/transformers/models/test_image_text_to_text.py index 294d6a1ff..f28b6311e 100755 --- a/tests/transformers/models/test_image_text_to_text.py +++ b/tests/transformers/models/test_image_text_to_text.py @@ -49,8 +49,8 @@ def load_vlm_model(model_config): if model_config["model_name"] == "OpenGVLab/InternVL2_5-1B": config = AutoConfig.from_pretrained(model_config["model_path"]) config.llm_config.use_cache = True - config.llm_config.num_hidden_layers = model_config["n_layer_text"] - config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + # config.llm_config.num_hidden_layers = model_config["n_layer_text"] + # config.vision_config.num_hidden_layers = model_config["n_layer_vision"] config.llm_config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" model_hf = AutoModelForCausalLM.from_pretrained( @@ -58,40 +58,19 @@ def load_vlm_model(model_config): ) elif model_config["model_name"] == "llava-hf/llava-1.5-7b-hf": config = AutoConfig.from_pretrained(model_config["model_path"]) - config.text_config.num_hidden_layers = model_config["n_layer_text"] - config.vision_config.num_hidden_layers = model_config["n_layer_vision"] + # config.text_config.num_hidden_layers = model_config["n_layer_text"] + # config.vision_config.num_hidden_layers = model_config["n_layer_vision"] config._attn_implementation = "eager" config.vision_config.use_flash_attn = "false" model_hf = AutoModelForImageTextToText.from_pretrained( model_config["model_path"], low_cpu_mem_usage=False, config=config ) - elif model_config["model_name"] == "HuggingFaceTB/SmolVLM-256M-Instruct": - config = AutoConfig.from_pretrained(model_config["model_path"]) - config.text_config.num_hidden_layers = model_config["n_layer_text"] - config.vision_config.num_hidden_layers = model_config["n_layer_vision"] - config._attn_implementation = "eager" - config.vision_config.use_flash_attn = "false" - model_hf = AutoModelForVision2Seq.from_pretrained( - model_config["model_path"], low_cpu_mem_usage=False, config=config - ) + params = sum(p.numel() for p in model_hf.parameters()) model_hf.eval() return model_hf, params - -def 
generate_hf_inputs_smol(model_name, model, processor): - image = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg") - messages = [ - {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Can you describe this image?"}]}, - ] - # Prepare inputs - prompt = processor.apply_chat_template(messages, add_generation_prompt=True) - inputs = processor(text=prompt, images=[image], return_tensors="pt") - return inputs - - def generate_hf_inputs_intern(model_name, model, processor): - ## PREPROCESSING THE MULTI-MODAL INPUTS pixel_values = [] for i in range(1, 2): url = f"https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-{i}-2048.jpg" @@ -113,7 +92,6 @@ def generate_hf_inputs_llava(model_name, model, processor=None): add_generation_prompt=True, ) inputs = processor(images=img, text=prompt, return_tensors="pt") - # inputs["processor"] = processor return inputs @@ -124,7 +102,6 @@ def generate_hf_inputs_llava(model_name, model, processor=None): generate_hf_inputs_func_map = { "llava-hf/llava-1.5-7b-hf": generate_hf_inputs_llava, "OpenGVLab/InternVL2_5-1B": generate_hf_inputs_intern, - "HuggingFaceTB/SmolVLM-256M-Instruct": generate_hf_inputs_smol, } @@ -189,7 +166,6 @@ def check_vlm_pytorch_vs_kv_vs_ort_vs_ai100( mxfp6=False, aic_enable_depth_first=False, ) - qeff_model.qpc_path = qpc_path qeff_model.generate(inputs, streamer, device_ids=None, runtime_ai100=True)
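Usage note (not part of the patches above): the snippet below is a minimal sketch of how the VLM flow introduced by this series could be driven outside of pytest, mirroring the LLaVA path of tests/transformers/models/test_image_text_to_text.py. It assumes a Cloud AI 100 device is available, that QEFFAutoModelForImageTextToText is importable from QEfficient.transformers.models.modeling_auto (where the series appears to add it), and that the Constants values referenced by the test (SEQ_LEN_VLM, CTX_LEN_VLM_INTERN, BASE_URL_LLAVA, PROMPT_LLAVA) are defined in QEfficient/utils/constants.py as the diff stats suggest; num_cores=14 simply copies the value used in the test.

import requests
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer, TextStreamer

# Import paths assumed from where this series adds the class and constants.
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForImageTextToText
from QEfficient.utils import Constants

model_name = "llava-hf/llava-1.5-7b-hf"

# Build the multimodal inputs the same way generate_hf_inputs_llava does in the test.
processor = AutoProcessor.from_pretrained(model_name, padding_side="right", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
img = Image.open(requests.get(Constants.BASE_URL_LLAVA, stream=True).raw)
prompt = processor.apply_chat_template(
    [{"role": "user", "content": [{"type": "text", "text": Constants.PROMPT_LLAVA}, {"type": "image"}]}],
    add_generation_prompt=True,
)
inputs = processor(images=img, text=prompt, return_tensors="pt")

# Export to ONNX, compile for Cloud AI 100, then run generation through the AI 100 runtime.
qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(model_name)
qeff_model.export()
qeff_model.qpc_path = qeff_model.compile(
    prefill_seq_len=Constants.SEQ_LEN_VLM,
    ctx_len=Constants.CTX_LEN_VLM_INTERN,  # the test uses this default for every VLM, not just InternVL
    num_cores=14,
    mxfp6=False,
    aic_enable_depth_first=False,
)
streamer = TextStreamer(tokenizer)
qeff_model.generate(inputs, streamer, device_ids=None, runtime_ai100=True)

The InternVL path differs only in input preparation: the test wraps the tokenizer in the series' InternProcessor and builds pixel_values explicitly, as shown in generate_hf_inputs_intern.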