diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 0e58409c8..0821d5585 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -136,5 +136,5 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then fi ( set -x - $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" psutil=="6.0.0" + $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.7" psutil=="6.0.0" ) diff --git a/install/requirements.txt b/install/requirements.txt index bd1e09174..73be68763 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -34,4 +34,4 @@ streamlit flask # eval -lm_eval==0.4.2 +lm_eval==0.4.7 diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index f40936b81..fcc2d5f66 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -794,4 +794,4 @@ def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str: return "TikToken" if tokenizers: return "Tokenizers" - return "SentencePiece" + return "SentencePiece" \ No newline at end of file diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index f6bf32e40..7fd02eed3 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -432,6 +432,14 @@ def _add_evaluation_args(parser) -> None: help="Maximum length sequence to evaluate", ) + eval_parser.add_argument( + "--modality", + type=str, + default="text", + choices=["text", "text-image"], + help="Modality of the model. Options: text, text-image", + ) + # Add CLI Args related to distributed inference # This feature is currently a [WIP] and hidden from --help diff --git a/torchchat/model.py b/torchchat/model.py index ce7dcb5e4..9722ca240 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -608,6 +608,12 @@ def setup_caches(self, batch_size, dtype, encoder_max_seq_len, decoder_max_seq_l decoder_max_seq_len=decoder_max_seq_len, ) + def caches_are_setup(self) -> bool: + return self.model.caches_are_setup() + + def caches_are_enabled(self) -> bool: + return self.model.caches_are_enabled() + def reset_caches(self): self.model.reset_caches() diff --git a/torchchat/usages/eval.py b/torchchat/usages/eval.py index b708e5840..882b650d0 100644 --- a/torchchat/usages/eval.py +++ b/torchchat/usages/eval.py @@ -4,7 +4,7 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import argparse -from typing import Callable, Optional +from typing import Callable, Dict, List, Optional, Literal import torch import torch._dynamo.config @@ -30,7 +30,10 @@ import lm_eval +import PIL + from lm_eval.evaluator import evaluate +from lm_eval.models.hf_vlms import HFMultimodalLM from lm_eval.models.huggingface import HFLM as eval_wrapper from lm_eval.tasks import get_task_dict @@ -89,7 +92,7 @@ def __init__( device="cpu", is_pte_model: bool = False, ): - super().__init__(device=device) + super().__init__(pretrained="gpt2", device=device) self._model = model self._model_forward = ( model_forward @@ -168,6 +171,275 @@ def _model_generate(self, context, max_length, eos_token_id): raise Exception("unimplemented") +class VLMEvalWrapper(HFMultimodalLM): + """ + This class is adapted from torchtune. + Source: https://github.com/pytorch/torchtune/blob/main/recipes/eleuther_eval.py + ------------------------------------------------------------------------------- + + An EvalWrapper for EleutherAI's eval harness based on gpt-fast's + EvalWrapper: https://github.com/pytorch-labs/gpt-fast/blob/main/eval.py. + + Note: + This is ONLY for vision-language models. 
+
+    Args:
+        model (DeepFusionModel): The VLM to evaluate.
+        transform (Transform): The transform (tokenizer) to use for preprocessing.
+        device (torch.device): The device to use.
+        max_seq_length (int): The maximum sequence length.
+        batch_size (int): The batch size.
+        dtype (torch.dtype): dtype for the model caches during generation.
+        enable_kv_cache (bool): Whether to enable KV cache for generation.
+        image_tag (str): The string to use for the image token. Default is "<image>", which
+            is the default used by the MMMU dataset.
+        max_images_per_sample (int): The maximum number of images per sample. Defaults to
+            the max number of images in MMMU.
+    """
+
+
+    def __init__(
+        self,
+        model: Model,
+        transform,
+        *,
+        device: torch.device,
+        max_seq_length: int = 4096,
+        batch_size: int = 1,
+        dtype: torch.dtype = torch.bfloat16,
+        enable_kv_cache: bool = True,
+        # TODO (@joecummings): Update these defaults once more multimodal
+        # tasks are added to the eval harness
+        image_tag: str = "<image>",
+        max_images_per_sample: int = 7,
+    ):
+        # Having the imports here allows running other evals without installing torchtune
+        from torchtune.utils import batch_to_device
+        from torchtune.data import (
+            format_content_with_images,
+            left_pad_sequence,
+            Message,
+            padded_collate_tiled_images_and_mask,
+        )
+        from torchtune.generation import generate, sample
+        from torchtune.modules.common_utils import local_kv_cache
+        self.batch_to_device = batch_to_device
+        self.format_content_with_images = format_content_with_images
+        self.left_pad_sequence = left_pad_sequence
+        self.Message = Message
+        self.padded_collate_tiled_images_and_mask = padded_collate_tiled_images_and_mask
+        self.generate = generate
+        self.sample = sample
+        self.local_kv_cache = local_kv_cache
+
+        self._model = model
+        self._transform = transform
+        self._device = device
+        self._max_seq_length = max_seq_length
+        self._batch_size = batch_size
+        self._dtype = dtype
+        # Defaulting KV cache to True for multimodal
+        self._enable_kv_cache = True
+        self._image_tag = image_tag
+        self._max_images_per_sample = max_images_per_sample
+        self.times = []
+
+    @property
+    def model(self):
+        # Not actually changing the dtype here, just adding it as a
+        # property on the model
+        self._model.dtype = self._dtype
+        return self._model
+
+    @property
+    def model_transform(self):
+        return self._transform
+
+    @property
+    def device(self):
+        return self._device
+
+    @property
+    def cache_hook(self):
+        # Dummy class to appease the Harness
+        class DummyCacheHook:
+            def __init__(self):
+                self.add_partial = lambda x, y, z: True
+
+        return DummyCacheHook()
+
+    @property
+    def rank(self):
+        # Hardcoded for now b/c we only support single GPU eval
+        return 0
+
+    @property
+    def world_size(self):
+        # Hardcoded for now b/c we only support single GPU eval
+        return 1
+
+    @property
+    def batch_size(self):
+        return self._batch_size
+
+    @property
+    def eos_token_id(self):
+        return self._transform.tokenizer.eos_id
+
+    @property
+    def eot_token_id(self):
+        return self._transform.tokenizer.eot_id
+
+    @property
+    def max_length(self):
+        return self._max_seq_length
+
+    @property
+    def truncation(self):
+        return True
+
+    def tok_encode(self, string, **kwargs) -> List[int]:
+        # This is only used to get a number of tokens for use in sorting samples in dataset
+        # These values will not actually be used for eval
+        return self._transform.tokenizer.encode(string, add_bos=False, add_eos=False)
+
+    def tok_decode(self, tokens, skip_special_tokens=True) -> str:
+        if isinstance(tokens, int):
+            tokens = [tokens]
+
return self._transform.tokenizer.decode( + tokens, skip_special_tokens=skip_special_tokens + ) + + def tok_batch_multimodal_encode( + self, + all_texts: List[str], + all_images: List[List[PIL.Image.Image]], + left_truncate_len: int = None, + *args, + **kwargs, + ): + # Eleuther already parses out the text and images, so we just need to get + # it into a Message format for our tokenizer + all_encoded_messages = [] + + for text, images in zip(all_texts, all_images): + # Ensure images are all RGB + proper_images = [] + for image in images: + if image.mode != "RGB": + image = image.convert("RGB") + proper_images.append(image) + + # Construct the messages + messages = [] + content = self.format_content_with_images( + text, image_tag=self._image_tag, images=proper_images + ) + messages.append(self.Message(role="user", content=content)) + messages.append(self.Message(role="assistant", content="")) + + # Transform the messages + tok_batch = self.model_transform({"messages": messages}, inference=True) + all_encoded_messages.append(tok_batch) + + # Pad the encoded messages + tok_batch = self.padded_collate_tiled_images_and_mask( + all_encoded_messages, + pad_direction="left", + pad_max_images=self._max_images_per_sample, + pad_max_tiles=self._transform.max_num_tiles, + ) + self.batch_to_device(tok_batch, self.device) + + # Convert the batch to the format expected by the HF + tok_batch["input_ids"] = tok_batch.pop("tokens") + + # the harness will use left_truncate_len to indicate that the current batch + # needs to be truncated to self.max_seq_len - self.max_gen_toks + if left_truncate_len is not None: + tok_batch["input_ids"] = tok_batch["input_ids"][:, -left_truncate_len:] + + return tok_batch + + @torch.inference_mode() + def _model_multimodal_generate( + self, + batch: Dict[str, torch.Tensor], + max_length: int, + stop: List[str], + **generation_kwargs, + ): + # 1. Validate inputs + prompt = batch.pop("input_ids") + bsz, seq_len = prompt.shape + + temperature = generation_kwargs.get("temperature", 0.0) + do_sample = generation_kwargs.get("do_sample", False) + if do_sample or temperature != 0.0: + raise RuntimeError( + "Any decoding strategy other than greedy is not supported." + ) + + if bsz > 1: + raise ValueError( + f"Got a batch size of '{bsz}'. Batch size > 1 is not yet supported for " + "multimodal generation." + ) + + encoder_max_seq_len = ( + self.model_transform.image_seq_len * self._max_images_per_sample + ) + # Setup masks for bsz 1 + with self.device: + causal_mask = torch.tril( + torch.ones( + size=(self.max_length, self.max_length), + dtype=torch.bool, + ) + ) + input_pos = torch.arange(self.max_length) + + batch["input_pos"] = input_pos[None, :seq_len] + batch["mask"] = causal_mask[None, :seq_len] + + with measure_time(message=None) as measure: + # 2. Setup KV cache + with self.local_kv_cache( + self.model, + batch_size=self.batch_size, + device=self.device, + dtype=self._dtype, + encoder_max_seq_len=encoder_max_seq_len, + decoder_max_seq_len=self.max_length, + ): + # 3. Prefill step + generated_tokens = [] + logits = self.model(prompt, **batch)[:, -1] + token = self.sample(logits, temperature=0.0, top_k=None) + generated_tokens.append(token.item()) + + cache_mask = batch["encoder_mask"][:, -1:] + + # 4. 
Continue generating + for _ in range(max_length): + if token.item() in self.model_transform.stop_tokens: + break + logits = self.model( + token, + mask=causal_mask[None, seq_len, None, :], + encoder_input=None, + encoder_mask=cache_mask, + input_pos=input_pos[None, seq_len], + )[:, -1] + token = self.sample(logits, temperature=0.0, top_k=None) + generated_tokens.append(token.item()) + seq_len += 1 + self.times.append(measure.get_time()) + + # 5. Return generated tokens + return torch.tensor(generated_tokens, dtype=torch.int32).unsqueeze(0) + + @torch.no_grad() def eval( model: Model, @@ -178,6 +450,7 @@ def eval( max_seq_length: Optional[int] = None, device: str = "cpu", is_pte_model: bool = False, + modality: Literal["text", "text-image"] = "text", ) -> dict: """ Evaluates a language model on a specified task using the lm-evaluation-harness library. @@ -188,21 +461,39 @@ def eval( tasks (Optional[list]): The names of the evaluation tasks to perform. limit (Optional[int]): The maximum number of samples to evaluate (None for all available). max_seq_length (Optional[int]): The maximum sequence length allowed for input text. + modality (str): The modality of the model. Options: text, text-image Returns: eval_results (dict): A dictionary of evaluation results for the specified task(s). """ - if tasks is None: - tasks = ["wikitext"] - model_eval_wrapper = GPTFastEvalWrapper( - model, - tokenizer, - model_forward=model_forward, - max_seq_length=max_seq_length, - device=device, - is_pte_model=is_pte_model, - ) + if tasks is None: + if modality == "text": + tasks = ["wikitext"] + elif modality == "text-image": + tasks = ["mmmu-val-art"] + + if modality == "text": + model_eval_wrapper = GPTFastEvalWrapper( + model, + tokenizer, + model_forward=model_forward, + max_seq_length=max_seq_length, + device=device, + is_pte_model=is_pte_model, + ) + # use eot_token_id as prefix_token_id. 
+ model_eval_wrapper.custom_prefix_token_id = model_eval_wrapper.eot_token_id + elif modality == "text-image": + from torchtune.utils import get_device + from torchtune.models.llama3_2_vision import llama3_2_vision_transform + + model_eval_wrapper = VLMEvalWrapper( + model, + transform=llama3_2_vision_transform(path=str(tokenizer.tokenizer_path)), + max_seq_length = 4096 if max_seq_length is None else max_seq_length, + device = get_device(device) if isinstance(device, str) else device, + ) try: lm_eval.tasks.initialize_tasks() @@ -243,11 +534,14 @@ def main(args) -> None: limit = args.limit compile = args.compile max_seq_length = args.max_seq_length + modality = args.modality + print(f"Using device={device}") set_precision(builder_args.precision) tokenizer = _initialize_tokenizer(tokenizer_args) + tokenizer.tokenizer_path = tokenizer_args.tokenizer_path builder_args.setup_caches = False model = _initialize_model( builder_args, @@ -260,12 +554,17 @@ def main(args) -> None: if compile: assert not ( - builder_args.dso_path or builder_args.pte_path or builder_args.aoti_package_path + builder_args.dso_path + or builder_args.pte_path + or builder_args.aoti_package_path ), "cannot compile exported model" model_forward = torch.compile( model_forward, mode="reduce-overhead", dynamic=True, fullgraph=True ) - torch._inductor.config.coordinate_descent_tuning = False if device == "cpu" else True + torch._inductor.config.coordinate_descent_tuning = ( + False if device == "cpu" else True + ) + with measure_time("Time to run eval: {time:.02f}s."): result = eval( @@ -277,6 +576,7 @@ def main(args) -> None: max_seq_length, device=builder_args.device, is_pte_model=builder_args.pte_path is not None, + modality=modality, ) times = torch.tensor(result["times"])
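Usage sketch (not part of the patch): the new modality switch is driven through the same eval() entry point that main() already calls. The snippet below mirrors that call for the text-image path. `model`, `model_forward`, and `tokenizer` are assumed to be the objects produced by _initialize_model()/_initialize_tokenizer() as in main(); the tokenizer path, device string, and limit are illustrative placeholders, not values taken from this diff.

    # Minimal sketch, assuming `model`, `model_forward`, and `tokenizer` were built
    # the same way main() builds them (via _initialize_model/_initialize_tokenizer).
    from torchchat.usages.eval import eval as run_eval

    # The text-image path reads tokenizer.tokenizer_path to build the vision
    # transform, so main() attaches it before calling eval(); do the same here.
    tokenizer.tokenizer_path = "<path/to/tokenizer.model>"  # placeholder path

    result = run_eval(
        model,
        model_forward,
        tokenizer,
        tasks=None,              # eval() falls back to ["mmmu-val-art"] for text-image
        limit=4,                 # illustrative; evaluate only a few samples
        max_seq_length=4096,
        device="cuda",
        modality="text-image",   # new argument introduced in this patch
    )
    print(result["results"])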