diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 0e58409c8..0821d5585 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -136,5 +136,5 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then fi ( set -x - $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" psutil=="6.0.0" + $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.7" psutil=="6.0.0" ) diff --git a/install/requirements.txt b/install/requirements.txt index bd1e09174..73be68763 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -34,4 +34,4 @@ streamlit flask # eval -lm_eval==0.4.2 +lm_eval==0.4.7 diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index f40936b81..fcc2d5f66 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -794,4 +794,4 @@ def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str: return "TikToken" if tokenizers: return "Tokenizers" - return "SentencePiece" + return "SentencePiece" \ No newline at end of file diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index f6bf32e40..7fd02eed3 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -432,6 +432,14 @@ def _add_evaluation_args(parser) -> None: help="Maximum length sequence to evaluate", ) + eval_parser.add_argument( + "--modality", + type=str, + default="text", + choices=["text", "text-image"], + help="Modality of the model. Options: text, text-image", + ) + # Add CLI Args related to distributed inference # This feature is currently a [WIP] and hidden from --help diff --git a/torchchat/model.py b/torchchat/model.py index ce7dcb5e4..9722ca240 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -608,6 +608,12 @@ def setup_caches(self, batch_size, dtype, encoder_max_seq_len, decoder_max_seq_l decoder_max_seq_len=decoder_max_seq_len, ) + def caches_are_setup(self) -> bool: + return self.model.caches_are_setup() + + def caches_are_enabled(self) -> bool: + return self.model.caches_are_enabled() + def reset_caches(self): self.model.reset_caches() diff --git a/torchchat/usages/eval.py b/torchchat/usages/eval.py index b708e5840..882b650d0 100644 --- a/torchchat/usages/eval.py +++ b/torchchat/usages/eval.py @@ -4,7 +4,7 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import argparse -from typing import Callable, Optional +from typing import Callable, Dict, List, Optional, Literal import torch import torch._dynamo.config @@ -30,7 +30,10 @@ import lm_eval +import PIL + from lm_eval.evaluator import evaluate +from lm_eval.models.hf_vlms import HFMultimodalLM from lm_eval.models.huggingface import HFLM as eval_wrapper from lm_eval.tasks import get_task_dict @@ -89,7 +92,7 @@ def __init__( device="cpu", is_pte_model: bool = False, ): - super().__init__(device=device) + super().__init__(pretrained="gpt2", device=device) self._model = model self._model_forward = ( model_forward @@ -168,6 +171,275 @@ def _model_generate(self, context, max_length, eos_token_id): raise Exception("unimplemented") +class VLMEvalWrapper(HFMultimodalLM): + """ + This class is adapted from torchtune. + Source: https://github.com/pytorch/torchtune/blob/main/recipes/eleuther_eval.py + ------------------------------------------------------------------------------- + + An EvalWrapper for EleutherAI's eval harness based on gpt-fast's + EvalWrapper: https://github.com/pytorch-labs/gpt-fast/blob/main/eval.py. + + Note: + This is ONLY for vision-language models. 
+
+    Args:
+        model (DeepFusionModel): The VLM to evaluate.
+        transform (Transform): The transform (tokenizer) to use for preprocessing.
+        device (torch.device): The device to use.
+        max_seq_length (int): The maximum sequence length.
+        batch_size (int): The batch size.
+        dtype (torch.dtype): dtype for the model caches during generation.
+        enable_kv_cache (bool): Whether to enable KV cache for generation.
+        image_tag (str): The string to use for the image token. Default is "<image>", which
+            is the default used by the MMMU dataset.
+        max_images_per_sample (int): The maximum number of images per sample. Defaults to
+            the max number of images in MMMU.
+    """
+
+
+    def __init__(
+        self,
+        model: Model,
+        transform,
+        *,
+        device: torch.device,
+        max_seq_length: int = 4096,
+        batch_size: int = 1,
+        dtype: torch.dtype = torch.bfloat16,
+        enable_kv_cache: bool = True,
+        # TODO (@joecummings): Update these defaults once more multimodal
+        # tasks are added to the eval harness
+        image_tag: str = "<image>",
+        max_images_per_sample: int = 7,
+    ):
+        # Having the imports here allows running other evals without installing torchtune
+        from torchtune.utils import batch_to_device
+        from torchtune.data import (
+            format_content_with_images,
+            left_pad_sequence,
+            Message,
+            padded_collate_tiled_images_and_mask,
+        )
+        from torchtune.generation import generate, sample
+        from torchtune.modules.common_utils import local_kv_cache
+        self.batch_to_device = batch_to_device
+        self.format_content_with_images = format_content_with_images
+        self.left_pad_sequence = left_pad_sequence
+        self.Message = Message
+        self.padded_collate_tiled_images_and_mask = padded_collate_tiled_images_and_mask
+        self.generate = generate
+        self.sample = sample
+        self.local_kv_cache = local_kv_cache
+
+        self._model = model
+        self._transform = transform
+        self._device = device
+        self._max_seq_length = max_seq_length
+        self._batch_size = batch_size
+        self._dtype = dtype
+        # Defaulting KV cache to True for multimodal
+        self._enable_kv_cache = True
+        self._image_tag = image_tag
+        self._max_images_per_sample = max_images_per_sample
+        self.times = []
+
+    @property
+    def model(self):
+        # Not actually changing the dtype here, just adding it as a
+        # property on the model
+        self._model.dtype = self._dtype
+        return self._model
+
+    @property
+    def model_transform(self):
+        return self._transform
+
+    @property
+    def device(self):
+        return self._device
+
+    @property
+    def cache_hook(self):
+        # Dummy class to appease the Harness
+        class DummyCacheHook:
+            def __init__(self):
+                self.add_partial = lambda x, y, z: True
+
+        return DummyCacheHook()
+
+    @property
+    def rank(self):
+        # Hardcoded for now b/c we only support single GPU eval
+        return 0
+
+    @property
+    def world_size(self):
+        # Hardcoded for now b/c we only support single GPU eval
+        return 1
+
+    @property
+    def batch_size(self):
+        return self._batch_size
+
+    @property
+    def eos_token_id(self):
+        return self._transform.tokenizer.eos_id
+
+    @property
+    def eot_token_id(self):
+        return self._transform.tokenizer.eot_id
+
+    @property
+    def max_length(self):
+        return self._max_seq_length
+
+    @property
+    def truncation(self):
+        return True
+
+    def tok_encode(self, string, **kwargs) -> List[int]:
+        # This is only used to get a number of tokens for use in sorting samples in dataset
+        # These values will not actually be used for eval
+        return self._transform.tokenizer.encode(string, add_bos=False, add_eos=False)
+
+    def tok_decode(self, tokens, skip_special_tokens=True) -> str:
+        if isinstance(tokens, int):
+            tokens = [tokens]
+
return self._transform.tokenizer.decode( + tokens, skip_special_tokens=skip_special_tokens + ) + + def tok_batch_multimodal_encode( + self, + all_texts: List[str], + all_images: List[List[PIL.Image.Image]], + left_truncate_len: int = None, + *args, + **kwargs, + ): + # Eleuther already parses out the text and images, so we just need to get + # it into a Message format for our tokenizer + all_encoded_messages = [] + + for text, images in zip(all_texts, all_images): + # Ensure images are all RGB + proper_images = [] + for image in images: + if image.mode != "RGB": + image = image.convert("RGB") + proper_images.append(image) + + # Construct the messages + messages = [] + content = self.format_content_with_images( + text, image_tag=self._image_tag, images=proper_images + ) + messages.append(self.Message(role="user", content=content)) + messages.append(self.Message(role="assistant", content="")) + + # Transform the messages + tok_batch = self.model_transform({"messages": messages}, inference=True) + all_encoded_messages.append(tok_batch) + + # Pad the encoded messages + tok_batch = self.padded_collate_tiled_images_and_mask( + all_encoded_messages, + pad_direction="left", + pad_max_images=self._max_images_per_sample, + pad_max_tiles=self._transform.max_num_tiles, + ) + self.batch_to_device(tok_batch, self.device) + + # Convert the batch to the format expected by the HF + tok_batch["input_ids"] = tok_batch.pop("tokens") + + # the harness will use left_truncate_len to indicate that the current batch + # needs to be truncated to self.max_seq_len - self.max_gen_toks + if left_truncate_len is not None: + tok_batch["input_ids"] = tok_batch["input_ids"][:, -left_truncate_len:] + + return tok_batch + + @torch.inference_mode() + def _model_multimodal_generate( + self, + batch: Dict[str, torch.Tensor], + max_length: int, + stop: List[str], + **generation_kwargs, + ): + # 1. Validate inputs + prompt = batch.pop("input_ids") + bsz, seq_len = prompt.shape + + temperature = generation_kwargs.get("temperature", 0.0) + do_sample = generation_kwargs.get("do_sample", False) + if do_sample or temperature != 0.0: + raise RuntimeError( + "Any decoding strategy other than greedy is not supported." + ) + + if bsz > 1: + raise ValueError( + f"Got a batch size of '{bsz}'. Batch size > 1 is not yet supported for " + "multimodal generation." + ) + + encoder_max_seq_len = ( + self.model_transform.image_seq_len * self._max_images_per_sample + ) + # Setup masks for bsz 1 + with self.device: + causal_mask = torch.tril( + torch.ones( + size=(self.max_length, self.max_length), + dtype=torch.bool, + ) + ) + input_pos = torch.arange(self.max_length) + + batch["input_pos"] = input_pos[None, :seq_len] + batch["mask"] = causal_mask[None, :seq_len] + + with measure_time(message=None) as measure: + # 2. Setup KV cache + with self.local_kv_cache( + self.model, + batch_size=self.batch_size, + device=self.device, + dtype=self._dtype, + encoder_max_seq_len=encoder_max_seq_len, + decoder_max_seq_len=self.max_length, + ): + # 3. Prefill step + generated_tokens = [] + logits = self.model(prompt, **batch)[:, -1] + token = self.sample(logits, temperature=0.0, top_k=None) + generated_tokens.append(token.item()) + + cache_mask = batch["encoder_mask"][:, -1:] + + # 4. 
Continue generating + for _ in range(max_length): + if token.item() in self.model_transform.stop_tokens: + break + logits = self.model( + token, + mask=causal_mask[None, seq_len, None, :], + encoder_input=None, + encoder_mask=cache_mask, + input_pos=input_pos[None, seq_len], + )[:, -1] + token = self.sample(logits, temperature=0.0, top_k=None) + generated_tokens.append(token.item()) + seq_len += 1 + self.times.append(measure.get_time()) + + # 5. Return generated tokens + return torch.tensor(generated_tokens, dtype=torch.int32).unsqueeze(0) + + @torch.no_grad() def eval( model: Model, @@ -178,6 +450,7 @@ def eval( max_seq_length: Optional[int] = None, device: str = "cpu", is_pte_model: bool = False, + modality: Literal["text", "text-image"] = "text", ) -> dict: """ Evaluates a language model on a specified task using the lm-evaluation-harness library. @@ -188,21 +461,39 @@ def eval( tasks (Optional[list]): The names of the evaluation tasks to perform. limit (Optional[int]): The maximum number of samples to evaluate (None for all available). max_seq_length (Optional[int]): The maximum sequence length allowed for input text. + modality (str): The modality of the model. Options: text, text-image Returns: eval_results (dict): A dictionary of evaluation results for the specified task(s). """ - if tasks is None: - tasks = ["wikitext"] - model_eval_wrapper = GPTFastEvalWrapper( - model, - tokenizer, - model_forward=model_forward, - max_seq_length=max_seq_length, - device=device, - is_pte_model=is_pte_model, - ) + if tasks is None: + if modality == "text": + tasks = ["wikitext"] + elif modality == "text-image": + tasks = ["mmmu-val-art"] + + if modality == "text": + model_eval_wrapper = GPTFastEvalWrapper( + model, + tokenizer, + model_forward=model_forward, + max_seq_length=max_seq_length, + device=device, + is_pte_model=is_pte_model, + ) + # use eot_token_id as prefix_token_id. 
+ model_eval_wrapper.custom_prefix_token_id = model_eval_wrapper.eot_token_id + elif modality == "text-image": + from torchtune.utils import get_device + from torchtune.models.llama3_2_vision import llama3_2_vision_transform + + model_eval_wrapper = VLMEvalWrapper( + model, + transform=llama3_2_vision_transform(path=str(tokenizer.tokenizer_path)), + max_seq_length = 4096 if max_seq_length is None else max_seq_length, + device = get_device(device) if isinstance(device, str) else device, + ) try: lm_eval.tasks.initialize_tasks() @@ -243,11 +534,14 @@ def main(args) -> None: limit = args.limit compile = args.compile max_seq_length = args.max_seq_length + modality = args.modality + print(f"Using device={device}") set_precision(builder_args.precision) tokenizer = _initialize_tokenizer(tokenizer_args) + tokenizer.tokenizer_path = tokenizer_args.tokenizer_path builder_args.setup_caches = False model = _initialize_model( builder_args, @@ -260,12 +554,17 @@ def main(args) -> None: if compile: assert not ( - builder_args.dso_path or builder_args.pte_path or builder_args.aoti_package_path + builder_args.dso_path + or builder_args.pte_path + or builder_args.aoti_package_path ), "cannot compile exported model" model_forward = torch.compile( model_forward, mode="reduce-overhead", dynamic=True, fullgraph=True ) - torch._inductor.config.coordinate_descent_tuning = False if device == "cpu" else True + torch._inductor.config.coordinate_descent_tuning = ( + False if device == "cpu" else True + ) + with measure_time("Time to run eval: {time:.02f}s."): result = eval( @@ -277,6 +576,7 @@ def main(args) -> None: max_seq_length, device=builder_args.device, is_pte_model=builder_args.pte_path is not None, + modality=modality, ) times = torch.tensor(result["times"])
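Usage sketch (not part of the patch): the new modality switch is driven through the same eval() entry point that main() already calls. The snippet below mirrors that call for the text-image path. `model`, `model_forward`, and `tokenizer` are assumed to be the objects produced by _initialize_model()/_initialize_tokenizer() as in main(); the tokenizer path, device string, and limit are illustrative placeholders, not values taken from this diff.

    # Minimal sketch, assuming `model`, `model_forward`, and `tokenizer` were built
    # the same way main() builds them (via _initialize_model/_initialize_tokenizer).
    from torchchat.usages.eval import eval as run_eval

    # The text-image path reads tokenizer.tokenizer_path to build the vision
    # transform, so main() attaches it before calling eval(); do the same here.
    tokenizer.tokenizer_path = "<path/to/tokenizer.model>"  # placeholder path

    result = run_eval(
        model,
        model_forward,
        tokenizer,
        tasks=None,              # eval() falls back to ["mmmu-val-art"] for text-image
        limit=4,                 # illustrative; evaluate only a few samples
        max_seq_length=4096,
        device="cuda",
        modality="text-image",   # new argument introduced in this patch
    )
    print(result["results"])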