
Commit dcc82b9

Authored by anirudhs001 (anirudh), with Jack-Khuu
Added support for Multimodal eval (#1499)
* [wip] Added cli args and other changes to eval multi-modal models
* remove redundant comment
* Added Llama3VisionTransform in TokenizerArgs and other changes
* use kv caching and other minor fixes
* default batch size 1
* lint eval.py and builder.py
* lm-eval 0.4.2->0.4.7 in install_requirements.sh
* fixes from code review
* remove modality from builder args
* use custom prefix token
* move torchtune imports inside VLMEvalWrapper
* revert changes from builder.py
* instantiate transform in eval()

Co-authored-by: anirudh <[email protected]>
Co-authored-by: Jack-Khuu <[email protected]>
1 parent 4d8bab5 commit dcc82b9

File tree

6 files changed: +331 -17 lines

install/install_requirements.sh

Lines changed: 1 addition & 1 deletion

@@ -136,5 +136,5 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then
 fi
 (
   set -x
-  $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" psutil=="6.0.0"
+  $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.7" psutil=="6.0.0"
 )

install/requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -34,4 +34,4 @@ streamlit
 flask

 # eval
-lm_eval==0.4.2
+lm_eval==0.4.7

torchchat/cli/builder.py

Lines changed: 1 addition & 1 deletion

@@ -794,4 +794,4 @@ def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str:
         return "TikToken"
     if tokenizers:
         return "Tokenizers"
-    return "SentencePiece"
+    return "SentencePiece"

torchchat/cli/cli.py

Lines changed: 8 additions & 0 deletions

@@ -432,6 +432,14 @@ def _add_evaluation_args(parser) -> None:
         help="Maximum length sequence to evaluate",
     )

+    eval_parser.add_argument(
+        "--modality",
+        type=str,
+        default="text",
+        choices=["text", "text-image"],
+        help="Modality of the model. Options: text, text-image",
+    )
+
     # Add CLI Args related to distributed inference
     # This feature is currently a [WIP] and hidden from --help
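The new `--modality` flag behaves like any argparse choice argument: it defaults to `"text"` and rejects values outside the allowed set. A minimal standalone sketch (the bare `ArgumentParser` here is a stand-in for torchchat's eval parser):

```python
import argparse

# Stand-in for torchchat's eval argument parser; only the new flag is shown.
eval_parser = argparse.ArgumentParser()
eval_parser.add_argument(
    "--modality",
    type=str,
    default="text",
    choices=["text", "text-image"],
    help="Modality of the model. Options: text, text-image",
)

print(eval_parser.parse_args([]).modality)                             # text
print(eval_parser.parse_args(["--modality", "text-image"]).modality)  # text-image
```

Passing any other value (e.g. `--modality audio`) makes argparse exit with an "invalid choice" error, so downstream code only ever sees the two supported modalities.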

torchchat/model.py

Lines changed: 6 additions & 0 deletions

@@ -608,6 +608,12 @@ def setup_caches(self, batch_size, dtype, encoder_max_seq_len, decoder_max_seq_len):
             decoder_max_seq_len=decoder_max_seq_len,
         )

+    def caches_are_setup(self) -> bool:
+        return self.model.caches_are_setup()
+
+    def caches_are_enabled(self) -> bool:
+        return self.model.caches_are_enabled()
+
     def reset_caches(self):
         self.model.reset_caches()
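The two new methods simply forward cache-state queries to the wrapped torchtune model, matching the existing `setup_caches`/`reset_caches` delegation. A hedged sketch of that pattern (`DummyModel` is hypothetical, standing in for the torchtune transformer):

```python
# Hypothetical stand-in for the wrapped torchtune model.
class DummyModel:
    def __init__(self):
        self._setup = False

    def setup_caches(self):
        self._setup = True

    def caches_are_setup(self) -> bool:
        return self._setup

    def caches_are_enabled(self) -> bool:
        return self._setup


class Wrapper:
    """Mirrors the diff: cache queries are forwarded to self.model."""

    def __init__(self, model):
        self.model = model

    def caches_are_setup(self) -> bool:
        return self.model.caches_are_setup()

    def caches_are_enabled(self) -> bool:
        return self.model.caches_are_enabled()


w = Wrapper(DummyModel())
print(w.caches_are_setup())    # False: caches not built yet
w.model.setup_caches()
print(w.caches_are_enabled())  # True: state is read through the wrapper
```

Exposing these pass-throughs lets eval code ask the wrapper about KV-cache state without reaching into the inner model directly.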
