
Commit b92762a

[TRTLLM-6577][feat] Support nano_v2_vlm in pytorch backend
* Update notes that Nano v2 VLM cannot support KV cache reuse.
* Update code according to reviewers' comments.

Signed-off-by: Wanli Jiang <[email protected]>
1 parent: 57377b5

2 files changed: +28 −26 lines


docs/source/legacy/reference/multimodal-feature-support-matrix.md

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 | LLaVA-NeXT | Yes | Yes | Yes | Yes |
 | Llama 4 | Yes | Yes | No | No |
 | Mistral-Small-3.1 | Yes | Yes | No | No |
-| Nano-v2-VLM | Yes | Yes | Yes | No |
+| Nano-v2-VLM | Yes | Yes | No | No |
 | Phi-4-multimodal | Yes | Yes | No | No |
 | Qwen2-VL | Yes | Yes | Yes | Yes |
 | Qwen2.5-VL | Yes | Yes | Yes | Yes |
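
Since the matrix now lists KV cache reuse as unsupported for Nano-v2-VLM, deployments should keep block reuse disabled for this model. The following is a minimal sketch using the LLM API's KvCacheConfig; the checkpoint path and serving setup are illustrative assumptions, not part of this commit:

# Sketch only: disable KV cache block reuse for a model that cannot support it.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

llm = LLM(
    model="path/to/NemotronH-Nano-VL-V2",  # placeholder path, not from this commit
    kv_cache_config=KvCacheConfig(enable_block_reuse=False),
)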

tensorrt_llm/_torch/models/modeling_nanov2vlm.py

Lines changed: 27 additions & 25 deletions
@@ -33,6 +33,9 @@ def _is_disagg() -> bool:
 
 # TODO: update the reference config path once Nano v2 VLM is released.
 IMAGE_TOKEN_ID = 131072
+IMG_CONTEXT_TOKEN = "<image>"
+IMG_START_TOKEN = "<img>"
+IMG_END_TOKEN = "</img>"
 
 
 class SquaredReLU(nn.Module):
@@ -41,8 +44,7 @@ def forward(self, x):
         return torch.pow(torch.nn.functional.relu(x), 2)
 
 
-class NanoV2VLVisionEncoder(transformers.PreTrainedModel,
-                            transformers.generation.GenerationMixin):
+class NanoV2VLVisionEncoder(transformers.PreTrainedModel):
 
     def __init__(self,
                  model_config: ModelConfig[transformers.PretrainedConfig]):
@@ -61,20 +63,21 @@ def __init__(self,
         self.llm_hidden_size = config.llm_config.hidden_size
         self.mlp1 = nn.Sequential(
             nn.RMSNorm(self.vit_hidden_size * int(1 / self.downsample_ratio)**2,
-                       eps=config.llm_config.rms_norm_eps),
+                       eps=config.llm_config.rms_norm_eps,
+                       dtype=config.torch_dtype),
             nn.Linear(self.vit_hidden_size * int(1 / self.downsample_ratio)**2,
                       self.vision_projection_hidden_size,
-                      bias=False), SquaredReLU(),
+                      bias=False,
+                      dtype=config.torch_dtype), SquaredReLU(),
             nn.Linear(self.vision_projection_hidden_size,
                       self.llm_hidden_size,
-                      bias=False))
-        self.mlp1 = self.mlp1.to(config.torch_dtype)
+                      bias=False,
+                      dtype=config.torch_dtype))
 
         # Construct the vision encoder.
         vision_model_config = copy.deepcopy(model_config)
         vision_model_config.pretrained_config = vision_model_config.pretrained_config.vision_config
         self.vision_model = RADIOVisionModel(vision_model_config)
-        self.vision_model.to(config.torch_dtype)
 
     def load_weights(self, weights):
         # Load mlp1 weights.
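
For context on the mlp1 change: passing dtype= to each layer builds the projector directly in the target dtype, so the full-precision weights that the old `.to(config.torch_dtype)` pattern first allocated are never created. The following is a standalone sketch with hypothetical sizes; the real values come from the vision and LLM configs:

import torch
from torch import nn

# Illustrative sizes standing in for vit_hidden_size * int(1/downsample_ratio)**2,
# the projection width, and the LLM hidden size.
vit_dim, proj_dim, llm_dim = 1280 * 4, 4096, 4480
dtype = torch.bfloat16

class SquaredReLU(nn.Module):
    def forward(self, x):
        return torch.pow(torch.nn.functional.relu(x), 2)

# Layers are created in the target dtype, so no fp32 copy is materialized
# (the old pattern built fp32 weights, then cast them with `.to(dtype)`).
mlp1 = nn.Sequential(
    nn.RMSNorm(vit_dim, eps=1e-6, dtype=dtype),
    nn.Linear(vit_dim, proj_dim, bias=False, dtype=dtype),
    SquaredReLU(),
    nn.Linear(proj_dim, llm_dim, bias=False, dtype=dtype),
)
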
@@ -111,7 +114,6 @@ def pixel_shuffle(self, x, scale_factor=0.5):
 
     def extract_feature(self, pixel_values):
         vit_embeds = self.vision_model(pixel_values)
-        vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
         # Down-sampling and projection.
         h = w = int(vit_embeds.shape[1]**0.5)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
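
The extract_feature path relies on pixel_shuffle to cut the number of vision tokens before projection. Below is a sketch of that space-to-depth step in the InternVL style, written for illustration rather than copied from this file:

import torch

def pixel_shuffle(x: torch.Tensor, scale_factor: float = 0.5) -> torch.Tensor:
    # x: [batch, height, width, channels]. scale_factor=0.5 halves each spatial
    # side and folds the 2x2 neighborhood into the channel dimension.
    n, h, w, c = x.size()
    x = x.view(n, h, int(w * scale_factor), int(c / scale_factor))
    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(n, int(w * scale_factor), int(h * scale_factor),
               int(c / (scale_factor * scale_factor)))
    x = x.permute(0, 2, 1, 3).contiguous()
    return x

# 1024 ViT patch embeddings laid out as a 32x32 grid with hidden size 1280
# become a 16x16 grid (256 tokens) with 4x the channels.
tokens = torch.randn(2, 32, 32, 1280)
print(pixel_shuffle(tokens).shape)  # torch.Size([2, 16, 16, 5120])
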
@@ -131,11 +133,11 @@ def forward(self, multimodal_params: List[MultimodalParams]):
         ],
                                          dim=0)
         # -> [num_patches, channel, height, width]
-        batched_num_patches = torch.cat([
+        patch_list = [
             multimodal_param.multimodal_data["num_patches"]
             for multimodal_param in multimodal_params
-        ],
-                                        dim=0).tolist()
+        ]
+        batched_num_patches = torch.cat(patch_list, dim=0).tolist()
         # -> list of[num_patches1, num_patches2, ...]
         batched_image_embeds = self.extract_feature(batched_pixel_values)
         # -> [num_patches, num_image_token, hidden_size]
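
This refactor only separates list construction from the torch.cat call; the behavior is unchanged. A small sketch with made-up patch counts shows what the flattened num_patches list is for; the split at the end is an assumption about downstream use, not code from this file:

import torch

# Made-up values: one request with a 3-patch image, one with 5- and 2-patch images.
params_num_patches = [torch.tensor([3]), torch.tensor([5, 2])]

patch_list = [num_patches for num_patches in params_num_patches]
batched_num_patches = torch.cat(patch_list, dim=0).tolist()  # -> [3, 5, 2]

# One plausible downstream use (an assumption): split the batched embeddings
# [total_patches, num_image_token, hidden] back into per-image chunks.
num_image_token, hidden_size = 256, 4480   # illustrative sizes
batched_image_embeds = torch.randn(sum(batched_num_patches), num_image_token,
                                   hidden_size)
per_image_embeds = torch.split(batched_image_embeds, batched_num_patches, dim=0)
print([e.shape[0] for e in per_image_embeds])  # [3, 5, 2]
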
@@ -176,9 +178,10 @@ def __init__(self,
 
         self.processor = transformers.AutoImageProcessor.from_pretrained(
             model_path, trust_remote_code=True, use_fast=self.use_fast)
-        self.img_context_token = "<image>"
-        self.img_start_token = "<img>"
-        self.img_end_token = "</img>"
+
+        self.img_context_token = IMG_CONTEXT_TOKEN
+        self.img_start_token = IMG_START_TOKEN
+        self.img_end_token = IMG_END_TOKEN
         self.dtype = model_config.torch_dtype
 
     def get_vocab_size(self):
194197
**kwargs,
195198
):
196199

197-
def get_internvl_target_ratios(
200+
def _get_internvl_target_ratios(
198201
min_num: int,
199202
max_num: int,
200203
) -> list[tuple[int, int]]:
@@ -205,8 +208,8 @@ def get_internvl_target_ratios(
                         if min_num <= i * j <= max_num}
             return sorted(target_ratios, key=lambda x: x[0] * x[1])
 
-        def find_closest_aspect_ratio(aspect_ratio, target_ratios, width,
-                                      height, image_size):
+        def _find_closest_aspect_ratio(aspect_ratio, target_ratios, width,
+                                       height, image_size):
             best_factor = float('-inf')
             best_ratio = (1, 1)
             area = width * height
@@ -221,7 +224,7 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width,
                     best_ratio = ratio
             return best_ratio
 
-        def calculate_targets(
+        def _calculate_targets(
             orig_width: int,
             orig_height: int,
             target_ratios: list[tuple[int, int]],
@@ -230,7 +233,7 @@ def calculate_targets(
             aspect_ratio = orig_width / orig_height
 
             # find the closest aspect ratio to the target
-            target_aspect_ratio = find_closest_aspect_ratio(
+            target_aspect_ratio = _find_closest_aspect_ratio(
                 aspect_ratio,
                 target_ratios,
                 width=orig_width,
@@ -243,10 +246,10 @@ def calculate_targets(
 
         image_height = image.height
         image_width = image.width
-        target_ratios = get_internvl_target_ratios(1,
-                                                    self.processor.max_num_tiles)
-        blocks = calculate_targets(image_width, image_height, target_ratios,
-                                   self.image_size)
+        target_ratios = _get_internvl_target_ratios(
+            1, self.processor.max_num_tiles)
+        blocks = _calculate_targets(image_width, image_height, target_ratios,
+                                    self.image_size)
         if self.processor.use_thumbnail and blocks != 1:
             blocks += 1
         num_image_tokens = self.num_image_token * blocks
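
The renamed helpers implement the InternVL-style dynamic tiling that sizes the image placeholder: enumerate tile grids with at most max_num_tiles tiles, pick the grid closest to the image's aspect ratio, optionally add a thumbnail tile, and multiply by num_image_token. The sketch below is condensed, with simplified closest-ratio scoring (the real _find_closest_aspect_ratio also weighs covered area) and illustrative defaults for max_num_tiles and num_image_token, so the exact grid choice may differ from the real helpers:

def get_target_ratios(min_num: int, max_num: int) -> list[tuple[int, int]]:
    # All (cols, rows) grids whose tile count falls in [min_num, max_num].
    ratios = {(i, j)
              for n in range(min_num, max_num + 1)
              for i in range(1, n + 1)
              for j in range(1, n + 1)
              if min_num <= i * j <= max_num}
    return sorted(ratios, key=lambda r: r[0] * r[1])


def count_image_tokens(width: int, height: int, *, max_num_tiles: int = 12,
                       use_thumbnail: bool = True,
                       num_image_token: int = 256) -> int:
    aspect = width / height
    # Simplified scoring: nearest grid aspect ratio only.
    cols, rows = min(get_target_ratios(1, max_num_tiles),
                     key=lambda r: abs(aspect - r[0] / r[1]))
    blocks = cols * rows
    if use_thumbnail and blocks != 1:
        blocks += 1  # extra low-resolution overview tile
    return num_image_token * blocks


# With the defaults above, a 1920x1080 image picks a 2x1 grid, the thumbnail
# adds one tile, and the placeholder expands to 3 * 256 = 768 tokens.
print(count_image_tokens(1920, 1080))  # 768
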
@@ -309,7 +312,7 @@ def __call__(
             model_type="NemotronH_Nano_VL_V2",
             placeholder_metadata=MultimodalPlaceholderMetadata(
                 placeholder_map={
-                    "image": "<image>",
+                    "image": IMG_CONTEXT_TOKEN,
                 },
                 placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT,
                 placeholders_separator="",
@@ -332,7 +335,6 @@ def __init__(self, model_config: ModelConfig):
 
         if not _is_disagg():
             self.vision_encoder = NanoV2VLVisionEncoder(model_config).eval()
-            self.vision_encoder.to(config.torch_dtype)
 
         llm_model_config = copy.deepcopy(model_config)
         llm_model_config.pretrained_config = llm_model_config.pretrained_config.llm_config
