1414"""Testing suite for the PyTorch GLM-4.1V model."""
1515
1616import copy
17- import gc
1817import unittest
1918
2019from transformers import (
2524 is_torch_available ,
2625)
2726from transformers .testing_utils import (
27+ cleanup ,
2828 require_flash_attn ,
2929 require_torch ,
3030 require_torch_gpu ,
31+ run_first ,
3132 slow ,
3233 torch_device ,
3334)
@@ -295,8 +296,26 @@ def test_inputs_embeds_matches_input_ids(self):

 @require_torch
 class Glm4vMoeIntegrationTest(unittest.TestCase):
+    model = None
+
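+    # Load the checkpoint once and cache it on the class so every slow test reuses the same instance.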
+    @classmethod
+    def get_model(cls):
+        if cls.model is None:
+            cls.model = Glm4vMoeForConditionalGeneration.from_pretrained(
+                "zai-org/GLM-4.5V", dtype="auto", device_map="auto"
+            )
+        return cls.model
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.model
+        cleanup(torch_device, gc_collect=True)
+
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V")
+        cleanup(torch_device, gc_collect=True)
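+        # Fix the processed image size so test inputs stay small and deterministic.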
+        self.processor = AutoProcessor.from_pretrained(
+            "zai-org/GLM-4.5V", size={"shortest_edge": 10800, "longest_edge": 10800}
+        )
         self.message = [
             {
                 "role": "user",
@@ -321,130 +340,56 @@ def setUp(self):
                 ],
             }
         ]
+        self.message_wo_image = [
+            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
+        ]
+
+        question = "Describe this video."
+        video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
+        self.video_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "video": video_url,
+                    },
+                    {"type": "text", "text": question},
+                ],
+            }
+        ]

     def tearDown(self):
-        gc.collect()
-        torch.cuda.empty_cache()
+        cleanup(torch_device, gc_collect=True)

     @slow
     def test_small_model_integration_test(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-
         inputs = self.processor.apply_chat_template(
             self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         )
-        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343]  # fmt: skip
+        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151363, 151363, 151363, 151363, 151363, 151363, 151340, 3838, 3093, 315, 5562, 374]  # fmt: skip
         assert expected_input_ids == inputs.input_ids[0].tolist()[:17]

         expected_pixel_slice = torch.tensor(
             [
-                [-0.0988, -0.0842, -0.0842],
-                [-0.5660, -0.5514, -0.4200],
-                [-0.0259, -0.0259, -0.0259],
-                [-0.1280, -0.0988, -0.2010],
-                [-0.4638, -0.5806, -0.6974],
-                [-1.2083, -1.2229, -1.2083],
+                [-0.1134, -0.4492, -0.8580],
+                [-0.6244, -1.1645, -0.7120],
+                [-0.3324, -0.7996, -0.7120],
+                [0.2077, 0.2223, 0.4121],
+                [0.4413, 0.1931, 0.4559],
+                [0.5873, 0.3099, 0.4851],
             ],
             dtype=torch.float32,
             device="cpu",
         )
-        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)
-
-        # verify generation
-        inputs = inputs.to(torch_device)
-
-        output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
-        self.assertEqual(
-            self.processor.decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
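+        # Compare a slice of pixel values against reference numbers; assert_close reports per-element mismatches.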
+        torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)

     @slow
     def test_small_model_integration_test_batch(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        batch_messages = [self.message] * 2
+        model = self.get_model()
+        batch_messages = [self.message, self.message2, self.message_wo_image]
         inputs = self.processor.apply_chat_template(
-            batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_with_video(self):
-        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
-        model = Glm4vMoeForConditionalGeneration.from_pretrained(
-            "zai-org/GLM-4.5V", dtype=torch.float16, device_map="auto"
-        )
-        questions = ["Describe this video."] * 2
-        video_urls = [
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
-        ] * 2
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "video",
-                            "video": video_url,
-                        },
-                        {"type": "text", "text": question},
-                    ],
-                }
-            ]
-            for question, video_url in zip(questions, video_urls)
-        ]
-        inputs = processor.apply_chat_template(
-            messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
-        ).to(torch_device)
-        output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = [
-            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
-            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
-        ]  # fmt: skip
-        self.assertEqual(
-            processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_expand(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        inputs = self.processor.apply_chat_template(
-            self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device)
-
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_batch_wo_image(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        message_wo_image = [
-            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
-        ]
-        batched_messages = [self.message, message_wo_image]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
@@ -453,42 +398,43 @@ def test_small_model_integration_test_batch_wo_image(self):
         ).to(torch_device)

         # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
+        output = model.generate(**inputs, max_new_tokens=10)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\n<think>Got it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
+            "\nWhat kind of dog is this?\n<think>Got it, let's try to figure out",
+            "\nWhat kind of dog is this?\n<think>Got it, let's see. The user",
+            '\nWho are you?\n<think>The user is asking "Who are you?"'
         ]  # fmt: skip
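+        # <|image|> placeholders survive skip_special_tokens, so strip them before comparing.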
+        decoded = self.processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
             EXPECTED_DECODED_TEXT,
         )

     @slow
-    def test_small_model_integration_test_batch_different_resolutions(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        batched_messages = [self.message, self.message2]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
+    def test_small_model_integration_test_with_video(self):
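+        # max_image_size caps the processed frame size, keeping the visual token count manageable.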
+        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
+        model = self.get_model()
+        batch_messages = [self.video_messages]
+        inputs = processor.apply_chat_template(
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
             padding=True,
         ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
-        ]  # fmt: skip
+        output = model.generate(**inputs, max_new_tokens=3)
+        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\n<think>Got it"]  # fmt: skip
+        decoded = processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
             EXPECTED_DECODED_TEXT,
         )

+    @run_first
     @slow
     @require_flash_attn
     @require_torch_gpu
@@ -499,44 +445,9 @@ def test_small_model_integration_test_batch_flashatt2(self):
             attn_implementation="flash_attention_2",
             device_map="auto",
         )
-        batched_messages = [self.message, self.message2]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt",
-            padding=True,
-        ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture has a stocky build, thick fur, and a face that's",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    @require_flash_attn
-    @require_torch_gpu
-    def test_small_model_integration_test_batch_wo_image_flashatt2(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained(
-            "zai-org/GLM-4.5V",
-            dtype=torch.bfloat16,
-            attn_implementation="flash_attention_2",
-            device_map="auto",
-        )
-        message_wo_image = [
-            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
-        ]
-        batched_messages = [self.message, message_wo_image]
+        batch_messages = [self.message, self.message2, self.message_wo_image]
         inputs = self.processor.apply_chat_template(
-            batched_messages,
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
@@ -545,14 +456,16 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         ).to(torch_device)

         # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
+        output = model.generate(**inputs, max_new_tokens=3)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\n<think>Got it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
+            "\nWhat kind of dog is this?\n<think>Got it",
+            "\nWhat kind of dog is this?\n<think>Got it",
+            "\nWho are you?\n<think>The user",
         ]  # fmt: skip
-
+        decoded = self.processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
             EXPECTED_DECODED_TEXT,
         )