1414"""Testing suite for the PyTorch GLM-4.1V model."""
1515
1616import copy
17- import gc
1817import unittest
1918
2019from transformers import (
2524 is_torch_available ,
2625)
2726from transformers .testing_utils import (
27+ cleanup ,
2828 require_flash_attn ,
2929 require_torch ,
3030 require_torch_gpu ,
31+ run_first ,
3132 slow ,
3233 torch_device ,
3334)
@@ -295,8 +296,26 @@ def test_inputs_embeds_matches_input_ids(self):

 @require_torch
 class Glm4vMoeIntegrationTest(unittest.TestCase):
+    model = None
+
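+    # Load the checkpoint once and cache it on the class so every slow test reuses the same instance.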
+    @classmethod
+    def get_model(cls):
+        if cls.model is None:
+            cls.model = Glm4vMoeForConditionalGeneration.from_pretrained(
+                "zai-org/GLM-4.5V", dtype="auto", device_map="auto"
+            )
+        return cls.model
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.model
+        cleanup(torch_device, gc_collect=True)
+
     def setUp(self):
-        self.processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V")
+        cleanup(torch_device, gc_collect=True)
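+        # Fix the processed image size so test inputs stay small and deterministic.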
+        self.processor = AutoProcessor.from_pretrained(
+            "zai-org/GLM-4.5V", size={"shortest_edge": 10800, "longest_edge": 10800}
+        )
         self.message = [
             {
                 "role": "user",
@@ -321,130 +340,56 @@ def setUp(self):
                 ],
             }
         ]
+        self.message_wo_image = [
+            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
+        ]
+
+        question = "Describe this video."
+        video_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
+        self.video_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "video",
+                        "video": video_url,
+                    },
+                    {"type": "text", "text": question},
+                ],
+            }
+        ]

     def tearDown(self):
-        gc.collect()
-        torch.cuda.empty_cache()
+        cleanup(torch_device, gc_collect=True)

     @slow
     def test_small_model_integration_test(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-
         inputs = self.processor.apply_chat_template(
             self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
         )
-        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343, 151343]  # fmt: skip
+        expected_input_ids = [151331, 151333, 151336, 198, 151339, 151363, 151363, 151363, 151363, 151363, 151363, 151340, 3838, 3093, 315, 5562, 374]  # fmt: skip
         assert expected_input_ids == inputs.input_ids[0].tolist()[:17]

         expected_pixel_slice = torch.tensor(
             [
-                [-0.0988, -0.0842, -0.0842],
-                [-0.5660, -0.5514, -0.4200],
-                [-0.0259, -0.0259, -0.0259],
-                [-0.1280, -0.0988, -0.2010],
-                [-0.4638, -0.5806, -0.6974],
-                [-1.2083, -1.2229, -1.2083],
+                [-0.1134, -0.4492, -0.8580],
+                [-0.6244, -1.1645, -0.7120],
+                [-0.3324, -0.7996, -0.7120],
+                [0.2077, 0.2223, 0.4121],
+                [0.4413, 0.1931, 0.4559],
+                [0.5873, 0.3099, 0.4851],
             ],
             dtype=torch.float32,
             device="cpu",
         )
-        assert torch.allclose(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=3e-3)
-
-        # verify generation
-        inputs = inputs.to(torch_device)
-
-        output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
-        self.assertEqual(
-            self.processor.decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
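+        # Compare a slice of pixel values against reference numbers; assert_close reports per-element mismatches.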
+        torch.testing.assert_close(expected_pixel_slice, inputs.pixel_values[:6, :3], atol=1e-4, rtol=1e-4)

     @slow
     def test_small_model_integration_test_batch(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        batch_messages = [self.message] * 2
+        model = self.get_model()
+        batch_messages = [self.message, self.message2, self.message_wo_image]
         inputs = self.processor.apply_chat_template(
-            batch_messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_with_video(self):
-        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
-        model = Glm4vMoeForConditionalGeneration.from_pretrained(
-            "zai-org/GLM-4.5V", dtype=torch.float16, device_map="auto"
-        )
-        questions = ["Describe this video."] * 2
-        video_urls = [
-            "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4"
-        ] * 2
-        messages = [
-            [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "video",
-                            "video": video_url,
-                        },
-                        {"type": "text", "text": question},
-                    ],
-                }
-            ]
-            for question, video_url in zip(questions, video_urls)
-        ]
-        inputs = processor.apply_chat_template(
-            messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", padding=True
-        ).to(torch_device)
-        output = model.generate(**inputs, max_new_tokens=30)
-        EXPECTED_DECODED_TEXT = [
-            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami",
-            "\n012345Describe this video.\n<think>Got it, let's analyze the video. First, the scene is a room with a wooden floor, maybe a traditional Japanese room with tatami"
-        ]  # fmt: skip
-        self.assertEqual(
-            processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_expand(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        inputs = self.processor.apply_chat_template(
-            self.message, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt"
-        ).to(torch_device)
-
-        output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    def test_small_model_integration_test_batch_wo_image(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        message_wo_image = [
-            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
-        ]
-        batched_messages = [self.message, message_wo_image]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
@@ -453,42 +398,43 @@ def test_small_model_integration_test_batch_wo_image(self):
         ).to(torch_device)

         # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
+        output = model.generate(**inputs, max_new_tokens=10)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\n<think>Got it, the user is asking "Who are you?" I need to respond appropriately. First, I should clarify that I\'m an AI assistant'
+            "\nWhat kind of dog is this?\n<think>Got it, let's try to figure out",
+            "\nWhat kind of dog is this?\n<think>Got it, let's see. The user",
+            '\nWho are you?\n<think>The user is asking "Who are you?"'
         ]  # fmt: skip
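+        # <|image|> placeholders survive skip_special_tokens, so strip them before comparing.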
+        decoded = self.processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
             EXPECTED_DECODED_TEXT,
         )

     @slow
-    def test_small_model_integration_test_batch_different_resolutions(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained("zai-org/GLM-4.5V", dtype="auto", device_map="auto")
-        batched_messages = [self.message, self.message2]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
+    def test_small_model_integration_test_with_video(self):
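+        # max_image_size caps the processed frame size, keeping the visual token count manageable.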
+        processor = AutoProcessor.from_pretrained("zai-org/GLM-4.5V", max_image_size={"longest_edge": 50176})
+        model = self.get_model()
+        batch_messages = [self.video_messages]
+        inputs = processor.apply_chat_template(
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
             padding=True,
         ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
-        ]  # fmt: skip
+        output = model.generate(**inputs, max_new_tokens=3)
+        EXPECTED_DECODED_TEXT = ["\n012345Describe this video.\n<think>Got it"]  # fmt: skip
+        decoded = processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
             EXPECTED_DECODED_TEXT,
         )

+    @run_first
     @slow
     @require_flash_attn
     @require_torch_gpu
@@ -499,44 +445,9 @@ def test_small_model_integration_test_batch_flashatt2(self):
             attn_implementation="flash_attention_2",
             device_map="auto",
         )
-        batched_messages = [self.message, self.message2]
-        inputs = self.processor.apply_chat_template(
-            batched_messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt",
-            padding=True,
-        ).to(torch_device)
-
-        # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
-
-        EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture has a stocky build, thick fur, and a face that's",
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. Wait, the animals here are cats, not dogs. The question is about a dog, but"
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
-        )
-
-    @slow
-    @require_flash_attn
-    @require_torch_gpu
-    def test_small_model_integration_test_batch_wo_image_flashatt2(self):
-        model = Glm4vMoeForConditionalGeneration.from_pretrained(
-            "zai-org/GLM-4.5V",
-            dtype=torch.bfloat16,
-            attn_implementation="flash_attention_2",
-            device_map="auto",
-        )
-        message_wo_image = [
-            {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]},
-        ]
-        batched_messages = [self.message, message_wo_image]
+        batch_messages = [self.message, self.message2, self.message_wo_image]
         inputs = self.processor.apply_chat_template(
-            batched_messages,
+            batch_messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
@@ -545,14 +456,16 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         ).to(torch_device)

         # it should not matter whether two images are the same size or not
-        output = model.generate(**inputs, max_new_tokens=30)
+        output = model.generate(**inputs, max_new_tokens=3)

         EXPECTED_DECODED_TEXT = [
-            "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks",
-            '\nWho are you?\n<think>Got it, let\'s look at the question. The user is asking "Who are you?" which is a common question when someone meets an AI'
+            "\nWhat kind of dog is this?\n<think>Got it",
+            "\nWhat kind of dog is this?\n<think>Got it",
+            "\nWho are you?\n<think>The user",
         ]  # fmt: skip
-
+        decoded = self.processor.batch_decode(output, skip_special_tokens=True)
+        decoded = [x.replace("<|image|>", "") for x in decoded]
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            decoded,
             EXPECTED_DECODED_TEXT,
         )