
Commit fcf3031

Update PatchedVLLMKVCache for deepseek performance (#2165)
1 parent eb5f04d commit fcf3031

File tree: 1 file changed (+11, -21 lines)


neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 11 additions & 21 deletions
@@ -1082,41 +1082,31 @@ def forward_qdq(self, input, *args, **kwargs):
         output_cache = self.orig_mod(qinput, *args, **kwargs)
         return output_cache
 
-    # def forward_quant(self, input, *args, **kwargs):
-    #     qinput = self.quant_input(input)
-    #     output_cache = self.orig_mod(qinput, *args, **kwargs)
-    #     return self.dequant_output(output_cache)
+    def forward_quant(self, input, *args, **kwargs):
+        qinput = self.quant_input(input)
+        output_cache = self.orig_mod(qinput, *args, **kwargs)
+        return self.dequant_output(output_cache)
 
     def forward_measure(self, input, *args, **kwargs):
         measure_input((input, ), self._mod_extra_config.inputs)
         output_cache = self.orig_mod(input, *args, **kwargs)
         measure_output((output_cache, ), self._mod_extra_config.outputs)
         return output_cache
 
-    # def fetch_from_cache(self, cache, blocks, permutations=None):
-    #     # quant_cache = self.quant_input(cache)
-    #     quant_cache = cache
-    #     if permutations:
-    #         output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations)
-    #         for i in range(len(output_cache)):
-    #             output_cache[i] = self.dequant_output(output_cache[i])
-    #         return output_cache
-    #     output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks)
-    #     return self.dequant_output(output_cache)
-
-    def forward_quant(self, input, *args, **kwargs):
-        qinput = self.quant_input(input)
-        return self.orig_mod(qinput, *args, **kwargs)
-
-    def fetch_from_cache(self, quant_cache, blocks, permutations=None):
+    def fetch_from_cache(self, cache, blocks, permutations=None):
+        # TODO: Remove this workaround in next release [SW-221595]
+        if cache.dtype != self.lp_dtype:
+            quant_cache = self.quant_input(cache)
+        else:
+            quant_cache = cache
         if permutations:
             output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations)
             for i in range(len(output_cache)):
                 output_cache[i] = self.dequant_output(output_cache[i])
             return output_cache
         output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks)
         return self.dequant_output(output_cache)
-
+
     def extra_repr(self) -> str:
         return f"PatchedVLLMKVCache"

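For readers skimming the diff, the sketch below is a minimal, self-contained illustration of the behaviour this commit restores: forward_quant quantizes the incoming key/value tensor, runs the wrapped cache op, and dequantizes the result, while fetch_from_cache only quantizes the cache when its dtype differs from the configured low-precision dtype (the [SW-221595] workaround). Everything here is a mock, not the real neural_compressor code: _ToyKVCache, the scale-based quant_input/dequant_output, and bfloat16 standing in for lp_dtype are illustrative assumptions, and the permutations branch is omitted for brevity.

# Minimal sketch (assumptions: _ToyKVCache, the scale-based quantizer and
# bfloat16-as-lp_dtype are stand-ins, not the real fp8_quant helpers).
import torch


class _ToyKVCache(torch.nn.Module):
    """Stand-in for the wrapped vLLM KV-cache module (self.orig_mod)."""

    def forward(self, qinput, *args, **kwargs):
        # Pretend the quantized entries were written to the paged cache.
        return qinput

    def fetch_from_cache(self, quant_cache, blocks):
        # Gather the requested blocks from the (already quantized) cache.
        return quant_cache[blocks]


class ToyPatchedVLLMKVCache(torch.nn.Module):
    def __init__(self, lp_dtype=torch.bfloat16, scale=0.5):
        super().__init__()
        self.orig_mod = _ToyKVCache()
        self.lp_dtype = lp_dtype   # low-precision cache dtype (FP8 in the real module)
        self.scale = scale

    def quant_input(self, x):      # mock of the configured input quantizer
        return (x / self.scale).to(self.lp_dtype)

    def dequant_output(self, x):   # mock of the configured output dequantizer
        return x.to(torch.float32) * self.scale

    def forward_quant(self, input, *args, **kwargs):
        # Restored path: quantize, run the original cache op, dequantize the result.
        qinput = self.quant_input(input)
        output_cache = self.orig_mod(qinput, *args, **kwargs)
        return self.dequant_output(output_cache)

    def fetch_from_cache(self, cache, blocks, permutations=None):
        # Dtype-guarded workaround: only quantize when the cache is not
        # already stored in the low-precision dtype.
        if cache.dtype != self.lp_dtype:
            quant_cache = self.quant_input(cache)
        else:
            quant_cache = cache
        output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks)
        return self.dequant_output(output_cache)


kv = ToyPatchedVLLMKVCache()
cache_hp = torch.randn(8, 4)            # high-precision cache -> quantized once
cache_lp = kv.quant_input(cache_hp)     # already low-precision -> quantization skipped
blocks = torch.tensor([0, 3])
print(kv.fetch_from_cache(cache_hp, blocks).dtype)   # torch.float32 after dequant
print(kv.fetch_from_cache(cache_lp, blocks).dtype)   # torch.float32 after dequant

The point of the dtype guard is simply that a cache tensor which already arrives in lp_dtype is passed through untouched instead of being quantized a second time; the rest of the patched fetch_from_cache follows the pre-existing logic.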