@@ -15,17 +15,15 @@

 from __future__ import annotations

-import copy
 import json
 import os
 import shutil
 import time
-from typing import Any, Dict, List, Optional, Union, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 import torch.nn as nn
 from packaging import version
-from torch import autocast
 from transformers import AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizerBase, modeling_utils

 from ..nn_modules.hooked_linear import replace_linear_with_hooked_linear
@@ -36,11 +34,12 @@
 from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory
 from ..utils.importer import select_quant_linear
 from ..utils.logger import setup_logger
-from ..utils.model import (MODALITY, check_to_quantized, find_layers, get_device, get_module_by_name_prefix,
-                           get_moe_layer_modules, move_to, nested_move_to, normalize_tokenizer, pack_model, get_module)
+from ..utils.model import (MODALITY, check_to_quantized, find_layers, get_device,
+                           get_module, get_module_by_name_prefix, get_moe_layer_modules,
+                           move_to, nested_move_to, normalize_tokenizer, pack_model)
 from ..utils.progress import ProgressBar
 from ..utils.torch import torch_empty_cache
-from ._const import CPU, DEVICE, CUDA, SUPPORTS_MODULE_TYPES
+from ._const import CPU, DEVICE, SUPPORTS_MODULE_TYPES
 from .loader import ModelLoader
 from .writer import (QUANT_LOG_DAMP, QUANT_LOG_FWD_TIME, QUANT_LOG_LAYER,
                      QUANT_LOG_LOSS, QUANT_LOG_MODULE, QUANT_LOG_TIME, ModelWriter)
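
The two hunks above are pure import hygiene: unused imports (copy, autocast, CUDA) are dropped and the remaining names are alphabetized. A minimal sketch of the same normalization done mechanically with the isort library (isort is not part of this diff; it is shown only as an illustration):

    import isort

    messy = "from typing import Any, Dict, List, Optional, Union, Tuple\n"
    # isort alphabetizes the names inside a from-import, producing the
    # same ordering the hunk above applies by hand.
    print(isort.code(messy))
    # -> from typing import Any, Dict, List, Optional, Tuple, Union
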
@@ -402,8 +401,8 @@ def collate_batch(batch):
                 tied_keys = self.model._tied_weights_keys
                 for item in tied_keys:
                     if self.lm_head in item:
-                        raise NotImplementedError(f"quantizing lm_head with tied weights has not been supported "
-                                                  f"currently")
+                        raise NotImplementedError("quantizing lm_head with tied weights has not been supported "
+                                                  "currently")

             lm_head_module = get_module(self.model, key=self.lm_head)
             if get_module(self.model, key=self.lm_head) is None:
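
For context, `_tied_weights_keys` is the transformers attribute this guard inspects: models such as GPT-2 share the lm_head weight with the input embedding table, so quantizing one would silently corrupt the other. A minimal standalone sketch of the same check, assuming "gpt2" purely as an example checkpoint:

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2")  # gpt2 ties lm_head to wte

    tied_keys = getattr(model, "_tied_weights_keys", None) or []
    if getattr(model.config, "tie_word_embeddings", False) and any(
            "lm_head" in key for key in tied_keys):
        # Same condition the hunk above raises on: lm_head shares storage
        # with the embeddings, so it cannot be quantized independently.
        print("lm_head is tied to the embeddings; refusing to quantize it")
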
@@ -566,7 +565,7 @@ def store_lm_head_input_hook(_, args, kwargs):
         for i in layer_pb:
             is_lm_head = i >= layer_count
             if is_lm_head:
-                layer_pb.set_description(f"Quantizing lm_head")
+                layer_pb.set_description("Quantizing lm_head")
                 layer = get_module(self.model, key=self.lm_head)
                 if self.quantize_config.lm_head and not self.quantize_config.lm_head_low_gpu_mem_usage:
                     layer_inputs = lm_head_inputs
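
The string changes in the last two hunks are the same micro-fix: an f-string with no placeholders is just a plain literal wearing an extra prefix (linters such as ruff report this as F541). A one-line check, purely illustrative:

    # An f-string without placeholders evaluates to the identical string,
    # so dropping the prefix changes nothing at runtime.
    assert f"Quantizing lm_head" == "Quantizing lm_head"
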