Commit a2ac0b0

fix oom (#1335)

* decrease batch to 2
* half data size
* get batch size based on vram
* fix bench not found
* fix 'int' object is not callable
* [CI] share GPU default false
* batch 1
* keep cache
1 parent dc59d74 commit a2ac0b0
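
The core of the OOM fix is the "get batch size based on vram" change: tests now size quantization and eval batches from the total memory of the first CUDA device instead of hard-coding 32. Below is a minimal sketch of that selection logic, assuming device_smi's Device accepts a "cuda:0"-style identifier and reports memory_total in bytes; the actual helper added to ModelTest appears in the tests/models/model_test.py diff further down, and the pick_batch_size name here is illustrative only.

from device_smi import Device

def pick_batch_size(device: str = "cuda:0", large: int = 32, small: int = 2) -> int:
    # Illustrative sketch: total device memory (bytes) converted to GiB.
    vram_gib = Device(device).memory_total / 1024 / 1024 / 1024
    # Cards with more than 24 GiB keep the larger batch; everything else drops to the small one.
    return large if vram_gib > 24 else small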

File tree

7 files changed: +47 −39 lines changed


.github/workflows/unit_tests.yml

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ on:
         description: 'one test, one gpu. for collecting statistics'
         type: boolean
         required: false
-        default: false
+        default: true
       server:
         description: 'Choose server (zen4 or xeon5)'
         required: true
@@ -661,7 +661,7 @@ jobs:
       - name: Clean cache
         if: always()
         run: |
-          rm ~/.cache/evalplus/*pkl || true
+          # rm ~/.cache/evalplus/*pkl || true
           pip cache purge && uv cache clean && rm -rf ./* ./.*

   show-statistics:

tests/models/model_test.py

Lines changed: 11 additions & 1 deletion
@@ -19,6 +19,10 @@
 import sys
 from typing import Dict, List

+from device_smi import Device
+
+from gptqmodel.models._const import CUDA_0
+
 if sys.platform == "darwin":
     os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
     os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
@@ -131,9 +135,12 @@ def load_tokenizer(self, model_id_or_path, trust_remote_code=False):
         return tokenizer

     @classmethod
-    def load_dataset(self, tokenizer, rows: int = DATASET_SIZE):
+    def load_dataset(self, tokenizer=None, rows: int = DATASET_SIZE):
         traindata = load_dataset("json", data_files="/monster/data/model/dataset/c4-train.00000-of-01024.json.gz", split="train")

+        if not tokenizer:
+            return traindata.select(range(rows))
+
         datas = []
         for index, sample in enumerate(traindata):
             tokenized = tokenizer(sample['text'])
@@ -369,3 +376,6 @@ def clear_directory(self, directory_path):
                 os.unlink(item_path)
             elif os.path.isdir(item_path):
                 shutil.rmtree(item_path)
+
+    def get_batch_size(self):
+        return 32 if Device(CUDA_0).memory_total / 1024 / 1024 / 1024 > 24 else 2

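Taken together, the two additions above give every ModelTest subclass an untokenized calibration path (load_dataset without a tokenizer returns raw dataset rows) and a VRAM-aware batch size. A rough usage sketch, assuming tests are run from the tests/ directory so models.model_test is importable; the DemoTest class and demo method are illustrative only.

from models.model_test import ModelTest

class DemoTest(ModelTest):
    def demo(self):
        # With no tokenizer passed, load_dataset returns the raw calibration rows.
        calibration = self.load_dataset(rows=512)["text"]
        # get_batch_size() resolves to 32 on >24 GiB GPUs and 2 otherwise.
        return calibration, self.get_batch_size()
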
tests/test_eval.py

Lines changed: 3 additions & 2 deletions
@@ -25,11 +25,12 @@

 from gptqmodel import GPTQModel # noqa: E402
 from gptqmodel.utils.eval import EVAL # noqa: E402
+from models.model_test import ModelTest # noqa: E402
 from lm_eval.tasks import TaskManager # noqa: E402
 from parameterized import parameterized # noqa: E402


-class TestEval(unittest.TestCase):
+class TestEval(ModelTest):
     @classmethod
     def setUpClass(self):
         self.MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1"
@@ -54,7 +55,7 @@ def test_eval_gptqmodel(self, framework: Union[Type[EVAL.LM_EVAL],Type[EVAL.EVAL
         results = GPTQModel.eval(model_or_id_or_path=self.MODEL_ID,
                                  framework=framework,
                                  tasks=[task],
-                                 batch_size=8,
+                                 batch_size=1,
                                  output_path=output_path,
                                  llm_backend=llm_backend,
                                  model_args=model_args,

tests/test_post_quant_eora.py

Lines changed: 2 additions & 6 deletions
@@ -76,7 +76,7 @@ def test_post_quant_eora(self):
         desc_act = True
         rank = 256
         batch_size = 1
-        calibration_dataset_rows = 1024
+        calibration_dataset_rows = 512
         calibration_dataset_concat_size = 0 # disable
         auto_gc = False
         adapter_file_name = "eora.safetensors"
@@ -93,11 +93,7 @@ def test_post_quant_eora(self):
             "adapter_file_name": adapter_file_name,
         }

-        calibration_dataset = load_dataset(
-            "allenai/c4",
-            data_files="en/c4-train.00001-of-01024.json.gz",
-            split="train"
-        ).select(range(calibration_dataset_rows))["text"]
+        calibration_dataset = self.load_dataset(rows=calibration_dataset_rows)["text"]

         with tempfile.TemporaryDirectory() as tmpdir:
             eora = Lora(

tests/test_quant_and_eora.py

Lines changed: 27 additions & 26 deletions
@@ -32,30 +32,6 @@
 from tabulate import tabulate # noqa: E402


-def bench(path: str, backend: BACKEND, adapter: Optional[Lora]):
-    # test post-quant inference
-    model = GPTQModel.load(
-        model_id_or_path=path,
-        backend=backend,
-        adapter=adapter,
-    )
-
-    tokens = model.generate("Capital of France is")[0]
-    result = model.tokenizer.decode(tokens)
-    print(f"BACKEND: {backend}, Result: {result}")
-    assert "paris" in result.lower(), f"`paris` not found in `{result}`"
-
-    bench_result = GPTQModel.eval(
-        model_or_id_or_path=model,
-        framework=EVAL.LM_EVAL,
-        tasks=[EVAL.LM_EVAL.ARC_CHALLENGE, EVAL.LM_EVAL.MMLU],
-        batch_size=32,
-    )
-
-    del model
-    torch_empty_cache()
-
-    return bench_result

 class Test(ModelTest):
     NATIVE_MODEL_ID = "/monster/data/model/Qwen2.5-0.5B-Instruct/"
@@ -140,8 +116,8 @@ def test_quant_and_eora(self):

         # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA,
         for backend in [ BACKEND.MARLIN ]: # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN
-            base_bench = bench(path=tmpdir, backend=backend, adapter=None) # inference using qweights only
-            eora_bench = bench(path=tmpdir, backend=backend, adapter=eora) # inference using eora (lora)
+            base_bench = self.bench(path=tmpdir, backend=backend, adapter=None) # inference using qweights only
+            eora_bench = self.bench(path=tmpdir, backend=backend, adapter=eora) # inference using eora (lora)

             print('--------GPTQModel + EoRA Config ---------')

@@ -158,3 +134,28 @@ def test_quant_and_eora(self):
         print(make_table(eora_bench))
         if "groups" in eora_bench:
             print(make_table(eora_bench, "groups"))
+
+    def bench(self, path: str, backend: BACKEND, adapter: Optional[Lora]):
+        # test post-quant inference
+        model = GPTQModel.load(
+            model_id_or_path=path,
+            backend=backend,
+            adapter=adapter,
+        )
+
+        tokens = model.generate("Capital of France is")[0]
+        result = model.tokenizer.decode(tokens)
+        print(f"BACKEND: {backend}, Result: {result}")
+        assert "paris" in result.lower(), f"`paris` not found in `{result}`"
+
+        bench_result = GPTQModel.eval(
+            model_or_id_or_path=model,
+            framework=EVAL.LM_EVAL,
+            tasks=[EVAL.LM_EVAL.ARC_CHALLENGE, EVAL.LM_EVAL.MMLU],
+            batch_size=self.get_batch_size(),
+        )
+
+        del model
+        torch_empty_cache()
+
+        return bench_result

tests/test_quant_formats.py

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ def test_quantize(self, method: QUANT_METHOD, backend: BACKEND, sym: bool, forma
             self.pretrained_model_id,
             quantize_config=quantize_config,
         )
-        model.quantize(self.calibration_dataset, batch_size=32)
+        model.quantize(self.calibration_dataset, batch_size=self.get_batch_size())

         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save(tmpdirname)

tests/test_quant_formats_auto_round.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def test_quantize(self, method: QUANT_METHOD, backend: BACKEND, sym: bool, forma
             self.pretrained_model_id,
             quantize_config=quantize_config,
         )
-        model.quantize(self.calibration_dataset, batch_size=32)
+        model.quantize(self.calibration_dataset, batch_size=self.get_batch_size())

         with tempfile.TemporaryDirectory() as tmpdirname:
             model.save(tmpdirname)
