diff --git a/tests/e2e/vLLM/configs/fp8_dynamic_per_token_qwen.yaml b/tests/e2e/vLLM/configs/fp8_dynamic_per_token_qwen.yaml new file mode 100644 index 000000000..1fd375380 --- /dev/null +++ b/tests/e2e/vLLM/configs/fp8_dynamic_per_token_qwen.yaml @@ -0,0 +1,4 @@ +cadence: "nightly" +test_type: "regression" +model: Qwen/Qwen2.5-0.5B +scheme: FP8_DYNAMIC diff --git a/tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act_qwen.yaml b/tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act_qwen.yaml new file mode 100644 index 000000000..b3aef5380 --- /dev/null +++ b/tests/e2e/vLLM/configs/int8_tensor_weight_static_per_tensor_act_qwen.yaml @@ -0,0 +1,7 @@ +cadence: "nightly" +test_type: "regression" +model: Qwen/Qwen2.5-0.5B +recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_tensor_weight_static_per_tensor_act.yaml +dataset_id: garage-bAInd/Open-Platypus +dataset_split: train +scheme: W8A8_tensor_weight_static_per_tensor_act diff --git a/tests/e2e/vLLM/configs/w4a16_actorder_group_qwen.yaml b/tests/e2e/vLLM/configs/w4a16_actorder_group_qwen.yaml new file mode 100644 index 000000000..1d892b7a9 --- /dev/null +++ b/tests/e2e/vLLM/configs/w4a16_actorder_group_qwen.yaml @@ -0,0 +1,8 @@ +cadence: "nightly" +test_type: "regression" +model: Qwen/Qwen2.5-0.5B +recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_group.yaml +dataset_id: neuralmagic/LLM_compression_calibration +dataset_split: train +scheme: W4A16_actorder_group +save_dir: Qwen2.5-0.5B-actorder-group diff --git a/tests/e2e/vLLM/configs/w4a16_actorder_weight_qwen.yaml b/tests/e2e/vLLM/configs/w4a16_actorder_weight_qwen.yaml new file mode 100644 index 000000000..cd0222572 --- /dev/null +++ b/tests/e2e/vLLM/configs/w4a16_actorder_weight_qwen.yaml @@ -0,0 +1,8 @@ +cadence: "nightly" +test_type: "regression" +model: Qwen/Qwen2.5-0.5B +recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml +dataset_id: 
Open-Orca/slimorca-deduped-cleaned-corrected +dataset_split: train +scheme: W4A16_actorder_weight +save_dir: Qwen2.5-0.5B-actorder-weight diff --git a/tests/e2e/vLLM/configs/w4a16_channel_quant_qwen.yaml b/tests/e2e/vLLM/configs/w4a16_channel_quant_qwen.yaml new file mode 100644 index 000000000..a8c2cd0a3 --- /dev/null +++ b/tests/e2e/vLLM/configs/w4a16_channel_quant_qwen.yaml @@ -0,0 +1,7 @@ +cadence: "nightly" +test_type: "regression" +model: Qwen/Qwen2.5-0.5B +scheme: W4A16_channel +dataset_id: Open-Orca/slimorca-deduped-cleaned-corrected +dataset_split: train +recipe: tests/e2e/vLLM/recipes/WNA16/recipe_w4a16_channel_quant.yaml diff --git a/tests/e2e/vLLM/test_vllm.py b/tests/e2e/vLLM/test_vllm.py index d233f5ee1..24c0a060d 100644 --- a/tests/e2e/vLLM/test_vllm.py +++ b/tests/e2e/vLLM/test_vllm.py @@ -25,6 +25,7 @@ HF_MODEL_HUB_NAME = "nm-testing" TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", "") +SKIP_HF_UPLOAD = os.environ.get("SKIP_HF_UPLOAD", "") EXPECTED_SAVED_FILES = [ "config.json", @@ -128,21 +129,23 @@ def test_vllm(self): fp.write(recipe_yaml_str) session.reset() - logger.info("================= UPLOADING TO HUB ======================") + if SKIP_HF_UPLOAD.lower() != "yes": - stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e" + logger.info("================= UPLOADING TO HUB ======================") - self.api.create_repo( - repo_id=stub, - exist_ok=True, - repo_type="model", - private=False, - ) + stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e" - self.api.upload_folder( - repo_id=stub, - folder_path=self.save_dir, - ) + self.api.create_repo( + repo_id=stub, + exist_ok=True, + repo_type="model", + private=False, + ) + + self.api.upload_folder( + repo_id=stub, + folder_path=self.save_dir, + ) logger.info("================= RUNNING vLLM =========================") diff --git a/tests/testing_utils.py b/tests/testing_utils.py index a6103a73c..257506784 100644 --- a/tests/testing_utils.py +++ 
b/tests/testing_utils.py @@ -135,7 +135,8 @@ def preprocess_tokenize_dataset( :param tokenizer: tokenizer to be used for tokenization :param max_seq_length: maximum sequence length of samples """ - if ds.info.dataset_name == "gsm8k": + ds_name = ds.info.dataset_name.lower() + if ds_name == "gsm8k": def preprocess(example): return example @@ -148,7 +149,8 @@ def tokenize(sample): truncation=True, add_special_tokens=False, ) - elif ds.info.dataset_name == "ultrachat_200k": + + elif ds_name == "ultrachat_200k": def preprocess(example): return { @@ -166,6 +168,69 @@ def tokenize(sample): truncation=True, add_special_tokens=False, ) + + elif ds_name == "llm_compression_calibration": + + def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["text"], + tokenize=False, + ) + } + + def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=max_seq_length, + truncation=True, + add_special_tokens=False, + ) + + elif ds_name == "open-platypus": + # use the output rather than the instruction + def preprocess(example): + return { + "text": tokenizer.apply_chat_template( + example["output"], + tokenize=False, + ) + } + + def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=max_seq_length, + truncation=True, + add_special_tokens=False, + ) + + elif ds_name == "slimorca-deduped-cleaned-corrected": + # find the first element corresponding to a message from a human + def preprocess(example): + conversation_idx = 0 + for idx, conversation in enumerate(example["conversations"]): + if conversation["from"] == "human": + conversation_idx = idx + break + return { + "text": tokenizer.apply_chat_template( + example["conversations"][conversation_idx]["value"], + tokenize=False, + ) + } + + def tokenize(sample): + return tokenizer( + sample["text"], + padding=False, + max_length=max_seq_length, + truncation=True, + add_special_tokens=False, + ) + else: raise NotImplementedError(f"Cannot 
preprocess dataset {ds.info.dataset_name}")